diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001, + "grad_norm": 85.6961898803711, + "learning_rate": 9.999e-06, + "loss": 0.8687, + "step": 10 + }, + { + "epoch": 0.002, + "grad_norm": 83.53839874267578, + "learning_rate": 9.997000000000001e-06, + "loss": 1.418, + "step": 20 + }, + { + "epoch": 0.003, + "grad_norm": 95.85935974121094, + "learning_rate": 9.995000000000002e-06, + "loss": 1.1484, + "step": 30 + }, + { + "epoch": 0.004, + "grad_norm": 44.594295501708984, + "learning_rate": 9.993e-06, + "loss": 0.8815, + "step": 40 + }, + { + "epoch": 0.005, + "grad_norm": 116.1628189086914, + "learning_rate": 9.991000000000001e-06, + "loss": 1.2495, + "step": 50 + }, + { + "epoch": 0.006, + "grad_norm": 33.84364318847656, + "learning_rate": 9.989e-06, + "loss": 0.6394, + "step": 60 + }, + { + "epoch": 0.007, + "grad_norm": 25.787431716918945, + "learning_rate": 9.987000000000001e-06, + "loss": 1.0792, + "step": 70 + }, + { + "epoch": 0.008, + "grad_norm": 43.854469299316406, + "learning_rate": 9.985000000000002e-06, + "loss": 0.8326, + "step": 80 + }, + { + "epoch": 0.009, + "grad_norm": 101.5726547241211, + "learning_rate": 9.983e-06, + "loss": 1.2902, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 61.49679183959961, + "learning_rate": 9.981000000000002e-06, + "loss": 0.5976, + "step": 100 + }, + { + "epoch": 0.011, + "grad_norm": 83.0029525756836, + "learning_rate": 9.979e-06, + "loss": 1.0847, + "step": 110 + }, + { + "epoch": 0.012, + "grad_norm": 81.3443374633789, + "learning_rate": 9.977000000000001e-06, + "loss": 0.9674, + "step": 120 + }, + { + "epoch": 0.013, + "grad_norm": 61.71631622314453, + "learning_rate": 9.975000000000002e-06, + "loss": 0.7648, + "step": 130 + }, + { + "epoch": 0.014, + "grad_norm": 52.06660461425781, + "learning_rate": 9.973000000000001e-06, + "loss": 0.6408, + "step": 140 + }, + { + "epoch": 0.015, + "grad_norm": 67.05174255371094, + "learning_rate": 9.971e-06, + "loss": 1.1016, + "step": 150 + }, + { + "epoch": 0.016, + "grad_norm": 70.35164642333984, + "learning_rate": 9.969e-06, + "loss": 1.2591, + "step": 160 + }, + { + "epoch": 0.017, + "grad_norm": 113.0372085571289, + "learning_rate": 9.967000000000001e-06, + "loss": 0.8569, + "step": 170 + }, + { + "epoch": 0.018, + "grad_norm": 46.71174621582031, + "learning_rate": 9.965000000000002e-06, + "loss": 1.0649, + "step": 180 + }, + { + "epoch": 0.019, + "grad_norm": 49.677127838134766, + "learning_rate": 9.963000000000001e-06, + "loss": 0.7769, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 44.4288444519043, + "learning_rate": 9.961e-06, + "loss": 0.8335, + "step": 200 + }, + { + "epoch": 0.021, + "grad_norm": 49.02952194213867, + "learning_rate": 9.959e-06, + "loss": 1.0531, + "step": 210 + }, + { + "epoch": 0.022, + "grad_norm": 42.99825668334961, + "learning_rate": 9.957000000000001e-06, + "loss": 0.9388, + "step": 220 + }, + { + "epoch": 0.023, + "grad_norm": 33.82646560668945, + "learning_rate": 9.955000000000002e-06, + "loss": 0.8255, + "step": 230 + }, + { + "epoch": 0.024, + "grad_norm": 31.44774627685547, + "learning_rate": 9.953000000000001e-06, + "loss": 0.8447, + "step": 240 + }, + { + "epoch": 0.025, + "grad_norm": 46.26844024658203, + "learning_rate": 9.951e-06, + "loss": 0.8758, + "step": 250 + }, + { + "epoch": 0.026, + "grad_norm": 32.29243469238281, + "learning_rate": 9.949e-06, + "loss": 0.5973, + "step": 260 + }, + { + "epoch": 0.027, + "grad_norm": 44.86991882324219, + "learning_rate": 9.947000000000001e-06, + "loss": 0.8255, + "step": 270 + }, + { + "epoch": 0.028, + "grad_norm": 53.69334411621094, + "learning_rate": 9.945e-06, + "loss": 0.9522, + "step": 280 + }, + { + "epoch": 0.029, + "grad_norm": 65.80093383789062, + "learning_rate": 9.943000000000001e-06, + "loss": 0.8104, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 29.859413146972656, + "learning_rate": 9.941e-06, + "loss": 0.6795, + "step": 300 + }, + { + "epoch": 0.031, + "grad_norm": 37.98838424682617, + "learning_rate": 9.939000000000001e-06, + "loss": 0.7588, + "step": 310 + }, + { + "epoch": 0.032, + "grad_norm": 16.02583885192871, + "learning_rate": 9.937000000000002e-06, + "loss": 0.7691, + "step": 320 + }, + { + "epoch": 0.033, + "grad_norm": 30.105602264404297, + "learning_rate": 9.935e-06, + "loss": 0.8934, + "step": 330 + }, + { + "epoch": 0.034, + "grad_norm": 27.63981819152832, + "learning_rate": 9.933e-06, + "loss": 1.0111, + "step": 340 + }, + { + "epoch": 0.035, + "grad_norm": 46.90068435668945, + "learning_rate": 9.931e-06, + "loss": 0.6812, + "step": 350 + }, + { + "epoch": 0.036, + "grad_norm": 45.553165435791016, + "learning_rate": 9.929000000000001e-06, + "loss": 0.9058, + "step": 360 + }, + { + "epoch": 0.037, + "grad_norm": 30.0932559967041, + "learning_rate": 9.927000000000002e-06, + "loss": 0.6499, + "step": 370 + }, + { + "epoch": 0.038, + "grad_norm": 52.90816879272461, + "learning_rate": 9.925e-06, + "loss": 0.8783, + "step": 380 + }, + { + "epoch": 0.039, + "grad_norm": 31.532318115234375, + "learning_rate": 9.923e-06, + "loss": 0.8316, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 15.857837677001953, + "learning_rate": 9.921e-06, + "loss": 0.7904, + "step": 400 + }, + { + "epoch": 0.041, + "grad_norm": 32.60575866699219, + "learning_rate": 9.919000000000001e-06, + "loss": 0.5657, + "step": 410 + }, + { + "epoch": 0.042, + "grad_norm": 26.059589385986328, + "learning_rate": 9.917000000000002e-06, + "loss": 0.8572, + "step": 420 + }, + { + "epoch": 0.043, + "grad_norm": 39.00318908691406, + "learning_rate": 9.915e-06, + "loss": 0.713, + "step": 430 + }, + { + "epoch": 0.044, + "grad_norm": 5.215452671051025, + "learning_rate": 9.913e-06, + "loss": 0.6709, + "step": 440 + }, + { + "epoch": 0.045, + "grad_norm": 15.460463523864746, + "learning_rate": 9.911e-06, + "loss": 0.639, + "step": 450 + }, + { + "epoch": 0.046, + "grad_norm": 60.24131774902344, + "learning_rate": 9.909000000000001e-06, + "loss": 0.8114, + "step": 460 + }, + { + "epoch": 0.047, + "grad_norm": 28.567232131958008, + "learning_rate": 9.907000000000002e-06, + "loss": 0.6119, + "step": 470 + }, + { + "epoch": 0.048, + "grad_norm": 90.1940689086914, + "learning_rate": 9.905000000000001e-06, + "loss": 0.7393, + "step": 480 + }, + { + "epoch": 0.049, + "grad_norm": 60.29534912109375, + "learning_rate": 9.903e-06, + "loss": 0.587, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 29.43375587463379, + "learning_rate": 9.901e-06, + "loss": 0.8577, + "step": 500 + }, + { + "epoch": 0.051, + "grad_norm": 14.599111557006836, + "learning_rate": 9.899000000000001e-06, + "loss": 0.5109, + "step": 510 + }, + { + "epoch": 0.052, + "grad_norm": 28.859493255615234, + "learning_rate": 9.897e-06, + "loss": 0.6818, + "step": 520 + }, + { + "epoch": 0.053, + "grad_norm": 91.49598693847656, + "learning_rate": 9.895000000000001e-06, + "loss": 0.7266, + "step": 530 + }, + { + "epoch": 0.054, + "grad_norm": 43.01456832885742, + "learning_rate": 9.893e-06, + "loss": 0.9112, + "step": 540 + }, + { + "epoch": 0.055, + "grad_norm": 89.42814636230469, + "learning_rate": 9.891e-06, + "loss": 0.9735, + "step": 550 + }, + { + "epoch": 0.056, + "grad_norm": 31.47621726989746, + "learning_rate": 9.889000000000001e-06, + "loss": 0.7226, + "step": 560 + }, + { + "epoch": 0.057, + "grad_norm": 41.30959701538086, + "learning_rate": 9.887e-06, + "loss": 0.7916, + "step": 570 + }, + { + "epoch": 0.058, + "grad_norm": 46.82521438598633, + "learning_rate": 9.885000000000001e-06, + "loss": 0.7079, + "step": 580 + }, + { + "epoch": 0.059, + "grad_norm": 44.034061431884766, + "learning_rate": 9.8832e-06, + "loss": 0.7888, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 35.98831558227539, + "learning_rate": 9.881200000000001e-06, + "loss": 0.6411, + "step": 600 + }, + { + "epoch": 0.061, + "grad_norm": 46.98432540893555, + "learning_rate": 9.8792e-06, + "loss": 0.8364, + "step": 610 + }, + { + "epoch": 0.062, + "grad_norm": 52.23887252807617, + "learning_rate": 9.877200000000001e-06, + "loss": 0.8968, + "step": 620 + }, + { + "epoch": 0.063, + "grad_norm": 61.01598358154297, + "learning_rate": 9.8752e-06, + "loss": 0.8694, + "step": 630 + }, + { + "epoch": 0.064, + "grad_norm": 37.633644104003906, + "learning_rate": 9.8732e-06, + "loss": 0.9208, + "step": 640 + }, + { + "epoch": 0.065, + "grad_norm": 29.999998092651367, + "learning_rate": 9.871200000000001e-06, + "loss": 0.6904, + "step": 650 + }, + { + "epoch": 0.066, + "grad_norm": 15.523207664489746, + "learning_rate": 9.8692e-06, + "loss": 0.7705, + "step": 660 + }, + { + "epoch": 0.067, + "grad_norm": 34.08747863769531, + "learning_rate": 9.867200000000001e-06, + "loss": 0.9855, + "step": 670 + }, + { + "epoch": 0.068, + "grad_norm": 8.73726749420166, + "learning_rate": 9.8652e-06, + "loss": 0.6722, + "step": 680 + }, + { + "epoch": 0.069, + "grad_norm": 28.97559928894043, + "learning_rate": 9.8632e-06, + "loss": 0.7683, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 29.997812271118164, + "learning_rate": 9.861200000000001e-06, + "loss": 0.6177, + "step": 700 + }, + { + "epoch": 0.071, + "grad_norm": 25.937652587890625, + "learning_rate": 9.8592e-06, + "loss": 0.8951, + "step": 710 + }, + { + "epoch": 0.072, + "grad_norm": 51.03562545776367, + "learning_rate": 9.857200000000001e-06, + "loss": 0.8368, + "step": 720 + }, + { + "epoch": 0.073, + "grad_norm": 16.005319595336914, + "learning_rate": 9.8552e-06, + "loss": 0.6646, + "step": 730 + }, + { + "epoch": 0.074, + "grad_norm": 34.2577018737793, + "learning_rate": 9.8532e-06, + "loss": 0.9092, + "step": 740 + }, + { + "epoch": 0.075, + "grad_norm": 20.1041316986084, + "learning_rate": 9.851200000000001e-06, + "loss": 0.7914, + "step": 750 + }, + { + "epoch": 0.076, + "grad_norm": 30.207904815673828, + "learning_rate": 9.8492e-06, + "loss": 0.9624, + "step": 760 + }, + { + "epoch": 0.077, + "grad_norm": 15.775735855102539, + "learning_rate": 9.847200000000001e-06, + "loss": 0.6117, + "step": 770 + }, + { + "epoch": 0.078, + "grad_norm": 18.047714233398438, + "learning_rate": 9.8452e-06, + "loss": 0.706, + "step": 780 + }, + { + "epoch": 0.079, + "grad_norm": 17.006519317626953, + "learning_rate": 9.843200000000001e-06, + "loss": 0.7123, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 28.868335723876953, + "learning_rate": 9.841200000000002e-06, + "loss": 0.7956, + "step": 800 + }, + { + "epoch": 0.081, + "grad_norm": 42.61716842651367, + "learning_rate": 9.8392e-06, + "loss": 0.7163, + "step": 810 + }, + { + "epoch": 0.082, + "grad_norm": 21.97026824951172, + "learning_rate": 9.837200000000001e-06, + "loss": 0.8151, + "step": 820 + }, + { + "epoch": 0.083, + "grad_norm": 14.37778091430664, + "learning_rate": 9.8354e-06, + "loss": 0.8179, + "step": 830 + }, + { + "epoch": 0.084, + "grad_norm": 24.292551040649414, + "learning_rate": 9.833400000000001e-06, + "loss": 0.8949, + "step": 840 + }, + { + "epoch": 0.085, + "grad_norm": 31.621431350708008, + "learning_rate": 9.8314e-06, + "loss": 0.9193, + "step": 850 + }, + { + "epoch": 0.086, + "grad_norm": 22.210474014282227, + "learning_rate": 9.829400000000001e-06, + "loss": 0.7189, + "step": 860 + }, + { + "epoch": 0.087, + "grad_norm": 22.361305236816406, + "learning_rate": 9.8274e-06, + "loss": 0.7964, + "step": 870 + }, + { + "epoch": 0.088, + "grad_norm": 20.3325252532959, + "learning_rate": 9.8254e-06, + "loss": 0.7291, + "step": 880 + }, + { + "epoch": 0.089, + "grad_norm": 34.202213287353516, + "learning_rate": 9.823400000000002e-06, + "loss": 0.7168, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 78.82975006103516, + "learning_rate": 9.8214e-06, + "loss": 0.7371, + "step": 900 + }, + { + "epoch": 0.091, + "grad_norm": 27.09836769104004, + "learning_rate": 9.819400000000001e-06, + "loss": 0.8064, + "step": 910 + }, + { + "epoch": 0.092, + "grad_norm": 31.45117950439453, + "learning_rate": 9.8174e-06, + "loss": 0.9471, + "step": 920 + }, + { + "epoch": 0.093, + "grad_norm": 74.64904022216797, + "learning_rate": 9.815400000000001e-06, + "loss": 0.8619, + "step": 930 + }, + { + "epoch": 0.094, + "grad_norm": 21.551801681518555, + "learning_rate": 9.8134e-06, + "loss": 0.8292, + "step": 940 + }, + { + "epoch": 0.095, + "grad_norm": 12.729266166687012, + "learning_rate": 9.8114e-06, + "loss": 0.6445, + "step": 950 + }, + { + "epoch": 0.096, + "grad_norm": 17.378259658813477, + "learning_rate": 9.809400000000001e-06, + "loss": 0.6488, + "step": 960 + }, + { + "epoch": 0.097, + "grad_norm": 35.8254508972168, + "learning_rate": 9.8074e-06, + "loss": 0.9045, + "step": 970 + }, + { + "epoch": 0.098, + "grad_norm": 37.96717071533203, + "learning_rate": 9.805400000000001e-06, + "loss": 0.8012, + "step": 980 + }, + { + "epoch": 0.099, + "grad_norm": 25.49050521850586, + "learning_rate": 9.8034e-06, + "loss": 1.0158, + "step": 990 + }, + { + "epoch": 0.1, + "grad_norm": 17.505203247070312, + "learning_rate": 9.8014e-06, + "loss": 0.7721, + "step": 1000 + }, + { + "epoch": 0.101, + "grad_norm": 13.868611335754395, + "learning_rate": 9.799400000000001e-06, + "loss": 0.6163, + "step": 1010 + }, + { + "epoch": 0.102, + "grad_norm": 23.564525604248047, + "learning_rate": 9.797400000000002e-06, + "loss": 0.7585, + "step": 1020 + }, + { + "epoch": 0.103, + "grad_norm": 23.647708892822266, + "learning_rate": 9.795400000000001e-06, + "loss": 0.5131, + "step": 1030 + }, + { + "epoch": 0.104, + "grad_norm": 26.601909637451172, + "learning_rate": 9.7934e-06, + "loss": 0.8254, + "step": 1040 + }, + { + "epoch": 0.105, + "grad_norm": 13.147595405578613, + "learning_rate": 9.7914e-06, + "loss": 0.7593, + "step": 1050 + }, + { + "epoch": 0.106, + "grad_norm": 41.67106628417969, + "learning_rate": 9.789400000000002e-06, + "loss": 0.7138, + "step": 1060 + }, + { + "epoch": 0.107, + "grad_norm": 17.47908592224121, + "learning_rate": 9.7874e-06, + "loss": 0.7506, + "step": 1070 + }, + { + "epoch": 0.108, + "grad_norm": 53.28046798706055, + "learning_rate": 9.785400000000001e-06, + "loss": 0.7343, + "step": 1080 + }, + { + "epoch": 0.109, + "grad_norm": 34.919734954833984, + "learning_rate": 9.7834e-06, + "loss": 0.8653, + "step": 1090 + }, + { + "epoch": 0.11, + "grad_norm": 26.23090171813965, + "learning_rate": 9.781400000000001e-06, + "loss": 0.717, + "step": 1100 + }, + { + "epoch": 0.111, + "grad_norm": 22.868240356445312, + "learning_rate": 9.779400000000002e-06, + "loss": 0.5845, + "step": 1110 + }, + { + "epoch": 0.112, + "grad_norm": 32.55518341064453, + "learning_rate": 9.7774e-06, + "loss": 0.8183, + "step": 1120 + }, + { + "epoch": 0.113, + "grad_norm": 14.902539253234863, + "learning_rate": 9.7754e-06, + "loss": 0.7217, + "step": 1130 + }, + { + "epoch": 0.114, + "grad_norm": 39.822017669677734, + "learning_rate": 9.7734e-06, + "loss": 0.6007, + "step": 1140 + }, + { + "epoch": 0.115, + "grad_norm": 94.34113311767578, + "learning_rate": 9.771400000000001e-06, + "loss": 0.6261, + "step": 1150 + }, + { + "epoch": 0.116, + "grad_norm": 46.82630157470703, + "learning_rate": 9.769400000000002e-06, + "loss": 0.9339, + "step": 1160 + }, + { + "epoch": 0.117, + "grad_norm": 29.478790283203125, + "learning_rate": 9.7674e-06, + "loss": 0.5997, + "step": 1170 + }, + { + "epoch": 0.118, + "grad_norm": 24.89198875427246, + "learning_rate": 9.7654e-06, + "loss": 0.9387, + "step": 1180 + }, + { + "epoch": 0.119, + "grad_norm": 41.31000518798828, + "learning_rate": 9.7634e-06, + "loss": 0.7602, + "step": 1190 + }, + { + "epoch": 0.12, + "grad_norm": 22.63460350036621, + "learning_rate": 9.761400000000001e-06, + "loss": 0.6417, + "step": 1200 + }, + { + "epoch": 0.121, + "grad_norm": 30.559040069580078, + "learning_rate": 9.759400000000002e-06, + "loss": 0.8557, + "step": 1210 + }, + { + "epoch": 0.122, + "grad_norm": 35.72277069091797, + "learning_rate": 9.7574e-06, + "loss": 0.7212, + "step": 1220 + }, + { + "epoch": 0.123, + "grad_norm": 19.481369018554688, + "learning_rate": 9.7554e-06, + "loss": 0.7667, + "step": 1230 + }, + { + "epoch": 0.124, + "grad_norm": 5.373721599578857, + "learning_rate": 9.7534e-06, + "loss": 0.5497, + "step": 1240 + }, + { + "epoch": 0.125, + "grad_norm": 40.84144592285156, + "learning_rate": 9.751400000000001e-06, + "loss": 0.717, + "step": 1250 + }, + { + "epoch": 0.126, + "grad_norm": 23.940317153930664, + "learning_rate": 9.749400000000002e-06, + "loss": 0.7161, + "step": 1260 + }, + { + "epoch": 0.127, + "grad_norm": 21.63367462158203, + "learning_rate": 9.747400000000001e-06, + "loss": 0.8505, + "step": 1270 + }, + { + "epoch": 0.128, + "grad_norm": 24.01799774169922, + "learning_rate": 9.7454e-06, + "loss": 0.6953, + "step": 1280 + }, + { + "epoch": 0.129, + "grad_norm": 31.635465621948242, + "learning_rate": 9.7434e-06, + "loss": 0.7121, + "step": 1290 + }, + { + "epoch": 0.13, + "grad_norm": 24.7454776763916, + "learning_rate": 9.741400000000001e-06, + "loss": 0.8494, + "step": 1300 + }, + { + "epoch": 0.131, + "grad_norm": 25.52398109436035, + "learning_rate": 9.7394e-06, + "loss": 0.7229, + "step": 1310 + }, + { + "epoch": 0.132, + "grad_norm": 30.034908294677734, + "learning_rate": 9.737400000000001e-06, + "loss": 0.7787, + "step": 1320 + }, + { + "epoch": 0.133, + "grad_norm": 11.845321655273438, + "learning_rate": 9.7354e-06, + "loss": 0.8125, + "step": 1330 + }, + { + "epoch": 0.134, + "grad_norm": 25.581470489501953, + "learning_rate": 9.7334e-06, + "loss": 0.619, + "step": 1340 + }, + { + "epoch": 0.135, + "grad_norm": 37.586360931396484, + "learning_rate": 9.731400000000001e-06, + "loss": 0.7928, + "step": 1350 + }, + { + "epoch": 0.136, + "grad_norm": 40.47153091430664, + "learning_rate": 9.7294e-06, + "loss": 0.7839, + "step": 1360 + }, + { + "epoch": 0.137, + "grad_norm": 25.44156837463379, + "learning_rate": 9.727400000000001e-06, + "loss": 0.6141, + "step": 1370 + }, + { + "epoch": 0.138, + "grad_norm": 23.60976219177246, + "learning_rate": 9.7254e-06, + "loss": 0.9117, + "step": 1380 + }, + { + "epoch": 0.139, + "grad_norm": 23.967390060424805, + "learning_rate": 9.7234e-06, + "loss": 0.6394, + "step": 1390 + }, + { + "epoch": 0.14, + "grad_norm": 25.87657928466797, + "learning_rate": 9.721400000000001e-06, + "loss": 0.6832, + "step": 1400 + }, + { + "epoch": 0.141, + "grad_norm": 18.35723114013672, + "learning_rate": 9.7194e-06, + "loss": 0.6657, + "step": 1410 + }, + { + "epoch": 0.142, + "grad_norm": 44.30083465576172, + "learning_rate": 9.717400000000001e-06, + "loss": 0.7639, + "step": 1420 + }, + { + "epoch": 0.143, + "grad_norm": 30.185522079467773, + "learning_rate": 9.7154e-06, + "loss": 0.8301, + "step": 1430 + }, + { + "epoch": 0.144, + "grad_norm": 16.693023681640625, + "learning_rate": 9.713400000000001e-06, + "loss": 0.7492, + "step": 1440 + }, + { + "epoch": 0.145, + "grad_norm": 46.640113830566406, + "learning_rate": 9.711400000000002e-06, + "loss": 0.7384, + "step": 1450 + }, + { + "epoch": 0.146, + "grad_norm": 14.902393341064453, + "learning_rate": 9.7094e-06, + "loss": 0.6725, + "step": 1460 + }, + { + "epoch": 0.147, + "grad_norm": 24.02662467956543, + "learning_rate": 9.7076e-06, + "loss": 0.7805, + "step": 1470 + }, + { + "epoch": 0.148, + "grad_norm": 169.85626220703125, + "learning_rate": 9.7056e-06, + "loss": 0.9443, + "step": 1480 + }, + { + "epoch": 0.149, + "grad_norm": 25.04290771484375, + "learning_rate": 9.703600000000001e-06, + "loss": 0.7477, + "step": 1490 + }, + { + "epoch": 0.15, + "grad_norm": 21.511314392089844, + "learning_rate": 9.7016e-06, + "loss": 0.6933, + "step": 1500 + }, + { + "epoch": 0.151, + "grad_norm": 34.140377044677734, + "learning_rate": 9.699600000000001e-06, + "loss": 0.7412, + "step": 1510 + }, + { + "epoch": 0.152, + "grad_norm": 29.6060848236084, + "learning_rate": 9.6976e-06, + "loss": 0.669, + "step": 1520 + }, + { + "epoch": 0.153, + "grad_norm": 26.4044246673584, + "learning_rate": 9.6956e-06, + "loss": 0.7043, + "step": 1530 + }, + { + "epoch": 0.154, + "grad_norm": 35.81393814086914, + "learning_rate": 9.693600000000002e-06, + "loss": 0.6838, + "step": 1540 + }, + { + "epoch": 0.155, + "grad_norm": 31.620975494384766, + "learning_rate": 9.6916e-06, + "loss": 0.5365, + "step": 1550 + }, + { + "epoch": 0.156, + "grad_norm": 25.63037109375, + "learning_rate": 9.689600000000001e-06, + "loss": 0.5885, + "step": 1560 + }, + { + "epoch": 0.157, + "grad_norm": 54.12228012084961, + "learning_rate": 9.6876e-06, + "loss": 0.8745, + "step": 1570 + }, + { + "epoch": 0.158, + "grad_norm": 32.007720947265625, + "learning_rate": 9.685600000000001e-06, + "loss": 0.8223, + "step": 1580 + }, + { + "epoch": 0.159, + "grad_norm": 29.423542022705078, + "learning_rate": 9.683600000000002e-06, + "loss": 0.8957, + "step": 1590 + }, + { + "epoch": 0.16, + "grad_norm": 20.0703125, + "learning_rate": 9.6816e-06, + "loss": 0.5619, + "step": 1600 + }, + { + "epoch": 0.161, + "grad_norm": 24.575084686279297, + "learning_rate": 9.679600000000001e-06, + "loss": 0.8571, + "step": 1610 + }, + { + "epoch": 0.162, + "grad_norm": 18.266128540039062, + "learning_rate": 9.6776e-06, + "loss": 0.7458, + "step": 1620 + }, + { + "epoch": 0.163, + "grad_norm": 16.532207489013672, + "learning_rate": 9.675600000000001e-06, + "loss": 0.7339, + "step": 1630 + }, + { + "epoch": 0.164, + "grad_norm": 32.43312072753906, + "learning_rate": 9.6736e-06, + "loss": 0.8568, + "step": 1640 + }, + { + "epoch": 0.165, + "grad_norm": 28.956008911132812, + "learning_rate": 9.6716e-06, + "loss": 0.698, + "step": 1650 + }, + { + "epoch": 0.166, + "grad_norm": 22.16632652282715, + "learning_rate": 9.669600000000001e-06, + "loss": 0.6527, + "step": 1660 + }, + { + "epoch": 0.167, + "grad_norm": 38.9682502746582, + "learning_rate": 9.6676e-06, + "loss": 0.7314, + "step": 1670 + }, + { + "epoch": 0.168, + "grad_norm": 35.68903350830078, + "learning_rate": 9.665600000000001e-06, + "loss": 1.1168, + "step": 1680 + }, + { + "epoch": 0.169, + "grad_norm": 7.560277462005615, + "learning_rate": 9.6636e-06, + "loss": 0.7461, + "step": 1690 + }, + { + "epoch": 0.17, + "grad_norm": 12.822694778442383, + "learning_rate": 9.6616e-06, + "loss": 0.7006, + "step": 1700 + }, + { + "epoch": 0.171, + "grad_norm": 36.905982971191406, + "learning_rate": 9.659600000000002e-06, + "loss": 0.5886, + "step": 1710 + }, + { + "epoch": 0.172, + "grad_norm": 32.26405715942383, + "learning_rate": 9.6576e-06, + "loss": 0.7071, + "step": 1720 + }, + { + "epoch": 0.173, + "grad_norm": 27.82394027709961, + "learning_rate": 9.655600000000001e-06, + "loss": 0.6715, + "step": 1730 + }, + { + "epoch": 0.174, + "grad_norm": 30.092735290527344, + "learning_rate": 9.6536e-06, + "loss": 0.7702, + "step": 1740 + }, + { + "epoch": 0.175, + "grad_norm": 55.14793395996094, + "learning_rate": 9.651600000000001e-06, + "loss": 0.9611, + "step": 1750 + }, + { + "epoch": 0.176, + "grad_norm": 33.633914947509766, + "learning_rate": 9.649600000000002e-06, + "loss": 0.8827, + "step": 1760 + }, + { + "epoch": 0.177, + "grad_norm": 21.440275192260742, + "learning_rate": 9.6476e-06, + "loss": 0.6002, + "step": 1770 + }, + { + "epoch": 0.178, + "grad_norm": 54.77036666870117, + "learning_rate": 9.645600000000001e-06, + "loss": 0.6598, + "step": 1780 + }, + { + "epoch": 0.179, + "grad_norm": 9.992846488952637, + "learning_rate": 9.6436e-06, + "loss": 0.7752, + "step": 1790 + }, + { + "epoch": 0.18, + "grad_norm": 21.36663246154785, + "learning_rate": 9.641600000000001e-06, + "loss": 0.6992, + "step": 1800 + }, + { + "epoch": 0.181, + "grad_norm": 22.170259475708008, + "learning_rate": 9.639600000000002e-06, + "loss": 0.6805, + "step": 1810 + }, + { + "epoch": 0.182, + "grad_norm": 38.906578063964844, + "learning_rate": 9.6376e-06, + "loss": 0.8172, + "step": 1820 + }, + { + "epoch": 0.183, + "grad_norm": 54.6015739440918, + "learning_rate": 9.6356e-06, + "loss": 0.9855, + "step": 1830 + }, + { + "epoch": 0.184, + "grad_norm": 59.32664108276367, + "learning_rate": 9.6336e-06, + "loss": 0.8278, + "step": 1840 + }, + { + "epoch": 0.185, + "grad_norm": 14.2304048538208, + "learning_rate": 9.631600000000001e-06, + "loss": 0.8763, + "step": 1850 + }, + { + "epoch": 0.186, + "grad_norm": 29.78812026977539, + "learning_rate": 9.629600000000002e-06, + "loss": 0.9618, + "step": 1860 + }, + { + "epoch": 0.187, + "grad_norm": 14.675698280334473, + "learning_rate": 9.6276e-06, + "loss": 0.6545, + "step": 1870 + }, + { + "epoch": 0.188, + "grad_norm": 24.466876983642578, + "learning_rate": 9.6256e-06, + "loss": 0.7284, + "step": 1880 + }, + { + "epoch": 0.189, + "grad_norm": 7.637043476104736, + "learning_rate": 9.6236e-06, + "loss": 0.723, + "step": 1890 + }, + { + "epoch": 0.19, + "grad_norm": 19.321901321411133, + "learning_rate": 9.621600000000001e-06, + "loss": 0.8099, + "step": 1900 + }, + { + "epoch": 0.191, + "grad_norm": 18.512929916381836, + "learning_rate": 9.619600000000002e-06, + "loss": 0.7765, + "step": 1910 + }, + { + "epoch": 0.192, + "grad_norm": 16.116796493530273, + "learning_rate": 9.617600000000001e-06, + "loss": 0.7287, + "step": 1920 + }, + { + "epoch": 0.193, + "grad_norm": 29.692922592163086, + "learning_rate": 9.6156e-06, + "loss": 0.8104, + "step": 1930 + }, + { + "epoch": 0.194, + "grad_norm": 34.71347427368164, + "learning_rate": 9.6136e-06, + "loss": 0.6277, + "step": 1940 + }, + { + "epoch": 0.195, + "grad_norm": 8.180668830871582, + "learning_rate": 9.611600000000001e-06, + "loss": 0.5576, + "step": 1950 + }, + { + "epoch": 0.196, + "grad_norm": 24.448158264160156, + "learning_rate": 9.609600000000002e-06, + "loss": 0.6852, + "step": 1960 + }, + { + "epoch": 0.197, + "grad_norm": 29.8192081451416, + "learning_rate": 9.607600000000001e-06, + "loss": 0.5809, + "step": 1970 + }, + { + "epoch": 0.198, + "grad_norm": 25.80632972717285, + "learning_rate": 9.6056e-06, + "loss": 0.9618, + "step": 1980 + }, + { + "epoch": 0.199, + "grad_norm": 44.71590042114258, + "learning_rate": 9.6036e-06, + "loss": 0.9214, + "step": 1990 + }, + { + "epoch": 0.2, + "grad_norm": 25.80580711364746, + "learning_rate": 9.601600000000001e-06, + "loss": 0.6859, + "step": 2000 + }, + { + "epoch": 0.201, + "grad_norm": 11.477659225463867, + "learning_rate": 9.5996e-06, + "loss": 0.6822, + "step": 2010 + }, + { + "epoch": 0.202, + "grad_norm": 25.939342498779297, + "learning_rate": 9.597600000000001e-06, + "loss": 0.7485, + "step": 2020 + }, + { + "epoch": 0.203, + "grad_norm": 30.334260940551758, + "learning_rate": 9.5956e-06, + "loss": 0.8371, + "step": 2030 + }, + { + "epoch": 0.204, + "grad_norm": 34.063961029052734, + "learning_rate": 9.5936e-06, + "loss": 1.0189, + "step": 2040 + }, + { + "epoch": 0.205, + "grad_norm": 19.317014694213867, + "learning_rate": 9.591600000000001e-06, + "loss": 0.6944, + "step": 2050 + }, + { + "epoch": 0.206, + "grad_norm": 26.51333999633789, + "learning_rate": 9.5896e-06, + "loss": 0.6656, + "step": 2060 + }, + { + "epoch": 0.207, + "grad_norm": 10.476828575134277, + "learning_rate": 9.5876e-06, + "loss": 0.6312, + "step": 2070 + }, + { + "epoch": 0.208, + "grad_norm": 27.542985916137695, + "learning_rate": 9.5856e-06, + "loss": 0.5976, + "step": 2080 + }, + { + "epoch": 0.209, + "grad_norm": 31.365577697753906, + "learning_rate": 9.583600000000001e-06, + "loss": 0.9775, + "step": 2090 + }, + { + "epoch": 0.21, + "grad_norm": 40.898616790771484, + "learning_rate": 9.581600000000002e-06, + "loss": 0.5258, + "step": 2100 + }, + { + "epoch": 0.211, + "grad_norm": 15.9881591796875, + "learning_rate": 9.5796e-06, + "loss": 0.7965, + "step": 2110 + }, + { + "epoch": 0.212, + "grad_norm": 25.101795196533203, + "learning_rate": 9.5776e-06, + "loss": 0.5959, + "step": 2120 + }, + { + "epoch": 0.213, + "grad_norm": 39.432334899902344, + "learning_rate": 9.5756e-06, + "loss": 0.8365, + "step": 2130 + }, + { + "epoch": 0.214, + "grad_norm": 41.93818283081055, + "learning_rate": 9.573600000000001e-06, + "loss": 0.7073, + "step": 2140 + }, + { + "epoch": 0.215, + "grad_norm": 5.648146629333496, + "learning_rate": 9.571600000000002e-06, + "loss": 0.7014, + "step": 2150 + }, + { + "epoch": 0.216, + "grad_norm": 35.35142517089844, + "learning_rate": 9.5696e-06, + "loss": 0.7572, + "step": 2160 + }, + { + "epoch": 0.217, + "grad_norm": 23.585378646850586, + "learning_rate": 9.567600000000001e-06, + "loss": 0.7152, + "step": 2170 + }, + { + "epoch": 0.218, + "grad_norm": 19.717735290527344, + "learning_rate": 9.5656e-06, + "loss": 0.6492, + "step": 2180 + }, + { + "epoch": 0.219, + "grad_norm": 10.743090629577637, + "learning_rate": 9.563600000000001e-06, + "loss": 0.7152, + "step": 2190 + }, + { + "epoch": 0.22, + "grad_norm": 33.635536193847656, + "learning_rate": 9.5616e-06, + "loss": 0.6707, + "step": 2200 + }, + { + "epoch": 0.221, + "grad_norm": 24.042377471923828, + "learning_rate": 9.5596e-06, + "loss": 0.5237, + "step": 2210 + }, + { + "epoch": 0.222, + "grad_norm": 48.697303771972656, + "learning_rate": 9.557600000000001e-06, + "loss": 0.7784, + "step": 2220 + }, + { + "epoch": 0.223, + "grad_norm": 62.7592658996582, + "learning_rate": 9.5556e-06, + "loss": 0.5507, + "step": 2230 + }, + { + "epoch": 0.224, + "grad_norm": 38.88966369628906, + "learning_rate": 9.553600000000001e-06, + "loss": 0.8598, + "step": 2240 + }, + { + "epoch": 0.225, + "grad_norm": 29.976181030273438, + "learning_rate": 9.5516e-06, + "loss": 0.8038, + "step": 2250 + }, + { + "epoch": 0.226, + "grad_norm": 18.87337875366211, + "learning_rate": 9.549600000000001e-06, + "loss": 0.4878, + "step": 2260 + }, + { + "epoch": 0.227, + "grad_norm": 38.70804214477539, + "learning_rate": 9.547600000000002e-06, + "loss": 0.8455, + "step": 2270 + }, + { + "epoch": 0.228, + "grad_norm": 9.813104629516602, + "learning_rate": 9.5456e-06, + "loss": 0.4139, + "step": 2280 + }, + { + "epoch": 0.229, + "grad_norm": 55.018157958984375, + "learning_rate": 9.543600000000001e-06, + "loss": 1.0867, + "step": 2290 + }, + { + "epoch": 0.23, + "grad_norm": 4.863101005554199, + "learning_rate": 9.5416e-06, + "loss": 0.6808, + "step": 2300 + }, + { + "epoch": 0.231, + "grad_norm": 21.807111740112305, + "learning_rate": 9.539600000000001e-06, + "loss": 0.9384, + "step": 2310 + }, + { + "epoch": 0.232, + "grad_norm": 27.61972999572754, + "learning_rate": 9.537600000000002e-06, + "loss": 0.7813, + "step": 2320 + }, + { + "epoch": 0.233, + "grad_norm": 48.68235397338867, + "learning_rate": 9.5356e-06, + "loss": 0.81, + "step": 2330 + }, + { + "epoch": 0.234, + "grad_norm": 26.039478302001953, + "learning_rate": 9.533600000000001e-06, + "loss": 0.7981, + "step": 2340 + }, + { + "epoch": 0.235, + "grad_norm": 25.755840301513672, + "learning_rate": 9.5316e-06, + "loss": 0.7751, + "step": 2350 + }, + { + "epoch": 0.236, + "grad_norm": 25.323991775512695, + "learning_rate": 9.529600000000001e-06, + "loss": 0.8628, + "step": 2360 + }, + { + "epoch": 0.237, + "grad_norm": 25.357213973999023, + "learning_rate": 9.527600000000002e-06, + "loss": 0.6644, + "step": 2370 + }, + { + "epoch": 0.238, + "grad_norm": 49.66263198852539, + "learning_rate": 9.5256e-06, + "loss": 0.6991, + "step": 2380 + }, + { + "epoch": 0.239, + "grad_norm": 23.181354522705078, + "learning_rate": 9.523600000000001e-06, + "loss": 0.9643, + "step": 2390 + }, + { + "epoch": 0.24, + "grad_norm": 18.877561569213867, + "learning_rate": 9.5216e-06, + "loss": 0.9063, + "step": 2400 + }, + { + "epoch": 0.241, + "grad_norm": 8.216836929321289, + "learning_rate": 9.519600000000001e-06, + "loss": 0.823, + "step": 2410 + }, + { + "epoch": 0.242, + "grad_norm": 16.65896987915039, + "learning_rate": 9.517600000000002e-06, + "loss": 0.587, + "step": 2420 + }, + { + "epoch": 0.243, + "grad_norm": 12.24148178100586, + "learning_rate": 9.515600000000001e-06, + "loss": 0.5954, + "step": 2430 + }, + { + "epoch": 0.244, + "grad_norm": 24.519540786743164, + "learning_rate": 9.5136e-06, + "loss": 0.862, + "step": 2440 + }, + { + "epoch": 0.245, + "grad_norm": 6.640892028808594, + "learning_rate": 9.5116e-06, + "loss": 0.8153, + "step": 2450 + }, + { + "epoch": 0.246, + "grad_norm": 35.18092346191406, + "learning_rate": 9.509600000000001e-06, + "loss": 0.8742, + "step": 2460 + }, + { + "epoch": 0.247, + "grad_norm": 33.762699127197266, + "learning_rate": 9.507600000000002e-06, + "loss": 1.0229, + "step": 2470 + }, + { + "epoch": 0.248, + "grad_norm": 22.815547943115234, + "learning_rate": 9.505600000000001e-06, + "loss": 0.7814, + "step": 2480 + }, + { + "epoch": 0.249, + "grad_norm": 46.83992385864258, + "learning_rate": 9.5036e-06, + "loss": 0.8382, + "step": 2490 + }, + { + "epoch": 0.25, + "grad_norm": 18.844913482666016, + "learning_rate": 9.5016e-06, + "loss": 0.5333, + "step": 2500 + }, + { + "epoch": 0.251, + "grad_norm": 18.164127349853516, + "learning_rate": 9.499600000000001e-06, + "loss": 0.6708, + "step": 2510 + }, + { + "epoch": 0.252, + "grad_norm": 40.63623046875, + "learning_rate": 9.497600000000002e-06, + "loss": 0.7313, + "step": 2520 + }, + { + "epoch": 0.253, + "grad_norm": 32.359256744384766, + "learning_rate": 9.495600000000001e-06, + "loss": 0.7149, + "step": 2530 + }, + { + "epoch": 0.254, + "grad_norm": 22.28809356689453, + "learning_rate": 9.4936e-06, + "loss": 0.8074, + "step": 2540 + }, + { + "epoch": 0.255, + "grad_norm": 19.11900520324707, + "learning_rate": 9.4916e-06, + "loss": 0.8352, + "step": 2550 + }, + { + "epoch": 0.256, + "grad_norm": 44.57843017578125, + "learning_rate": 9.489600000000001e-06, + "loss": 0.6453, + "step": 2560 + }, + { + "epoch": 0.257, + "grad_norm": 13.637824058532715, + "learning_rate": 9.4876e-06, + "loss": 0.7488, + "step": 2570 + }, + { + "epoch": 0.258, + "grad_norm": 13.40095043182373, + "learning_rate": 9.485600000000001e-06, + "loss": 0.7969, + "step": 2580 + }, + { + "epoch": 0.259, + "grad_norm": 31.110763549804688, + "learning_rate": 9.4836e-06, + "loss": 0.8985, + "step": 2590 + }, + { + "epoch": 0.26, + "grad_norm": 21.71891212463379, + "learning_rate": 9.4816e-06, + "loss": 0.5378, + "step": 2600 + }, + { + "epoch": 0.261, + "grad_norm": 43.970008850097656, + "learning_rate": 9.479600000000002e-06, + "loss": 0.7833, + "step": 2610 + }, + { + "epoch": 0.262, + "grad_norm": 25.333736419677734, + "learning_rate": 9.4776e-06, + "loss": 0.8815, + "step": 2620 + }, + { + "epoch": 0.263, + "grad_norm": 22.61463737487793, + "learning_rate": 9.4756e-06, + "loss": 0.6996, + "step": 2630 + }, + { + "epoch": 0.264, + "grad_norm": 29.46462059020996, + "learning_rate": 9.4736e-06, + "loss": 0.7356, + "step": 2640 + }, + { + "epoch": 0.265, + "grad_norm": 19.242321014404297, + "learning_rate": 9.471600000000001e-06, + "loss": 0.9462, + "step": 2650 + }, + { + "epoch": 0.266, + "grad_norm": 29.384977340698242, + "learning_rate": 9.469600000000002e-06, + "loss": 0.8388, + "step": 2660 + }, + { + "epoch": 0.267, + "grad_norm": 25.901931762695312, + "learning_rate": 9.4676e-06, + "loss": 0.5984, + "step": 2670 + }, + { + "epoch": 0.268, + "grad_norm": 18.902681350708008, + "learning_rate": 9.4656e-06, + "loss": 0.6226, + "step": 2680 + }, + { + "epoch": 0.269, + "grad_norm": 38.78340148925781, + "learning_rate": 9.4636e-06, + "loss": 0.729, + "step": 2690 + }, + { + "epoch": 0.27, + "grad_norm": 29.76132583618164, + "learning_rate": 9.461600000000001e-06, + "loss": 0.8336, + "step": 2700 + }, + { + "epoch": 0.271, + "grad_norm": 26.546491622924805, + "learning_rate": 9.459600000000002e-06, + "loss": 0.7328, + "step": 2710 + }, + { + "epoch": 0.272, + "grad_norm": 14.950640678405762, + "learning_rate": 9.4576e-06, + "loss": 0.6283, + "step": 2720 + }, + { + "epoch": 0.273, + "grad_norm": 23.570337295532227, + "learning_rate": 9.4556e-06, + "loss": 0.7588, + "step": 2730 + }, + { + "epoch": 0.274, + "grad_norm": 33.28166961669922, + "learning_rate": 9.4536e-06, + "loss": 0.8514, + "step": 2740 + }, + { + "epoch": 0.275, + "grad_norm": 62.68339920043945, + "learning_rate": 9.451600000000001e-06, + "loss": 0.6574, + "step": 2750 + }, + { + "epoch": 0.276, + "grad_norm": 20.3784236907959, + "learning_rate": 9.449600000000002e-06, + "loss": 1.0182, + "step": 2760 + }, + { + "epoch": 0.277, + "grad_norm": 8.02643871307373, + "learning_rate": 9.4476e-06, + "loss": 0.8452, + "step": 2770 + }, + { + "epoch": 0.278, + "grad_norm": 15.038368225097656, + "learning_rate": 9.4456e-06, + "loss": 0.7439, + "step": 2780 + }, + { + "epoch": 0.279, + "grad_norm": 30.92201042175293, + "learning_rate": 9.4436e-06, + "loss": 0.9571, + "step": 2790 + }, + { + "epoch": 0.28, + "grad_norm": 28.545917510986328, + "learning_rate": 9.441600000000001e-06, + "loss": 0.734, + "step": 2800 + }, + { + "epoch": 0.281, + "grad_norm": 23.49970054626465, + "learning_rate": 9.4396e-06, + "loss": 0.9253, + "step": 2810 + }, + { + "epoch": 0.282, + "grad_norm": 24.050922393798828, + "learning_rate": 9.437600000000001e-06, + "loss": 0.8319, + "step": 2820 + }, + { + "epoch": 0.283, + "grad_norm": 13.687472343444824, + "learning_rate": 9.4356e-06, + "loss": 0.8918, + "step": 2830 + }, + { + "epoch": 0.284, + "grad_norm": 13.137202262878418, + "learning_rate": 9.4336e-06, + "loss": 0.9102, + "step": 2840 + }, + { + "epoch": 0.285, + "grad_norm": 10.729533195495605, + "learning_rate": 9.431600000000001e-06, + "loss": 0.7587, + "step": 2850 + }, + { + "epoch": 0.286, + "grad_norm": 30.572080612182617, + "learning_rate": 9.4296e-06, + "loss": 0.8736, + "step": 2860 + }, + { + "epoch": 0.287, + "grad_norm": 53.83828353881836, + "learning_rate": 9.427600000000001e-06, + "loss": 0.8348, + "step": 2870 + }, + { + "epoch": 0.288, + "grad_norm": 14.523385047912598, + "learning_rate": 9.4256e-06, + "loss": 0.8429, + "step": 2880 + }, + { + "epoch": 0.289, + "grad_norm": 22.808746337890625, + "learning_rate": 9.4236e-06, + "loss": 0.7983, + "step": 2890 + }, + { + "epoch": 0.29, + "grad_norm": 18.653146743774414, + "learning_rate": 9.421600000000001e-06, + "loss": 0.7738, + "step": 2900 + }, + { + "epoch": 0.291, + "grad_norm": 11.08884334564209, + "learning_rate": 9.4196e-06, + "loss": 0.8135, + "step": 2910 + }, + { + "epoch": 0.292, + "grad_norm": 11.973413467407227, + "learning_rate": 9.417600000000001e-06, + "loss": 0.703, + "step": 2920 + }, + { + "epoch": 0.293, + "grad_norm": 41.609397888183594, + "learning_rate": 9.4156e-06, + "loss": 0.5246, + "step": 2930 + }, + { + "epoch": 0.294, + "grad_norm": 7.794600963592529, + "learning_rate": 9.4136e-06, + "loss": 0.6844, + "step": 2940 + }, + { + "epoch": 0.295, + "grad_norm": 47.65550231933594, + "learning_rate": 9.411600000000002e-06, + "loss": 0.7167, + "step": 2950 + }, + { + "epoch": 0.296, + "grad_norm": 28.062644958496094, + "learning_rate": 9.4096e-06, + "loss": 0.7536, + "step": 2960 + }, + { + "epoch": 0.297, + "grad_norm": 11.334671020507812, + "learning_rate": 9.407600000000001e-06, + "loss": 0.7353, + "step": 2970 + }, + { + "epoch": 0.298, + "grad_norm": 38.403076171875, + "learning_rate": 9.4056e-06, + "loss": 0.8809, + "step": 2980 + }, + { + "epoch": 0.299, + "grad_norm": 21.346092224121094, + "learning_rate": 9.403600000000001e-06, + "loss": 0.5042, + "step": 2990 + }, + { + "epoch": 0.3, + "grad_norm": 15.392056465148926, + "learning_rate": 9.4016e-06, + "loss": 0.5377, + "step": 3000 + }, + { + "epoch": 0.301, + "grad_norm": 28.40464210510254, + "learning_rate": 9.3996e-06, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.302, + "grad_norm": 38.49671173095703, + "learning_rate": 9.397600000000001e-06, + "loss": 0.8251, + "step": 3020 + }, + { + "epoch": 0.303, + "grad_norm": 22.052934646606445, + "learning_rate": 9.3956e-06, + "loss": 0.6006, + "step": 3030 + }, + { + "epoch": 0.304, + "grad_norm": 44.930458068847656, + "learning_rate": 9.393600000000001e-06, + "loss": 0.9245, + "step": 3040 + }, + { + "epoch": 0.305, + "grad_norm": 56.81517028808594, + "learning_rate": 9.3916e-06, + "loss": 0.6957, + "step": 3050 + }, + { + "epoch": 0.306, + "grad_norm": 43.52264404296875, + "learning_rate": 9.3896e-06, + "loss": 0.7816, + "step": 3060 + }, + { + "epoch": 0.307, + "grad_norm": 24.788782119750977, + "learning_rate": 9.387600000000001e-06, + "loss": 0.7707, + "step": 3070 + }, + { + "epoch": 0.308, + "grad_norm": 40.297454833984375, + "learning_rate": 9.3856e-06, + "loss": 0.6905, + "step": 3080 + }, + { + "epoch": 0.309, + "grad_norm": 16.154659271240234, + "learning_rate": 9.383600000000001e-06, + "loss": 0.6413, + "step": 3090 + }, + { + "epoch": 0.31, + "grad_norm": 24.906816482543945, + "learning_rate": 9.3816e-06, + "loss": 0.6919, + "step": 3100 + }, + { + "epoch": 0.311, + "grad_norm": 9.070622444152832, + "learning_rate": 9.3796e-06, + "loss": 0.7292, + "step": 3110 + }, + { + "epoch": 0.312, + "grad_norm": 25.71718406677246, + "learning_rate": 9.377600000000001e-06, + "loss": 0.767, + "step": 3120 + }, + { + "epoch": 0.313, + "grad_norm": 19.88960075378418, + "learning_rate": 9.3756e-06, + "loss": 0.8662, + "step": 3130 + }, + { + "epoch": 0.314, + "grad_norm": 21.357192993164062, + "learning_rate": 9.373600000000001e-06, + "loss": 0.5693, + "step": 3140 + }, + { + "epoch": 0.315, + "grad_norm": 39.57415771484375, + "learning_rate": 9.3716e-06, + "loss": 0.8721, + "step": 3150 + }, + { + "epoch": 0.316, + "grad_norm": 24.280065536499023, + "learning_rate": 9.369600000000001e-06, + "loss": 0.7282, + "step": 3160 + }, + { + "epoch": 0.317, + "grad_norm": 34.4137077331543, + "learning_rate": 9.367600000000002e-06, + "loss": 0.9604, + "step": 3170 + }, + { + "epoch": 0.318, + "grad_norm": 28.870506286621094, + "learning_rate": 9.3656e-06, + "loss": 0.7249, + "step": 3180 + }, + { + "epoch": 0.319, + "grad_norm": 31.467092514038086, + "learning_rate": 9.363600000000001e-06, + "loss": 0.7248, + "step": 3190 + }, + { + "epoch": 0.32, + "grad_norm": 50.64829635620117, + "learning_rate": 9.3616e-06, + "loss": 0.8032, + "step": 3200 + }, + { + "epoch": 0.321, + "grad_norm": 33.74970626831055, + "learning_rate": 9.359600000000001e-06, + "loss": 0.7636, + "step": 3210 + }, + { + "epoch": 0.322, + "grad_norm": 20.4691162109375, + "learning_rate": 9.357600000000002e-06, + "loss": 0.7254, + "step": 3220 + }, + { + "epoch": 0.323, + "grad_norm": 28.088464736938477, + "learning_rate": 9.3556e-06, + "loss": 0.4999, + "step": 3230 + }, + { + "epoch": 0.324, + "grad_norm": 35.50647735595703, + "learning_rate": 9.3536e-06, + "loss": 0.5983, + "step": 3240 + }, + { + "epoch": 0.325, + "grad_norm": 26.18625259399414, + "learning_rate": 9.3516e-06, + "loss": 0.8037, + "step": 3250 + }, + { + "epoch": 0.326, + "grad_norm": 33.88105392456055, + "learning_rate": 9.349600000000001e-06, + "loss": 0.6124, + "step": 3260 + }, + { + "epoch": 0.327, + "grad_norm": 28.297372817993164, + "learning_rate": 9.347600000000002e-06, + "loss": 0.82, + "step": 3270 + }, + { + "epoch": 0.328, + "grad_norm": 26.45084571838379, + "learning_rate": 9.3456e-06, + "loss": 0.7743, + "step": 3280 + }, + { + "epoch": 0.329, + "grad_norm": 45.73817443847656, + "learning_rate": 9.3436e-06, + "loss": 0.589, + "step": 3290 + }, + { + "epoch": 0.33, + "grad_norm": 30.2762508392334, + "learning_rate": 9.3416e-06, + "loss": 0.6909, + "step": 3300 + }, + { + "epoch": 0.331, + "grad_norm": 16.09083366394043, + "learning_rate": 9.339600000000001e-06, + "loss": 0.5877, + "step": 3310 + }, + { + "epoch": 0.332, + "grad_norm": 28.67357635498047, + "learning_rate": 9.337600000000002e-06, + "loss": 0.7198, + "step": 3320 + }, + { + "epoch": 0.333, + "grad_norm": 33.21657943725586, + "learning_rate": 9.335600000000001e-06, + "loss": 0.5661, + "step": 3330 + }, + { + "epoch": 0.334, + "grad_norm": 30.5037841796875, + "learning_rate": 9.3336e-06, + "loss": 0.7943, + "step": 3340 + }, + { + "epoch": 0.335, + "grad_norm": 34.517642974853516, + "learning_rate": 9.3316e-06, + "loss": 0.6678, + "step": 3350 + }, + { + "epoch": 0.336, + "grad_norm": 47.0224723815918, + "learning_rate": 9.329600000000001e-06, + "loss": 0.7275, + "step": 3360 + }, + { + "epoch": 0.337, + "grad_norm": 33.63166427612305, + "learning_rate": 9.3276e-06, + "loss": 0.6178, + "step": 3370 + }, + { + "epoch": 0.338, + "grad_norm": 16.433677673339844, + "learning_rate": 9.325600000000001e-06, + "loss": 0.8244, + "step": 3380 + }, + { + "epoch": 0.339, + "grad_norm": 31.32615089416504, + "learning_rate": 9.3236e-06, + "loss": 0.7691, + "step": 3390 + }, + { + "epoch": 0.34, + "grad_norm": 22.45935821533203, + "learning_rate": 9.3216e-06, + "loss": 0.5259, + "step": 3400 + }, + { + "epoch": 0.341, + "grad_norm": 10.065863609313965, + "learning_rate": 9.319600000000001e-06, + "loss": 0.6325, + "step": 3410 + }, + { + "epoch": 0.342, + "grad_norm": 19.148534774780273, + "learning_rate": 9.3176e-06, + "loss": 0.5141, + "step": 3420 + }, + { + "epoch": 0.343, + "grad_norm": 30.4162654876709, + "learning_rate": 9.315600000000001e-06, + "loss": 0.7945, + "step": 3430 + }, + { + "epoch": 0.344, + "grad_norm": 53.84823226928711, + "learning_rate": 9.3136e-06, + "loss": 0.7548, + "step": 3440 + }, + { + "epoch": 0.345, + "grad_norm": 3.4882357120513916, + "learning_rate": 9.3116e-06, + "loss": 0.5881, + "step": 3450 + }, + { + "epoch": 0.346, + "grad_norm": 53.241600036621094, + "learning_rate": 9.309600000000001e-06, + "loss": 0.6289, + "step": 3460 + }, + { + "epoch": 0.347, + "grad_norm": 31.7698917388916, + "learning_rate": 9.3076e-06, + "loss": 0.7232, + "step": 3470 + }, + { + "epoch": 0.348, + "grad_norm": 13.279865264892578, + "learning_rate": 9.305600000000001e-06, + "loss": 0.5825, + "step": 3480 + }, + { + "epoch": 0.349, + "grad_norm": 14.930980682373047, + "learning_rate": 9.3036e-06, + "loss": 0.3373, + "step": 3490 + }, + { + "epoch": 0.35, + "grad_norm": 44.31536865234375, + "learning_rate": 9.301600000000001e-06, + "loss": 0.9145, + "step": 3500 + }, + { + "epoch": 0.351, + "grad_norm": 11.612078666687012, + "learning_rate": 9.299600000000002e-06, + "loss": 0.7549, + "step": 3510 + }, + { + "epoch": 0.352, + "grad_norm": 19.171342849731445, + "learning_rate": 9.2976e-06, + "loss": 0.6653, + "step": 3520 + }, + { + "epoch": 0.353, + "grad_norm": 13.827188491821289, + "learning_rate": 9.295600000000001e-06, + "loss": 0.8802, + "step": 3530 + }, + { + "epoch": 0.354, + "grad_norm": 17.23624610900879, + "learning_rate": 9.2936e-06, + "loss": 0.8222, + "step": 3540 + }, + { + "epoch": 0.355, + "grad_norm": 17.355684280395508, + "learning_rate": 9.291600000000001e-06, + "loss": 0.5068, + "step": 3550 + }, + { + "epoch": 0.356, + "grad_norm": 34.479156494140625, + "learning_rate": 9.289600000000002e-06, + "loss": 0.784, + "step": 3560 + }, + { + "epoch": 0.357, + "grad_norm": 29.13589859008789, + "learning_rate": 9.2876e-06, + "loss": 0.7737, + "step": 3570 + }, + { + "epoch": 0.358, + "grad_norm": 33.8214225769043, + "learning_rate": 9.285600000000001e-06, + "loss": 0.8217, + "step": 3580 + }, + { + "epoch": 0.359, + "grad_norm": 41.37405776977539, + "learning_rate": 9.2836e-06, + "loss": 0.7403, + "step": 3590 + }, + { + "epoch": 0.36, + "grad_norm": 31.224672317504883, + "learning_rate": 9.281600000000001e-06, + "loss": 0.692, + "step": 3600 + }, + { + "epoch": 0.361, + "grad_norm": 21.726320266723633, + "learning_rate": 9.2796e-06, + "loss": 0.6739, + "step": 3610 + }, + { + "epoch": 0.362, + "grad_norm": 3.601778030395508, + "learning_rate": 9.2776e-06, + "loss": 0.5509, + "step": 3620 + }, + { + "epoch": 0.363, + "grad_norm": 34.594703674316406, + "learning_rate": 9.275600000000001e-06, + "loss": 0.9467, + "step": 3630 + }, + { + "epoch": 0.364, + "grad_norm": 14.18492603302002, + "learning_rate": 9.2736e-06, + "loss": 0.6881, + "step": 3640 + }, + { + "epoch": 0.365, + "grad_norm": 10.007046699523926, + "learning_rate": 9.271600000000001e-06, + "loss": 0.5958, + "step": 3650 + }, + { + "epoch": 0.366, + "grad_norm": 26.733074188232422, + "learning_rate": 9.2696e-06, + "loss": 0.7343, + "step": 3660 + }, + { + "epoch": 0.367, + "grad_norm": 14.416747093200684, + "learning_rate": 9.267600000000001e-06, + "loss": 0.8798, + "step": 3670 + }, + { + "epoch": 0.368, + "grad_norm": 26.770458221435547, + "learning_rate": 9.265600000000002e-06, + "loss": 0.7193, + "step": 3680 + }, + { + "epoch": 0.369, + "grad_norm": 15.376206398010254, + "learning_rate": 9.2636e-06, + "loss": 0.7509, + "step": 3690 + }, + { + "epoch": 0.37, + "grad_norm": 20.67218589782715, + "learning_rate": 9.261600000000001e-06, + "loss": 0.6673, + "step": 3700 + }, + { + "epoch": 0.371, + "grad_norm": 26.45255470275879, + "learning_rate": 9.2596e-06, + "loss": 0.8062, + "step": 3710 + }, + { + "epoch": 0.372, + "grad_norm": 29.3276309967041, + "learning_rate": 9.257600000000001e-06, + "loss": 0.6185, + "step": 3720 + }, + { + "epoch": 0.373, + "grad_norm": 24.602161407470703, + "learning_rate": 9.255600000000002e-06, + "loss": 0.7317, + "step": 3730 + }, + { + "epoch": 0.374, + "grad_norm": 29.78669548034668, + "learning_rate": 9.2536e-06, + "loss": 0.863, + "step": 3740 + }, + { + "epoch": 0.375, + "grad_norm": 16.003917694091797, + "learning_rate": 9.251600000000001e-06, + "loss": 0.6759, + "step": 3750 + }, + { + "epoch": 0.376, + "grad_norm": 5.657262325286865, + "learning_rate": 9.2496e-06, + "loss": 0.3055, + "step": 3760 + }, + { + "epoch": 0.377, + "grad_norm": 83.36643981933594, + "learning_rate": 9.247600000000001e-06, + "loss": 0.7421, + "step": 3770 + }, + { + "epoch": 0.378, + "grad_norm": 36.70709991455078, + "learning_rate": 9.245600000000002e-06, + "loss": 0.7514, + "step": 3780 + }, + { + "epoch": 0.379, + "grad_norm": 38.573974609375, + "learning_rate": 9.2436e-06, + "loss": 0.6547, + "step": 3790 + }, + { + "epoch": 0.38, + "grad_norm": 45.923824310302734, + "learning_rate": 9.2416e-06, + "loss": 0.6894, + "step": 3800 + }, + { + "epoch": 0.381, + "grad_norm": 26.007081985473633, + "learning_rate": 9.2396e-06, + "loss": 0.5406, + "step": 3810 + }, + { + "epoch": 0.382, + "grad_norm": 42.57291030883789, + "learning_rate": 9.237600000000001e-06, + "loss": 0.5942, + "step": 3820 + }, + { + "epoch": 0.383, + "grad_norm": 37.315677642822266, + "learning_rate": 9.235600000000002e-06, + "loss": 0.7275, + "step": 3830 + }, + { + "epoch": 0.384, + "grad_norm": 18.446134567260742, + "learning_rate": 9.2336e-06, + "loss": 0.6234, + "step": 3840 + }, + { + "epoch": 0.385, + "grad_norm": 21.959327697753906, + "learning_rate": 9.2316e-06, + "loss": 0.9139, + "step": 3850 + }, + { + "epoch": 0.386, + "grad_norm": 16.19249725341797, + "learning_rate": 9.229800000000001e-06, + "loss": 0.9198, + "step": 3860 + }, + { + "epoch": 0.387, + "grad_norm": 24.801454544067383, + "learning_rate": 9.227800000000002e-06, + "loss": 0.8498, + "step": 3870 + }, + { + "epoch": 0.388, + "grad_norm": 20.562063217163086, + "learning_rate": 9.2258e-06, + "loss": 0.9006, + "step": 3880 + }, + { + "epoch": 0.389, + "grad_norm": 27.41887855529785, + "learning_rate": 9.223800000000001e-06, + "loss": 0.6526, + "step": 3890 + }, + { + "epoch": 0.39, + "grad_norm": 10.00798225402832, + "learning_rate": 9.2218e-06, + "loss": 0.6432, + "step": 3900 + }, + { + "epoch": 0.391, + "grad_norm": 22.514768600463867, + "learning_rate": 9.219800000000001e-06, + "loss": 0.8321, + "step": 3910 + }, + { + "epoch": 0.392, + "grad_norm": 22.683561325073242, + "learning_rate": 9.217800000000002e-06, + "loss": 0.6577, + "step": 3920 + }, + { + "epoch": 0.393, + "grad_norm": 38.5459098815918, + "learning_rate": 9.2158e-06, + "loss": 0.7376, + "step": 3930 + }, + { + "epoch": 0.394, + "grad_norm": 7.456604480743408, + "learning_rate": 9.2138e-06, + "loss": 0.5598, + "step": 3940 + }, + { + "epoch": 0.395, + "grad_norm": 22.23004913330078, + "learning_rate": 9.2118e-06, + "loss": 0.7943, + "step": 3950 + }, + { + "epoch": 0.396, + "grad_norm": 14.584146499633789, + "learning_rate": 9.209800000000001e-06, + "loss": 0.802, + "step": 3960 + }, + { + "epoch": 0.397, + "grad_norm": 5.253112316131592, + "learning_rate": 9.207800000000002e-06, + "loss": 0.698, + "step": 3970 + }, + { + "epoch": 0.398, + "grad_norm": 23.153915405273438, + "learning_rate": 9.205800000000001e-06, + "loss": 0.7036, + "step": 3980 + }, + { + "epoch": 0.399, + "grad_norm": 32.066951751708984, + "learning_rate": 9.2038e-06, + "loss": 0.7933, + "step": 3990 + }, + { + "epoch": 0.4, + "grad_norm": 17.568124771118164, + "learning_rate": 9.2018e-06, + "loss": 0.5587, + "step": 4000 + }, + { + "epoch": 0.401, + "grad_norm": 62.3616943359375, + "learning_rate": 9.199800000000001e-06, + "loss": 0.8117, + "step": 4010 + }, + { + "epoch": 0.402, + "grad_norm": 3.00107479095459, + "learning_rate": 9.197800000000002e-06, + "loss": 0.6618, + "step": 4020 + }, + { + "epoch": 0.403, + "grad_norm": 12.941449165344238, + "learning_rate": 9.195800000000001e-06, + "loss": 0.5325, + "step": 4030 + }, + { + "epoch": 0.404, + "grad_norm": 39.835296630859375, + "learning_rate": 9.1938e-06, + "loss": 0.6589, + "step": 4040 + }, + { + "epoch": 0.405, + "grad_norm": 44.73147201538086, + "learning_rate": 9.1918e-06, + "loss": 0.7197, + "step": 4050 + }, + { + "epoch": 0.406, + "grad_norm": 29.60512924194336, + "learning_rate": 9.189800000000001e-06, + "loss": 0.6466, + "step": 4060 + }, + { + "epoch": 0.407, + "grad_norm": 11.546348571777344, + "learning_rate": 9.1878e-06, + "loss": 0.5344, + "step": 4070 + }, + { + "epoch": 0.408, + "grad_norm": 23.700122833251953, + "learning_rate": 9.185800000000001e-06, + "loss": 0.9021, + "step": 4080 + }, + { + "epoch": 0.409, + "grad_norm": 35.56307601928711, + "learning_rate": 9.1838e-06, + "loss": 0.8117, + "step": 4090 + }, + { + "epoch": 0.41, + "grad_norm": 19.759023666381836, + "learning_rate": 9.1818e-06, + "loss": 0.9188, + "step": 4100 + }, + { + "epoch": 0.411, + "grad_norm": 25.926471710205078, + "learning_rate": 9.179800000000001e-06, + "loss": 0.514, + "step": 4110 + }, + { + "epoch": 0.412, + "grad_norm": 28.19685173034668, + "learning_rate": 9.1778e-06, + "loss": 0.5714, + "step": 4120 + }, + { + "epoch": 0.413, + "grad_norm": 29.041406631469727, + "learning_rate": 9.1758e-06, + "loss": 0.6469, + "step": 4130 + }, + { + "epoch": 0.414, + "grad_norm": 27.198598861694336, + "learning_rate": 9.1738e-06, + "loss": 0.7159, + "step": 4140 + }, + { + "epoch": 0.415, + "grad_norm": 20.31609535217285, + "learning_rate": 9.171800000000001e-06, + "loss": 0.7121, + "step": 4150 + }, + { + "epoch": 0.416, + "grad_norm": 49.4409065246582, + "learning_rate": 9.169800000000002e-06, + "loss": 0.818, + "step": 4160 + }, + { + "epoch": 0.417, + "grad_norm": 9.885689735412598, + "learning_rate": 9.1678e-06, + "loss": 0.7141, + "step": 4170 + }, + { + "epoch": 0.418, + "grad_norm": 50.130733489990234, + "learning_rate": 9.1658e-06, + "loss": 0.5292, + "step": 4180 + }, + { + "epoch": 0.419, + "grad_norm": 35.08009719848633, + "learning_rate": 9.1638e-06, + "loss": 0.7885, + "step": 4190 + }, + { + "epoch": 0.42, + "grad_norm": 45.62377166748047, + "learning_rate": 9.161800000000001e-06, + "loss": 0.831, + "step": 4200 + }, + { + "epoch": 0.421, + "grad_norm": 25.133146286010742, + "learning_rate": 9.159800000000002e-06, + "loss": 0.7066, + "step": 4210 + }, + { + "epoch": 0.422, + "grad_norm": 36.50223159790039, + "learning_rate": 9.1578e-06, + "loss": 0.9691, + "step": 4220 + }, + { + "epoch": 0.423, + "grad_norm": 45.53746795654297, + "learning_rate": 9.1558e-06, + "loss": 0.9619, + "step": 4230 + }, + { + "epoch": 0.424, + "grad_norm": 26.474241256713867, + "learning_rate": 9.1538e-06, + "loss": 0.7747, + "step": 4240 + }, + { + "epoch": 0.425, + "grad_norm": 39.54206848144531, + "learning_rate": 9.151800000000001e-06, + "loss": 0.8397, + "step": 4250 + }, + { + "epoch": 0.426, + "grad_norm": 27.512920379638672, + "learning_rate": 9.149800000000002e-06, + "loss": 0.6549, + "step": 4260 + }, + { + "epoch": 0.427, + "grad_norm": 15.803271293640137, + "learning_rate": 9.1478e-06, + "loss": 0.8272, + "step": 4270 + }, + { + "epoch": 0.428, + "grad_norm": 22.9871768951416, + "learning_rate": 9.1458e-06, + "loss": 0.6667, + "step": 4280 + }, + { + "epoch": 0.429, + "grad_norm": 20.597959518432617, + "learning_rate": 9.1438e-06, + "loss": 0.5645, + "step": 4290 + }, + { + "epoch": 0.43, + "grad_norm": 39.363590240478516, + "learning_rate": 9.141800000000001e-06, + "loss": 0.8324, + "step": 4300 + }, + { + "epoch": 0.431, + "grad_norm": 34.98888397216797, + "learning_rate": 9.1398e-06, + "loss": 0.9688, + "step": 4310 + }, + { + "epoch": 0.432, + "grad_norm": 20.55095100402832, + "learning_rate": 9.1378e-06, + "loss": 0.772, + "step": 4320 + }, + { + "epoch": 0.433, + "grad_norm": 30.377758026123047, + "learning_rate": 9.1358e-06, + "loss": 0.9099, + "step": 4330 + }, + { + "epoch": 0.434, + "grad_norm": 31.90020179748535, + "learning_rate": 9.1338e-06, + "loss": 0.9172, + "step": 4340 + }, + { + "epoch": 0.435, + "grad_norm": 28.935474395751953, + "learning_rate": 9.131800000000001e-06, + "loss": 0.5936, + "step": 4350 + }, + { + "epoch": 0.436, + "grad_norm": 31.516016006469727, + "learning_rate": 9.1298e-06, + "loss": 0.7803, + "step": 4360 + }, + { + "epoch": 0.437, + "grad_norm": 89.58314514160156, + "learning_rate": 9.127800000000001e-06, + "loss": 1.0418, + "step": 4370 + }, + { + "epoch": 0.438, + "grad_norm": 27.75042152404785, + "learning_rate": 9.1258e-06, + "loss": 0.6251, + "step": 4380 + }, + { + "epoch": 0.439, + "grad_norm": 18.389522552490234, + "learning_rate": 9.1238e-06, + "loss": 0.6817, + "step": 4390 + }, + { + "epoch": 0.44, + "grad_norm": 30.223163604736328, + "learning_rate": 9.121800000000001e-06, + "loss": 0.741, + "step": 4400 + }, + { + "epoch": 0.441, + "grad_norm": 18.69849395751953, + "learning_rate": 9.1198e-06, + "loss": 0.5822, + "step": 4410 + }, + { + "epoch": 0.442, + "grad_norm": 16.272411346435547, + "learning_rate": 9.117800000000001e-06, + "loss": 0.6329, + "step": 4420 + }, + { + "epoch": 0.443, + "grad_norm": 23.2576904296875, + "learning_rate": 9.1158e-06, + "loss": 0.8683, + "step": 4430 + }, + { + "epoch": 0.444, + "grad_norm": 19.748661041259766, + "learning_rate": 9.1138e-06, + "loss": 0.8318, + "step": 4440 + }, + { + "epoch": 0.445, + "grad_norm": 34.46539306640625, + "learning_rate": 9.111800000000001e-06, + "loss": 0.6966, + "step": 4450 + }, + { + "epoch": 0.446, + "grad_norm": 35.254310607910156, + "learning_rate": 9.1098e-06, + "loss": 0.7138, + "step": 4460 + }, + { + "epoch": 0.447, + "grad_norm": 32.39616775512695, + "learning_rate": 9.107800000000001e-06, + "loss": 0.6498, + "step": 4470 + }, + { + "epoch": 0.448, + "grad_norm": 23.23666000366211, + "learning_rate": 9.1058e-06, + "loss": 0.9085, + "step": 4480 + }, + { + "epoch": 0.449, + "grad_norm": 28.106037139892578, + "learning_rate": 9.1038e-06, + "loss": 0.6794, + "step": 4490 + }, + { + "epoch": 0.45, + "grad_norm": 37.72524642944336, + "learning_rate": 9.1018e-06, + "loss": 0.5902, + "step": 4500 + }, + { + "epoch": 0.451, + "grad_norm": 22.42662239074707, + "learning_rate": 9.0998e-06, + "loss": 0.7308, + "step": 4510 + }, + { + "epoch": 0.452, + "grad_norm": 20.135847091674805, + "learning_rate": 9.097800000000001e-06, + "loss": 0.6052, + "step": 4520 + }, + { + "epoch": 0.453, + "grad_norm": 3.0650992393493652, + "learning_rate": 9.095800000000002e-06, + "loss": 0.6174, + "step": 4530 + }, + { + "epoch": 0.454, + "grad_norm": 309.6864013671875, + "learning_rate": 9.093800000000001e-06, + "loss": 1.0139, + "step": 4540 + }, + { + "epoch": 0.455, + "grad_norm": 32.27659606933594, + "learning_rate": 9.0918e-06, + "loss": 0.8334, + "step": 4550 + }, + { + "epoch": 0.456, + "grad_norm": 4.334067344665527, + "learning_rate": 9.0898e-06, + "loss": 0.6487, + "step": 4560 + }, + { + "epoch": 0.457, + "grad_norm": 7.285878658294678, + "learning_rate": 9.087800000000001e-06, + "loss": 0.5274, + "step": 4570 + }, + { + "epoch": 0.458, + "grad_norm": 68.66569519042969, + "learning_rate": 9.085800000000002e-06, + "loss": 0.7734, + "step": 4580 + }, + { + "epoch": 0.459, + "grad_norm": 36.57508850097656, + "learning_rate": 9.083800000000001e-06, + "loss": 0.8785, + "step": 4590 + }, + { + "epoch": 0.46, + "grad_norm": 49.88356018066406, + "learning_rate": 9.0818e-06, + "loss": 0.6072, + "step": 4600 + }, + { + "epoch": 0.461, + "grad_norm": 28.811969757080078, + "learning_rate": 9.0798e-06, + "loss": 0.701, + "step": 4610 + }, + { + "epoch": 0.462, + "grad_norm": 38.20576477050781, + "learning_rate": 9.077800000000001e-06, + "loss": 0.6207, + "step": 4620 + }, + { + "epoch": 0.463, + "grad_norm": 34.115169525146484, + "learning_rate": 9.075800000000002e-06, + "loss": 0.7065, + "step": 4630 + }, + { + "epoch": 0.464, + "grad_norm": 16.451030731201172, + "learning_rate": 9.073800000000001e-06, + "loss": 0.6654, + "step": 4640 + }, + { + "epoch": 0.465, + "grad_norm": 50.430564880371094, + "learning_rate": 9.0718e-06, + "loss": 0.576, + "step": 4650 + }, + { + "epoch": 0.466, + "grad_norm": 34.623878479003906, + "learning_rate": 9.0698e-06, + "loss": 0.4831, + "step": 4660 + }, + { + "epoch": 0.467, + "grad_norm": 28.038461685180664, + "learning_rate": 9.067800000000002e-06, + "loss": 0.6403, + "step": 4670 + }, + { + "epoch": 0.468, + "grad_norm": 18.259531021118164, + "learning_rate": 9.0658e-06, + "loss": 0.6536, + "step": 4680 + }, + { + "epoch": 0.469, + "grad_norm": 27.56315803527832, + "learning_rate": 9.063800000000001e-06, + "loss": 0.8802, + "step": 4690 + }, + { + "epoch": 0.47, + "grad_norm": 24.04572868347168, + "learning_rate": 9.0618e-06, + "loss": 0.7131, + "step": 4700 + }, + { + "epoch": 0.471, + "grad_norm": 34.79553985595703, + "learning_rate": 9.059800000000001e-06, + "loss": 0.9388, + "step": 4710 + }, + { + "epoch": 0.472, + "grad_norm": 20.2165470123291, + "learning_rate": 9.057800000000002e-06, + "loss": 0.7212, + "step": 4720 + }, + { + "epoch": 0.473, + "grad_norm": 26.301525115966797, + "learning_rate": 9.0558e-06, + "loss": 0.8441, + "step": 4730 + }, + { + "epoch": 0.474, + "grad_norm": 30.426250457763672, + "learning_rate": 9.0538e-06, + "loss": 0.6735, + "step": 4740 + }, + { + "epoch": 0.475, + "grad_norm": 16.273841857910156, + "learning_rate": 9.0518e-06, + "loss": 0.7563, + "step": 4750 + }, + { + "epoch": 0.476, + "grad_norm": 17.23615264892578, + "learning_rate": 9.049800000000001e-06, + "loss": 0.6499, + "step": 4760 + }, + { + "epoch": 0.477, + "grad_norm": 13.083768844604492, + "learning_rate": 9.047800000000002e-06, + "loss": 0.5476, + "step": 4770 + }, + { + "epoch": 0.478, + "grad_norm": 26.91759490966797, + "learning_rate": 9.0458e-06, + "loss": 0.5266, + "step": 4780 + }, + { + "epoch": 0.479, + "grad_norm": 98.05184173583984, + "learning_rate": 9.0438e-06, + "loss": 0.8325, + "step": 4790 + }, + { + "epoch": 0.48, + "grad_norm": 23.273862838745117, + "learning_rate": 9.0418e-06, + "loss": 0.9052, + "step": 4800 + }, + { + "epoch": 0.481, + "grad_norm": 19.2165584564209, + "learning_rate": 9.039800000000001e-06, + "loss": 0.9031, + "step": 4810 + }, + { + "epoch": 0.482, + "grad_norm": 18.614849090576172, + "learning_rate": 9.037800000000002e-06, + "loss": 0.7544, + "step": 4820 + }, + { + "epoch": 0.483, + "grad_norm": 13.22616195678711, + "learning_rate": 9.0358e-06, + "loss": 0.8122, + "step": 4830 + }, + { + "epoch": 0.484, + "grad_norm": 26.158973693847656, + "learning_rate": 9.0338e-06, + "loss": 0.8789, + "step": 4840 + }, + { + "epoch": 0.485, + "grad_norm": 24.33793830871582, + "learning_rate": 9.0318e-06, + "loss": 0.7403, + "step": 4850 + }, + { + "epoch": 0.486, + "grad_norm": 13.415386199951172, + "learning_rate": 9.029800000000001e-06, + "loss": 0.6783, + "step": 4860 + }, + { + "epoch": 0.487, + "grad_norm": 26.246070861816406, + "learning_rate": 9.0278e-06, + "loss": 0.8308, + "step": 4870 + }, + { + "epoch": 0.488, + "grad_norm": 25.685985565185547, + "learning_rate": 9.025800000000001e-06, + "loss": 0.8727, + "step": 4880 + }, + { + "epoch": 0.489, + "grad_norm": 27.296411514282227, + "learning_rate": 9.0238e-06, + "loss": 0.8406, + "step": 4890 + }, + { + "epoch": 0.49, + "grad_norm": 14.517251014709473, + "learning_rate": 9.0218e-06, + "loss": 0.7182, + "step": 4900 + }, + { + "epoch": 0.491, + "grad_norm": 37.61231994628906, + "learning_rate": 9.019800000000001e-06, + "loss": 0.9733, + "step": 4910 + }, + { + "epoch": 0.492, + "grad_norm": 14.171152114868164, + "learning_rate": 9.0178e-06, + "loss": 0.7648, + "step": 4920 + }, + { + "epoch": 0.493, + "grad_norm": 18.871143341064453, + "learning_rate": 9.015800000000001e-06, + "loss": 0.8402, + "step": 4930 + }, + { + "epoch": 0.494, + "grad_norm": 10.27568531036377, + "learning_rate": 9.0138e-06, + "loss": 0.6241, + "step": 4940 + }, + { + "epoch": 0.495, + "grad_norm": 25.206148147583008, + "learning_rate": 9.0118e-06, + "loss": 0.7524, + "step": 4950 + }, + { + "epoch": 0.496, + "grad_norm": 16.134567260742188, + "learning_rate": 9.009800000000001e-06, + "loss": 0.5719, + "step": 4960 + }, + { + "epoch": 0.497, + "grad_norm": 27.282073974609375, + "learning_rate": 9.0078e-06, + "loss": 0.9313, + "step": 4970 + }, + { + "epoch": 0.498, + "grad_norm": 31.6767578125, + "learning_rate": 9.005800000000001e-06, + "loss": 0.7742, + "step": 4980 + }, + { + "epoch": 0.499, + "grad_norm": 47.10121536254883, + "learning_rate": 9.0038e-06, + "loss": 0.726, + "step": 4990 + }, + { + "epoch": 0.5, + "grad_norm": 32.248146057128906, + "learning_rate": 9.0018e-06, + "loss": 0.6758, + "step": 5000 + }, + { + "epoch": 0.501, + "grad_norm": 21.67167854309082, + "learning_rate": 8.999800000000001e-06, + "loss": 0.8787, + "step": 5010 + }, + { + "epoch": 0.502, + "grad_norm": 22.77020835876465, + "learning_rate": 8.9978e-06, + "loss": 0.5766, + "step": 5020 + }, + { + "epoch": 0.503, + "grad_norm": 31.036142349243164, + "learning_rate": 8.995800000000001e-06, + "loss": 0.837, + "step": 5030 + }, + { + "epoch": 0.504, + "grad_norm": 18.027769088745117, + "learning_rate": 8.9938e-06, + "loss": 0.8028, + "step": 5040 + }, + { + "epoch": 0.505, + "grad_norm": 23.31261444091797, + "learning_rate": 8.991800000000001e-06, + "loss": 0.5677, + "step": 5050 + }, + { + "epoch": 0.506, + "grad_norm": 22.90900421142578, + "learning_rate": 8.989800000000002e-06, + "loss": 0.7327, + "step": 5060 + }, + { + "epoch": 0.507, + "grad_norm": 29.72652244567871, + "learning_rate": 8.9878e-06, + "loss": 0.5881, + "step": 5070 + }, + { + "epoch": 0.508, + "grad_norm": 29.568384170532227, + "learning_rate": 8.985800000000001e-06, + "loss": 0.6233, + "step": 5080 + }, + { + "epoch": 0.509, + "grad_norm": 38.03068923950195, + "learning_rate": 8.9838e-06, + "loss": 0.7105, + "step": 5090 + }, + { + "epoch": 0.51, + "grad_norm": 42.80902862548828, + "learning_rate": 8.981800000000001e-06, + "loss": 0.727, + "step": 5100 + }, + { + "epoch": 0.511, + "grad_norm": 39.31882858276367, + "learning_rate": 8.9798e-06, + "loss": 0.6516, + "step": 5110 + }, + { + "epoch": 0.512, + "grad_norm": 28.98011589050293, + "learning_rate": 8.9778e-06, + "loss": 0.6184, + "step": 5120 + }, + { + "epoch": 0.513, + "grad_norm": 26.84391212463379, + "learning_rate": 8.975800000000001e-06, + "loss": 1.0467, + "step": 5130 + }, + { + "epoch": 0.514, + "grad_norm": 35.17725372314453, + "learning_rate": 8.9738e-06, + "loss": 0.9125, + "step": 5140 + }, + { + "epoch": 0.515, + "grad_norm": 34.84568405151367, + "learning_rate": 8.971800000000001e-06, + "loss": 0.5917, + "step": 5150 + }, + { + "epoch": 0.516, + "grad_norm": 23.5303897857666, + "learning_rate": 8.9698e-06, + "loss": 0.6795, + "step": 5160 + }, + { + "epoch": 0.517, + "grad_norm": 48.519927978515625, + "learning_rate": 8.9678e-06, + "loss": 0.7996, + "step": 5170 + }, + { + "epoch": 0.518, + "grad_norm": 9.060611724853516, + "learning_rate": 8.965800000000001e-06, + "loss": 0.67, + "step": 5180 + }, + { + "epoch": 0.519, + "grad_norm": 15.595134735107422, + "learning_rate": 8.9638e-06, + "loss": 0.9602, + "step": 5190 + }, + { + "epoch": 0.52, + "grad_norm": 13.556856155395508, + "learning_rate": 8.961800000000001e-06, + "loss": 0.7247, + "step": 5200 + }, + { + "epoch": 0.521, + "grad_norm": 21.102563858032227, + "learning_rate": 8.9598e-06, + "loss": 0.8323, + "step": 5210 + }, + { + "epoch": 0.522, + "grad_norm": 18.13496208190918, + "learning_rate": 8.957800000000001e-06, + "loss": 0.6505, + "step": 5220 + }, + { + "epoch": 0.523, + "grad_norm": 5.696938514709473, + "learning_rate": 8.955800000000002e-06, + "loss": 0.7781, + "step": 5230 + }, + { + "epoch": 0.524, + "grad_norm": 37.51520538330078, + "learning_rate": 8.9538e-06, + "loss": 1.074, + "step": 5240 + }, + { + "epoch": 0.525, + "grad_norm": 8.744912147521973, + "learning_rate": 8.951800000000001e-06, + "loss": 0.6568, + "step": 5250 + }, + { + "epoch": 0.526, + "grad_norm": 23.32012367248535, + "learning_rate": 8.9498e-06, + "loss": 0.9314, + "step": 5260 + }, + { + "epoch": 0.527, + "grad_norm": 15.782977104187012, + "learning_rate": 8.947800000000001e-06, + "loss": 0.7599, + "step": 5270 + }, + { + "epoch": 0.528, + "grad_norm": 14.995272636413574, + "learning_rate": 8.945800000000002e-06, + "loss": 0.6917, + "step": 5280 + }, + { + "epoch": 0.529, + "grad_norm": 15.904824256896973, + "learning_rate": 8.9438e-06, + "loss": 0.7478, + "step": 5290 + }, + { + "epoch": 0.53, + "grad_norm": 25.889888763427734, + "learning_rate": 8.9418e-06, + "loss": 0.6467, + "step": 5300 + }, + { + "epoch": 0.531, + "grad_norm": 10.381155014038086, + "learning_rate": 8.9398e-06, + "loss": 0.6683, + "step": 5310 + }, + { + "epoch": 0.532, + "grad_norm": 21.153766632080078, + "learning_rate": 8.937800000000001e-06, + "loss": 0.5883, + "step": 5320 + }, + { + "epoch": 0.533, + "grad_norm": 35.405887603759766, + "learning_rate": 8.935800000000002e-06, + "loss": 0.6827, + "step": 5330 + }, + { + "epoch": 0.534, + "grad_norm": 34.55176544189453, + "learning_rate": 8.9338e-06, + "loss": 0.5616, + "step": 5340 + }, + { + "epoch": 0.535, + "grad_norm": 48.410457611083984, + "learning_rate": 8.9318e-06, + "loss": 0.8742, + "step": 5350 + }, + { + "epoch": 0.536, + "grad_norm": 30.154441833496094, + "learning_rate": 8.9298e-06, + "loss": 0.7361, + "step": 5360 + }, + { + "epoch": 0.537, + "grad_norm": 22.52509117126465, + "learning_rate": 8.927800000000001e-06, + "loss": 0.8061, + "step": 5370 + }, + { + "epoch": 0.538, + "grad_norm": 33.107120513916016, + "learning_rate": 8.925800000000002e-06, + "loss": 0.9955, + "step": 5380 + }, + { + "epoch": 0.539, + "grad_norm": 26.009675979614258, + "learning_rate": 8.923800000000001e-06, + "loss": 0.7532, + "step": 5390 + }, + { + "epoch": 0.54, + "grad_norm": 16.156719207763672, + "learning_rate": 8.9218e-06, + "loss": 0.6046, + "step": 5400 + }, + { + "epoch": 0.541, + "grad_norm": 34.604610443115234, + "learning_rate": 8.9198e-06, + "loss": 1.0056, + "step": 5410 + }, + { + "epoch": 0.542, + "grad_norm": 7.621533393859863, + "learning_rate": 8.917800000000001e-06, + "loss": 0.6947, + "step": 5420 + }, + { + "epoch": 0.543, + "grad_norm": 22.182506561279297, + "learning_rate": 8.915800000000002e-06, + "loss": 0.7304, + "step": 5430 + }, + { + "epoch": 0.544, + "grad_norm": 17.386693954467773, + "learning_rate": 8.913800000000001e-06, + "loss": 0.8156, + "step": 5440 + }, + { + "epoch": 0.545, + "grad_norm": 28.76013946533203, + "learning_rate": 8.9118e-06, + "loss": 0.4821, + "step": 5450 + }, + { + "epoch": 0.546, + "grad_norm": 42.765296936035156, + "learning_rate": 8.9098e-06, + "loss": 0.871, + "step": 5460 + }, + { + "epoch": 0.547, + "grad_norm": 31.4482479095459, + "learning_rate": 8.907800000000001e-06, + "loss": 0.79, + "step": 5470 + }, + { + "epoch": 0.548, + "grad_norm": 21.309450149536133, + "learning_rate": 8.9058e-06, + "loss": 0.5208, + "step": 5480 + }, + { + "epoch": 0.549, + "grad_norm": 26.79330062866211, + "learning_rate": 8.904e-06, + "loss": 0.5881, + "step": 5490 + }, + { + "epoch": 0.55, + "grad_norm": 19.553604125976562, + "learning_rate": 8.902e-06, + "loss": 0.9493, + "step": 5500 + }, + { + "epoch": 0.551, + "grad_norm": 25.66444969177246, + "learning_rate": 8.900000000000001e-06, + "loss": 0.5148, + "step": 5510 + }, + { + "epoch": 0.552, + "grad_norm": 33.9870491027832, + "learning_rate": 8.898000000000002e-06, + "loss": 0.4237, + "step": 5520 + }, + { + "epoch": 0.553, + "grad_norm": 38.512290954589844, + "learning_rate": 8.896000000000001e-06, + "loss": 0.673, + "step": 5530 + }, + { + "epoch": 0.554, + "grad_norm": 35.605525970458984, + "learning_rate": 8.894e-06, + "loss": 0.767, + "step": 5540 + }, + { + "epoch": 0.555, + "grad_norm": 42.295989990234375, + "learning_rate": 8.892e-06, + "loss": 0.9592, + "step": 5550 + }, + { + "epoch": 0.556, + "grad_norm": 40.66447830200195, + "learning_rate": 8.890000000000001e-06, + "loss": 0.7368, + "step": 5560 + }, + { + "epoch": 0.557, + "grad_norm": 28.294403076171875, + "learning_rate": 8.888e-06, + "loss": 0.8183, + "step": 5570 + }, + { + "epoch": 0.558, + "grad_norm": 25.432598114013672, + "learning_rate": 8.886000000000001e-06, + "loss": 0.8464, + "step": 5580 + }, + { + "epoch": 0.559, + "grad_norm": 33.40468215942383, + "learning_rate": 8.884e-06, + "loss": 0.6672, + "step": 5590 + }, + { + "epoch": 0.56, + "grad_norm": 27.27119255065918, + "learning_rate": 8.882e-06, + "loss": 0.6753, + "step": 5600 + }, + { + "epoch": 0.561, + "grad_norm": 27.768653869628906, + "learning_rate": 8.880000000000001e-06, + "loss": 0.6132, + "step": 5610 + }, + { + "epoch": 0.562, + "grad_norm": 9.56386947631836, + "learning_rate": 8.878e-06, + "loss": 0.8525, + "step": 5620 + }, + { + "epoch": 0.563, + "grad_norm": 39.86705017089844, + "learning_rate": 8.876e-06, + "loss": 0.9229, + "step": 5630 + }, + { + "epoch": 0.564, + "grad_norm": 31.844165802001953, + "learning_rate": 8.874e-06, + "loss": 0.9162, + "step": 5640 + }, + { + "epoch": 0.565, + "grad_norm": 30.67936897277832, + "learning_rate": 8.872e-06, + "loss": 0.6348, + "step": 5650 + }, + { + "epoch": 0.566, + "grad_norm": 25.196563720703125, + "learning_rate": 8.870000000000001e-06, + "loss": 0.7222, + "step": 5660 + }, + { + "epoch": 0.567, + "grad_norm": 14.649309158325195, + "learning_rate": 8.868e-06, + "loss": 0.791, + "step": 5670 + }, + { + "epoch": 0.568, + "grad_norm": 23.247085571289062, + "learning_rate": 8.866000000000001e-06, + "loss": 0.6081, + "step": 5680 + }, + { + "epoch": 0.569, + "grad_norm": 23.32643699645996, + "learning_rate": 8.864e-06, + "loss": 0.605, + "step": 5690 + }, + { + "epoch": 0.57, + "grad_norm": 32.223018646240234, + "learning_rate": 8.862000000000001e-06, + "loss": 0.7627, + "step": 5700 + }, + { + "epoch": 0.571, + "grad_norm": 19.816926956176758, + "learning_rate": 8.860000000000002e-06, + "loss": 0.8573, + "step": 5710 + }, + { + "epoch": 0.572, + "grad_norm": 31.972070693969727, + "learning_rate": 8.858e-06, + "loss": 0.7598, + "step": 5720 + }, + { + "epoch": 0.573, + "grad_norm": 28.69130516052246, + "learning_rate": 8.856000000000001e-06, + "loss": 0.7115, + "step": 5730 + }, + { + "epoch": 0.574, + "grad_norm": 35.84492111206055, + "learning_rate": 8.854e-06, + "loss": 0.5372, + "step": 5740 + }, + { + "epoch": 0.575, + "grad_norm": 6.977468967437744, + "learning_rate": 8.852000000000001e-06, + "loss": 0.5373, + "step": 5750 + }, + { + "epoch": 0.576, + "grad_norm": 51.22468566894531, + "learning_rate": 8.85e-06, + "loss": 0.6888, + "step": 5760 + }, + { + "epoch": 0.577, + "grad_norm": 17.154935836791992, + "learning_rate": 8.848e-06, + "loss": 0.3186, + "step": 5770 + }, + { + "epoch": 0.578, + "grad_norm": 30.214448928833008, + "learning_rate": 8.846000000000001e-06, + "loss": 0.6309, + "step": 5780 + }, + { + "epoch": 0.579, + "grad_norm": 54.17653274536133, + "learning_rate": 8.844e-06, + "loss": 0.7933, + "step": 5790 + }, + { + "epoch": 0.58, + "grad_norm": 45.278411865234375, + "learning_rate": 8.842000000000001e-06, + "loss": 0.709, + "step": 5800 + }, + { + "epoch": 0.581, + "grad_norm": 5.875872611999512, + "learning_rate": 8.84e-06, + "loss": 0.4124, + "step": 5810 + }, + { + "epoch": 0.582, + "grad_norm": 54.30577087402344, + "learning_rate": 8.838e-06, + "loss": 0.6046, + "step": 5820 + }, + { + "epoch": 0.583, + "grad_norm": 28.242185592651367, + "learning_rate": 8.836000000000001e-06, + "loss": 0.8088, + "step": 5830 + }, + { + "epoch": 0.584, + "grad_norm": 17.646381378173828, + "learning_rate": 8.834e-06, + "loss": 0.6872, + "step": 5840 + }, + { + "epoch": 0.585, + "grad_norm": 29.147197723388672, + "learning_rate": 8.832000000000001e-06, + "loss": 0.5049, + "step": 5850 + }, + { + "epoch": 0.586, + "grad_norm": 41.977230072021484, + "learning_rate": 8.83e-06, + "loss": 0.8069, + "step": 5860 + }, + { + "epoch": 0.587, + "grad_norm": 17.308279037475586, + "learning_rate": 8.828000000000001e-06, + "loss": 0.5329, + "step": 5870 + }, + { + "epoch": 0.588, + "grad_norm": 59.8780632019043, + "learning_rate": 8.826000000000002e-06, + "loss": 0.8051, + "step": 5880 + }, + { + "epoch": 0.589, + "grad_norm": 72.70011138916016, + "learning_rate": 8.824e-06, + "loss": 1.0038, + "step": 5890 + }, + { + "epoch": 0.59, + "grad_norm": 17.008209228515625, + "learning_rate": 8.822000000000001e-06, + "loss": 0.442, + "step": 5900 + }, + { + "epoch": 0.591, + "grad_norm": 45.6512565612793, + "learning_rate": 8.82e-06, + "loss": 0.4802, + "step": 5910 + }, + { + "epoch": 0.592, + "grad_norm": 9.083815574645996, + "learning_rate": 8.818000000000001e-06, + "loss": 0.5706, + "step": 5920 + }, + { + "epoch": 0.593, + "grad_norm": 31.648679733276367, + "learning_rate": 8.816000000000002e-06, + "loss": 0.5389, + "step": 5930 + }, + { + "epoch": 0.594, + "grad_norm": 47.933197021484375, + "learning_rate": 8.814e-06, + "loss": 0.6758, + "step": 5940 + }, + { + "epoch": 0.595, + "grad_norm": 44.82785415649414, + "learning_rate": 8.812000000000001e-06, + "loss": 0.6103, + "step": 5950 + }, + { + "epoch": 0.596, + "grad_norm": 15.51645565032959, + "learning_rate": 8.81e-06, + "loss": 0.7726, + "step": 5960 + }, + { + "epoch": 0.597, + "grad_norm": 31.250015258789062, + "learning_rate": 8.808000000000001e-06, + "loss": 0.6967, + "step": 5970 + }, + { + "epoch": 0.598, + "grad_norm": 39.70091247558594, + "learning_rate": 8.806000000000002e-06, + "loss": 0.9665, + "step": 5980 + }, + { + "epoch": 0.599, + "grad_norm": 17.304738998413086, + "learning_rate": 8.804e-06, + "loss": 0.6742, + "step": 5990 + }, + { + "epoch": 0.6, + "grad_norm": 51.37207794189453, + "learning_rate": 8.802e-06, + "loss": 0.767, + "step": 6000 + }, + { + "epoch": 0.601, + "grad_norm": 21.40468978881836, + "learning_rate": 8.8e-06, + "loss": 0.6364, + "step": 6010 + }, + { + "epoch": 0.602, + "grad_norm": 23.359695434570312, + "learning_rate": 8.798000000000001e-06, + "loss": 0.6353, + "step": 6020 + }, + { + "epoch": 0.603, + "grad_norm": 25.836505889892578, + "learning_rate": 8.796000000000002e-06, + "loss": 0.8381, + "step": 6030 + }, + { + "epoch": 0.604, + "grad_norm": 37.61728286743164, + "learning_rate": 8.794e-06, + "loss": 0.6782, + "step": 6040 + }, + { + "epoch": 0.605, + "grad_norm": 27.693248748779297, + "learning_rate": 8.792e-06, + "loss": 0.7325, + "step": 6050 + }, + { + "epoch": 0.606, + "grad_norm": 23.111059188842773, + "learning_rate": 8.79e-06, + "loss": 0.8304, + "step": 6060 + }, + { + "epoch": 0.607, + "grad_norm": 34.3321647644043, + "learning_rate": 8.788000000000001e-06, + "loss": 0.5949, + "step": 6070 + }, + { + "epoch": 0.608, + "grad_norm": 43.452415466308594, + "learning_rate": 8.786000000000002e-06, + "loss": 0.669, + "step": 6080 + }, + { + "epoch": 0.609, + "grad_norm": 12.943218231201172, + "learning_rate": 8.784000000000001e-06, + "loss": 0.6943, + "step": 6090 + }, + { + "epoch": 0.61, + "grad_norm": 11.950276374816895, + "learning_rate": 8.782e-06, + "loss": 0.5902, + "step": 6100 + }, + { + "epoch": 0.611, + "grad_norm": 9.990435600280762, + "learning_rate": 8.78e-06, + "loss": 0.5351, + "step": 6110 + }, + { + "epoch": 0.612, + "grad_norm": 33.92399215698242, + "learning_rate": 8.778000000000001e-06, + "loss": 0.858, + "step": 6120 + }, + { + "epoch": 0.613, + "grad_norm": 29.9383544921875, + "learning_rate": 8.776e-06, + "loss": 0.7106, + "step": 6130 + }, + { + "epoch": 0.614, + "grad_norm": 25.9768123626709, + "learning_rate": 8.774000000000001e-06, + "loss": 0.7565, + "step": 6140 + }, + { + "epoch": 0.615, + "grad_norm": 14.911945343017578, + "learning_rate": 8.772e-06, + "loss": 0.6397, + "step": 6150 + }, + { + "epoch": 0.616, + "grad_norm": 42.49845886230469, + "learning_rate": 8.77e-06, + "loss": 0.5812, + "step": 6160 + }, + { + "epoch": 0.617, + "grad_norm": 24.447677612304688, + "learning_rate": 8.768000000000001e-06, + "loss": 0.6996, + "step": 6170 + }, + { + "epoch": 0.618, + "grad_norm": 22.37032127380371, + "learning_rate": 8.766e-06, + "loss": 0.7133, + "step": 6180 + }, + { + "epoch": 0.619, + "grad_norm": 15.080531120300293, + "learning_rate": 8.764e-06, + "loss": 0.7305, + "step": 6190 + }, + { + "epoch": 0.62, + "grad_norm": 7.373855113983154, + "learning_rate": 8.762e-06, + "loss": 0.7308, + "step": 6200 + }, + { + "epoch": 0.621, + "grad_norm": 31.23784065246582, + "learning_rate": 8.76e-06, + "loss": 0.7615, + "step": 6210 + }, + { + "epoch": 0.622, + "grad_norm": 38.272438049316406, + "learning_rate": 8.758000000000002e-06, + "loss": 0.6086, + "step": 6220 + }, + { + "epoch": 0.623, + "grad_norm": 23.415599822998047, + "learning_rate": 8.756e-06, + "loss": 0.6437, + "step": 6230 + }, + { + "epoch": 0.624, + "grad_norm": 27.43158531188965, + "learning_rate": 8.754e-06, + "loss": 0.7331, + "step": 6240 + }, + { + "epoch": 0.625, + "grad_norm": 20.39682388305664, + "learning_rate": 8.752e-06, + "loss": 0.5126, + "step": 6250 + }, + { + "epoch": 0.626, + "grad_norm": 55.00613784790039, + "learning_rate": 8.750000000000001e-06, + "loss": 0.7794, + "step": 6260 + }, + { + "epoch": 0.627, + "grad_norm": 38.01939010620117, + "learning_rate": 8.748000000000002e-06, + "loss": 0.617, + "step": 6270 + }, + { + "epoch": 0.628, + "grad_norm": 21.09652328491211, + "learning_rate": 8.746e-06, + "loss": 0.742, + "step": 6280 + }, + { + "epoch": 0.629, + "grad_norm": 14.96897029876709, + "learning_rate": 8.744e-06, + "loss": 0.3911, + "step": 6290 + }, + { + "epoch": 0.63, + "grad_norm": 31.505590438842773, + "learning_rate": 8.742e-06, + "loss": 1.0415, + "step": 6300 + }, + { + "epoch": 0.631, + "grad_norm": 36.91202926635742, + "learning_rate": 8.740000000000001e-06, + "loss": 0.7963, + "step": 6310 + }, + { + "epoch": 0.632, + "grad_norm": 23.781566619873047, + "learning_rate": 8.738000000000002e-06, + "loss": 0.4309, + "step": 6320 + }, + { + "epoch": 0.633, + "grad_norm": 37.20154571533203, + "learning_rate": 8.736e-06, + "loss": 0.7661, + "step": 6330 + }, + { + "epoch": 0.634, + "grad_norm": 49.68278121948242, + "learning_rate": 8.734e-06, + "loss": 0.9511, + "step": 6340 + }, + { + "epoch": 0.635, + "grad_norm": 36.95859909057617, + "learning_rate": 8.732e-06, + "loss": 0.7286, + "step": 6350 + }, + { + "epoch": 0.636, + "grad_norm": 42.05426025390625, + "learning_rate": 8.730000000000001e-06, + "loss": 0.684, + "step": 6360 + }, + { + "epoch": 0.637, + "grad_norm": 55.31623840332031, + "learning_rate": 8.728e-06, + "loss": 1.0039, + "step": 6370 + }, + { + "epoch": 0.638, + "grad_norm": 37.254356384277344, + "learning_rate": 8.726e-06, + "loss": 0.5336, + "step": 6380 + }, + { + "epoch": 0.639, + "grad_norm": 9.793030738830566, + "learning_rate": 8.724e-06, + "loss": 0.6437, + "step": 6390 + }, + { + "epoch": 0.64, + "grad_norm": 41.21820831298828, + "learning_rate": 8.722e-06, + "loss": 0.6156, + "step": 6400 + }, + { + "epoch": 0.641, + "grad_norm": 16.490699768066406, + "learning_rate": 8.720000000000001e-06, + "loss": 0.6881, + "step": 6410 + }, + { + "epoch": 0.642, + "grad_norm": 45.798728942871094, + "learning_rate": 8.718e-06, + "loss": 0.7894, + "step": 6420 + }, + { + "epoch": 0.643, + "grad_norm": 36.68819808959961, + "learning_rate": 8.716000000000001e-06, + "loss": 0.771, + "step": 6430 + }, + { + "epoch": 0.644, + "grad_norm": 9.799590110778809, + "learning_rate": 8.714e-06, + "loss": 0.6979, + "step": 6440 + }, + { + "epoch": 0.645, + "grad_norm": 24.45409393310547, + "learning_rate": 8.712e-06, + "loss": 0.65, + "step": 6450 + }, + { + "epoch": 0.646, + "grad_norm": 17.76416778564453, + "learning_rate": 8.710000000000001e-06, + "loss": 0.8058, + "step": 6460 + }, + { + "epoch": 0.647, + "grad_norm": 16.39788055419922, + "learning_rate": 8.708e-06, + "loss": 0.6635, + "step": 6470 + }, + { + "epoch": 0.648, + "grad_norm": 18.585445404052734, + "learning_rate": 8.706000000000001e-06, + "loss": 0.7145, + "step": 6480 + }, + { + "epoch": 0.649, + "grad_norm": 31.643970489501953, + "learning_rate": 8.704e-06, + "loss": 0.5295, + "step": 6490 + }, + { + "epoch": 0.65, + "grad_norm": 14.48748779296875, + "learning_rate": 8.702e-06, + "loss": 0.5848, + "step": 6500 + }, + { + "epoch": 0.651, + "grad_norm": 36.65018081665039, + "learning_rate": 8.700000000000001e-06, + "loss": 0.7816, + "step": 6510 + }, + { + "epoch": 0.652, + "grad_norm": 29.191734313964844, + "learning_rate": 8.698e-06, + "loss": 0.6342, + "step": 6520 + }, + { + "epoch": 0.653, + "grad_norm": 29.119672775268555, + "learning_rate": 8.696000000000001e-06, + "loss": 0.6923, + "step": 6530 + }, + { + "epoch": 0.654, + "grad_norm": 33.278594970703125, + "learning_rate": 8.694e-06, + "loss": 0.617, + "step": 6540 + }, + { + "epoch": 0.655, + "grad_norm": 36.83278274536133, + "learning_rate": 8.692e-06, + "loss": 0.7368, + "step": 6550 + }, + { + "epoch": 0.656, + "grad_norm": 25.805545806884766, + "learning_rate": 8.690000000000002e-06, + "loss": 0.7087, + "step": 6560 + }, + { + "epoch": 0.657, + "grad_norm": 22.268075942993164, + "learning_rate": 8.688e-06, + "loss": 0.6209, + "step": 6570 + }, + { + "epoch": 0.658, + "grad_norm": 12.060050964355469, + "learning_rate": 8.686000000000001e-06, + "loss": 0.4913, + "step": 6580 + }, + { + "epoch": 0.659, + "grad_norm": 13.607654571533203, + "learning_rate": 8.684e-06, + "loss": 0.7358, + "step": 6590 + }, + { + "epoch": 0.66, + "grad_norm": 38.86949920654297, + "learning_rate": 8.682000000000001e-06, + "loss": 0.7662, + "step": 6600 + }, + { + "epoch": 0.661, + "grad_norm": 22.411563873291016, + "learning_rate": 8.68e-06, + "loss": 0.5944, + "step": 6610 + }, + { + "epoch": 0.662, + "grad_norm": 27.0049991607666, + "learning_rate": 8.678e-06, + "loss": 0.6924, + "step": 6620 + }, + { + "epoch": 0.663, + "grad_norm": 16.259170532226562, + "learning_rate": 8.676000000000001e-06, + "loss": 0.6007, + "step": 6630 + }, + { + "epoch": 0.664, + "grad_norm": 21.972728729248047, + "learning_rate": 8.674e-06, + "loss": 0.5624, + "step": 6640 + }, + { + "epoch": 0.665, + "grad_norm": 24.113021850585938, + "learning_rate": 8.672000000000001e-06, + "loss": 0.7037, + "step": 6650 + }, + { + "epoch": 0.666, + "grad_norm": 24.1397705078125, + "learning_rate": 8.67e-06, + "loss": 0.6052, + "step": 6660 + }, + { + "epoch": 0.667, + "grad_norm": 23.034231185913086, + "learning_rate": 8.668e-06, + "loss": 0.653, + "step": 6670 + }, + { + "epoch": 0.668, + "grad_norm": 22.749948501586914, + "learning_rate": 8.666000000000001e-06, + "loss": 0.5321, + "step": 6680 + }, + { + "epoch": 0.669, + "grad_norm": 37.435977935791016, + "learning_rate": 8.664e-06, + "loss": 0.8462, + "step": 6690 + }, + { + "epoch": 0.67, + "grad_norm": 19.840953826904297, + "learning_rate": 8.662000000000001e-06, + "loss": 0.7411, + "step": 6700 + }, + { + "epoch": 0.671, + "grad_norm": 28.068639755249023, + "learning_rate": 8.66e-06, + "loss": 0.513, + "step": 6710 + }, + { + "epoch": 0.672, + "grad_norm": 13.38555908203125, + "learning_rate": 8.658e-06, + "loss": 0.7307, + "step": 6720 + }, + { + "epoch": 0.673, + "grad_norm": 23.34636878967285, + "learning_rate": 8.656000000000001e-06, + "loss": 0.9183, + "step": 6730 + }, + { + "epoch": 0.674, + "grad_norm": 23.878576278686523, + "learning_rate": 8.654e-06, + "loss": 0.5841, + "step": 6740 + }, + { + "epoch": 0.675, + "grad_norm": 31.957456588745117, + "learning_rate": 8.652000000000001e-06, + "loss": 0.8774, + "step": 6750 + }, + { + "epoch": 0.676, + "grad_norm": 41.79953384399414, + "learning_rate": 8.65e-06, + "loss": 0.9604, + "step": 6760 + }, + { + "epoch": 0.677, + "grad_norm": 19.412567138671875, + "learning_rate": 8.648000000000001e-06, + "loss": 0.6639, + "step": 6770 + }, + { + "epoch": 0.678, + "grad_norm": 23.539968490600586, + "learning_rate": 8.646000000000002e-06, + "loss": 0.5529, + "step": 6780 + }, + { + "epoch": 0.679, + "grad_norm": 16.99551773071289, + "learning_rate": 8.644e-06, + "loss": 0.7736, + "step": 6790 + }, + { + "epoch": 0.68, + "grad_norm": 33.947628021240234, + "learning_rate": 8.642e-06, + "loss": 0.7398, + "step": 6800 + }, + { + "epoch": 0.681, + "grad_norm": 31.256986618041992, + "learning_rate": 8.64e-06, + "loss": 0.7431, + "step": 6810 + }, + { + "epoch": 0.682, + "grad_norm": 34.16581344604492, + "learning_rate": 8.638000000000001e-06, + "loss": 0.7627, + "step": 6820 + }, + { + "epoch": 0.683, + "grad_norm": 28.380054473876953, + "learning_rate": 8.636000000000002e-06, + "loss": 0.5163, + "step": 6830 + }, + { + "epoch": 0.684, + "grad_norm": 17.26369285583496, + "learning_rate": 8.634e-06, + "loss": 0.5183, + "step": 6840 + }, + { + "epoch": 0.685, + "grad_norm": 9.074869155883789, + "learning_rate": 8.632e-06, + "loss": 0.754, + "step": 6850 + }, + { + "epoch": 0.686, + "grad_norm": 12.758150100708008, + "learning_rate": 8.63e-06, + "loss": 0.6343, + "step": 6860 + }, + { + "epoch": 0.687, + "grad_norm": 9.85903263092041, + "learning_rate": 8.628000000000001e-06, + "loss": 0.7374, + "step": 6870 + }, + { + "epoch": 0.688, + "grad_norm": 46.6954231262207, + "learning_rate": 8.626000000000002e-06, + "loss": 0.6382, + "step": 6880 + }, + { + "epoch": 0.689, + "grad_norm": 26.677377700805664, + "learning_rate": 8.624e-06, + "loss": 1.0401, + "step": 6890 + }, + { + "epoch": 0.69, + "grad_norm": 26.68614387512207, + "learning_rate": 8.622e-06, + "loss": 0.6229, + "step": 6900 + }, + { + "epoch": 0.691, + "grad_norm": 10.952954292297363, + "learning_rate": 8.62e-06, + "loss": 0.8177, + "step": 6910 + }, + { + "epoch": 0.692, + "grad_norm": 32.28446960449219, + "learning_rate": 8.618000000000001e-06, + "loss": 0.6967, + "step": 6920 + }, + { + "epoch": 0.693, + "grad_norm": 32.433128356933594, + "learning_rate": 8.616000000000002e-06, + "loss": 0.7636, + "step": 6930 + }, + { + "epoch": 0.694, + "grad_norm": 39.171165466308594, + "learning_rate": 8.614000000000001e-06, + "loss": 0.5737, + "step": 6940 + }, + { + "epoch": 0.695, + "grad_norm": 32.63961410522461, + "learning_rate": 8.612e-06, + "loss": 0.7406, + "step": 6950 + }, + { + "epoch": 0.696, + "grad_norm": 30.88587188720703, + "learning_rate": 8.61e-06, + "loss": 0.9034, + "step": 6960 + }, + { + "epoch": 0.697, + "grad_norm": 29.65443992614746, + "learning_rate": 8.608000000000001e-06, + "loss": 0.4538, + "step": 6970 + }, + { + "epoch": 0.698, + "grad_norm": 35.46700668334961, + "learning_rate": 8.606e-06, + "loss": 0.9041, + "step": 6980 + }, + { + "epoch": 0.699, + "grad_norm": 12.977072715759277, + "learning_rate": 8.604000000000001e-06, + "loss": 0.6177, + "step": 6990 + }, + { + "epoch": 0.7, + "grad_norm": 18.877992630004883, + "learning_rate": 8.602e-06, + "loss": 0.5468, + "step": 7000 + }, + { + "epoch": 0.701, + "grad_norm": 54.49048614501953, + "learning_rate": 8.6e-06, + "loss": 0.7231, + "step": 7010 + }, + { + "epoch": 0.702, + "grad_norm": 51.610084533691406, + "learning_rate": 8.598000000000001e-06, + "loss": 0.7289, + "step": 7020 + }, + { + "epoch": 0.703, + "grad_norm": 38.165809631347656, + "learning_rate": 8.596e-06, + "loss": 0.9512, + "step": 7030 + }, + { + "epoch": 0.704, + "grad_norm": 26.098329544067383, + "learning_rate": 8.594000000000001e-06, + "loss": 0.7228, + "step": 7040 + }, + { + "epoch": 0.705, + "grad_norm": 46.73612976074219, + "learning_rate": 8.592e-06, + "loss": 0.7202, + "step": 7050 + }, + { + "epoch": 0.706, + "grad_norm": 40.09733200073242, + "learning_rate": 8.59e-06, + "loss": 0.4938, + "step": 7060 + }, + { + "epoch": 0.707, + "grad_norm": 43.058841705322266, + "learning_rate": 8.588000000000001e-06, + "loss": 0.7683, + "step": 7070 + }, + { + "epoch": 0.708, + "grad_norm": 50.56329345703125, + "learning_rate": 8.586e-06, + "loss": 1.064, + "step": 7080 + }, + { + "epoch": 0.709, + "grad_norm": 39.37093734741211, + "learning_rate": 8.584000000000001e-06, + "loss": 0.8581, + "step": 7090 + }, + { + "epoch": 0.71, + "grad_norm": 9.572067260742188, + "learning_rate": 8.582e-06, + "loss": 0.5938, + "step": 7100 + }, + { + "epoch": 0.711, + "grad_norm": 16.67902183532715, + "learning_rate": 8.580000000000001e-06, + "loss": 0.5792, + "step": 7110 + }, + { + "epoch": 0.712, + "grad_norm": 21.606800079345703, + "learning_rate": 8.578000000000002e-06, + "loss": 0.8628, + "step": 7120 + }, + { + "epoch": 0.713, + "grad_norm": 29.96168327331543, + "learning_rate": 8.576e-06, + "loss": 0.7258, + "step": 7130 + }, + { + "epoch": 0.714, + "grad_norm": 17.330432891845703, + "learning_rate": 8.574000000000001e-06, + "loss": 0.769, + "step": 7140 + }, + { + "epoch": 0.715, + "grad_norm": 43.22488021850586, + "learning_rate": 8.572e-06, + "loss": 0.7662, + "step": 7150 + }, + { + "epoch": 0.716, + "grad_norm": 24.21693992614746, + "learning_rate": 8.570000000000001e-06, + "loss": 0.8548, + "step": 7160 + }, + { + "epoch": 0.717, + "grad_norm": 14.228105545043945, + "learning_rate": 8.568e-06, + "loss": 0.7917, + "step": 7170 + }, + { + "epoch": 0.718, + "grad_norm": 34.96318817138672, + "learning_rate": 8.566e-06, + "loss": 1.0241, + "step": 7180 + }, + { + "epoch": 0.719, + "grad_norm": 23.818344116210938, + "learning_rate": 8.564000000000001e-06, + "loss": 0.8201, + "step": 7190 + }, + { + "epoch": 0.72, + "grad_norm": 15.386028289794922, + "learning_rate": 8.562e-06, + "loss": 0.9085, + "step": 7200 + }, + { + "epoch": 0.721, + "grad_norm": 11.841687202453613, + "learning_rate": 8.560000000000001e-06, + "loss": 0.713, + "step": 7210 + }, + { + "epoch": 0.722, + "grad_norm": 6.8726725578308105, + "learning_rate": 8.558e-06, + "loss": 0.6639, + "step": 7220 + }, + { + "epoch": 0.723, + "grad_norm": 31.94175910949707, + "learning_rate": 8.556e-06, + "loss": 0.5847, + "step": 7230 + }, + { + "epoch": 0.724, + "grad_norm": 17.904970169067383, + "learning_rate": 8.554000000000001e-06, + "loss": 0.6752, + "step": 7240 + }, + { + "epoch": 0.725, + "grad_norm": 10.443678855895996, + "learning_rate": 8.552e-06, + "loss": 0.9382, + "step": 7250 + }, + { + "epoch": 0.726, + "grad_norm": 21.056140899658203, + "learning_rate": 8.550000000000001e-06, + "loss": 0.7227, + "step": 7260 + }, + { + "epoch": 0.727, + "grad_norm": 36.38212585449219, + "learning_rate": 8.548e-06, + "loss": 0.563, + "step": 7270 + }, + { + "epoch": 0.728, + "grad_norm": 29.130191802978516, + "learning_rate": 8.546000000000001e-06, + "loss": 0.5688, + "step": 7280 + }, + { + "epoch": 0.729, + "grad_norm": 19.342304229736328, + "learning_rate": 8.544000000000002e-06, + "loss": 0.6482, + "step": 7290 + }, + { + "epoch": 0.73, + "grad_norm": 32.37727737426758, + "learning_rate": 8.542e-06, + "loss": 0.6761, + "step": 7300 + }, + { + "epoch": 0.731, + "grad_norm": 33.89266586303711, + "learning_rate": 8.540000000000001e-06, + "loss": 0.7243, + "step": 7310 + }, + { + "epoch": 0.732, + "grad_norm": 26.32709312438965, + "learning_rate": 8.538e-06, + "loss": 0.754, + "step": 7320 + }, + { + "epoch": 0.733, + "grad_norm": 18.493139266967773, + "learning_rate": 8.536000000000001e-06, + "loss": 0.3922, + "step": 7330 + }, + { + "epoch": 0.734, + "grad_norm": 17.698272705078125, + "learning_rate": 8.534000000000002e-06, + "loss": 0.8431, + "step": 7340 + }, + { + "epoch": 0.735, + "grad_norm": 56.601158142089844, + "learning_rate": 8.532e-06, + "loss": 0.905, + "step": 7350 + }, + { + "epoch": 0.736, + "grad_norm": 35.621917724609375, + "learning_rate": 8.530000000000001e-06, + "loss": 0.9357, + "step": 7360 + }, + { + "epoch": 0.737, + "grad_norm": 30.305463790893555, + "learning_rate": 8.528e-06, + "loss": 0.4989, + "step": 7370 + }, + { + "epoch": 0.738, + "grad_norm": 35.86700439453125, + "learning_rate": 8.526000000000001e-06, + "loss": 0.6451, + "step": 7380 + }, + { + "epoch": 0.739, + "grad_norm": 35.63044357299805, + "learning_rate": 8.524000000000002e-06, + "loss": 0.6942, + "step": 7390 + }, + { + "epoch": 0.74, + "grad_norm": 21.295764923095703, + "learning_rate": 8.522e-06, + "loss": 0.8271, + "step": 7400 + }, + { + "epoch": 0.741, + "grad_norm": 21.100643157958984, + "learning_rate": 8.52e-06, + "loss": 0.5248, + "step": 7410 + }, + { + "epoch": 0.742, + "grad_norm": 27.021936416625977, + "learning_rate": 8.518e-06, + "loss": 0.7567, + "step": 7420 + }, + { + "epoch": 0.743, + "grad_norm": 38.12123107910156, + "learning_rate": 8.516000000000001e-06, + "loss": 0.7024, + "step": 7430 + }, + { + "epoch": 0.744, + "grad_norm": 11.784056663513184, + "learning_rate": 8.514000000000002e-06, + "loss": 0.7275, + "step": 7440 + }, + { + "epoch": 0.745, + "grad_norm": 25.664180755615234, + "learning_rate": 8.512e-06, + "loss": 0.8583, + "step": 7450 + }, + { + "epoch": 0.746, + "grad_norm": 23.51327133178711, + "learning_rate": 8.51e-06, + "loss": 0.9263, + "step": 7460 + }, + { + "epoch": 0.747, + "grad_norm": 7.7203593254089355, + "learning_rate": 8.508e-06, + "loss": 0.6919, + "step": 7470 + }, + { + "epoch": 0.748, + "grad_norm": 24.703340530395508, + "learning_rate": 8.506000000000001e-06, + "loss": 0.7369, + "step": 7480 + }, + { + "epoch": 0.749, + "grad_norm": 31.4427490234375, + "learning_rate": 8.504000000000002e-06, + "loss": 0.7534, + "step": 7490 + }, + { + "epoch": 0.75, + "grad_norm": 15.99972915649414, + "learning_rate": 8.502000000000001e-06, + "loss": 0.7025, + "step": 7500 + }, + { + "epoch": 0.751, + "grad_norm": 35.3527717590332, + "learning_rate": 8.5e-06, + "loss": 0.6326, + "step": 7510 + }, + { + "epoch": 0.752, + "grad_norm": 14.60549259185791, + "learning_rate": 8.498e-06, + "loss": 0.6411, + "step": 7520 + }, + { + "epoch": 0.753, + "grad_norm": 11.679787635803223, + "learning_rate": 8.496000000000001e-06, + "loss": 0.7096, + "step": 7530 + }, + { + "epoch": 0.754, + "grad_norm": 16.99401092529297, + "learning_rate": 8.494e-06, + "loss": 0.6471, + "step": 7540 + }, + { + "epoch": 0.755, + "grad_norm": 31.19361114501953, + "learning_rate": 8.492000000000001e-06, + "loss": 0.7787, + "step": 7550 + }, + { + "epoch": 0.756, + "grad_norm": 20.032594680786133, + "learning_rate": 8.49e-06, + "loss": 0.7043, + "step": 7560 + }, + { + "epoch": 0.757, + "grad_norm": 34.8859748840332, + "learning_rate": 8.488e-06, + "loss": 0.6349, + "step": 7570 + }, + { + "epoch": 0.758, + "grad_norm": 33.36531066894531, + "learning_rate": 8.486000000000001e-06, + "loss": 0.6744, + "step": 7580 + }, + { + "epoch": 0.759, + "grad_norm": 8.187764167785645, + "learning_rate": 8.484e-06, + "loss": 0.9517, + "step": 7590 + }, + { + "epoch": 0.76, + "grad_norm": 6.946749687194824, + "learning_rate": 8.482e-06, + "loss": 0.6614, + "step": 7600 + }, + { + "epoch": 0.761, + "grad_norm": 27.84326934814453, + "learning_rate": 8.48e-06, + "loss": 0.6593, + "step": 7610 + }, + { + "epoch": 0.762, + "grad_norm": 30.7228946685791, + "learning_rate": 8.478e-06, + "loss": 0.9224, + "step": 7620 + }, + { + "epoch": 0.763, + "grad_norm": 30.333284378051758, + "learning_rate": 8.476000000000002e-06, + "loss": 0.7381, + "step": 7630 + }, + { + "epoch": 0.764, + "grad_norm": 20.903934478759766, + "learning_rate": 8.474e-06, + "loss": 0.8126, + "step": 7640 + }, + { + "epoch": 0.765, + "grad_norm": 19.930513381958008, + "learning_rate": 8.472e-06, + "loss": 0.4864, + "step": 7650 + }, + { + "epoch": 0.766, + "grad_norm": 28.226978302001953, + "learning_rate": 8.47e-06, + "loss": 0.7474, + "step": 7660 + }, + { + "epoch": 0.767, + "grad_norm": 35.864070892333984, + "learning_rate": 8.468000000000001e-06, + "loss": 0.7979, + "step": 7670 + }, + { + "epoch": 0.768, + "grad_norm": 18.937253952026367, + "learning_rate": 8.466000000000002e-06, + "loss": 0.6326, + "step": 7680 + }, + { + "epoch": 0.769, + "grad_norm": 33.21320724487305, + "learning_rate": 8.464e-06, + "loss": 0.8367, + "step": 7690 + }, + { + "epoch": 0.77, + "grad_norm": 17.86391258239746, + "learning_rate": 8.462e-06, + "loss": 0.7428, + "step": 7700 + }, + { + "epoch": 0.771, + "grad_norm": 39.599666595458984, + "learning_rate": 8.46e-06, + "loss": 0.6803, + "step": 7710 + }, + { + "epoch": 0.772, + "grad_norm": 31.304195404052734, + "learning_rate": 8.458000000000001e-06, + "loss": 0.6783, + "step": 7720 + }, + { + "epoch": 0.773, + "grad_norm": 21.93630027770996, + "learning_rate": 8.456000000000002e-06, + "loss": 0.4458, + "step": 7730 + }, + { + "epoch": 0.774, + "grad_norm": 10.390119552612305, + "learning_rate": 8.454e-06, + "loss": 0.5747, + "step": 7740 + }, + { + "epoch": 0.775, + "grad_norm": 29.907333374023438, + "learning_rate": 8.452e-06, + "loss": 0.6706, + "step": 7750 + }, + { + "epoch": 0.776, + "grad_norm": 14.718128204345703, + "learning_rate": 8.45e-06, + "loss": 0.8387, + "step": 7760 + }, + { + "epoch": 0.777, + "grad_norm": 32.11901092529297, + "learning_rate": 8.448000000000001e-06, + "loss": 0.7932, + "step": 7770 + }, + { + "epoch": 0.778, + "grad_norm": 18.3746280670166, + "learning_rate": 8.446e-06, + "loss": 0.3266, + "step": 7780 + }, + { + "epoch": 0.779, + "grad_norm": 17.5025634765625, + "learning_rate": 8.444e-06, + "loss": 0.5833, + "step": 7790 + }, + { + "epoch": 0.78, + "grad_norm": 44.1501579284668, + "learning_rate": 8.442e-06, + "loss": 0.9204, + "step": 7800 + }, + { + "epoch": 0.781, + "grad_norm": 4.0904107093811035, + "learning_rate": 8.44e-06, + "loss": 0.7701, + "step": 7810 + }, + { + "epoch": 0.782, + "grad_norm": 14.378214836120605, + "learning_rate": 8.438000000000001e-06, + "loss": 0.6579, + "step": 7820 + }, + { + "epoch": 0.783, + "grad_norm": 31.078834533691406, + "learning_rate": 8.436e-06, + "loss": 0.7474, + "step": 7830 + }, + { + "epoch": 0.784, + "grad_norm": 32.57761764526367, + "learning_rate": 8.434000000000001e-06, + "loss": 0.6718, + "step": 7840 + }, + { + "epoch": 0.785, + "grad_norm": 29.638538360595703, + "learning_rate": 8.432e-06, + "loss": 0.4598, + "step": 7850 + }, + { + "epoch": 0.786, + "grad_norm": 15.80543041229248, + "learning_rate": 8.43e-06, + "loss": 0.591, + "step": 7860 + }, + { + "epoch": 0.787, + "grad_norm": 22.110992431640625, + "learning_rate": 8.428000000000001e-06, + "loss": 0.8063, + "step": 7870 + }, + { + "epoch": 0.788, + "grad_norm": 20.21088218688965, + "learning_rate": 8.426e-06, + "loss": 0.7009, + "step": 7880 + }, + { + "epoch": 0.789, + "grad_norm": 29.424182891845703, + "learning_rate": 8.424000000000001e-06, + "loss": 0.6799, + "step": 7890 + }, + { + "epoch": 0.79, + "grad_norm": 22.46828842163086, + "learning_rate": 8.422e-06, + "loss": 0.6094, + "step": 7900 + }, + { + "epoch": 0.791, + "grad_norm": 16.11787986755371, + "learning_rate": 8.42e-06, + "loss": 0.4994, + "step": 7910 + }, + { + "epoch": 0.792, + "grad_norm": 13.472208023071289, + "learning_rate": 8.418000000000001e-06, + "loss": 0.6699, + "step": 7920 + }, + { + "epoch": 0.793, + "grad_norm": 23.473478317260742, + "learning_rate": 8.416e-06, + "loss": 0.6749, + "step": 7930 + }, + { + "epoch": 0.794, + "grad_norm": 6.8579792976379395, + "learning_rate": 8.414000000000001e-06, + "loss": 0.4462, + "step": 7940 + }, + { + "epoch": 0.795, + "grad_norm": 34.45481491088867, + "learning_rate": 8.412e-06, + "loss": 0.9276, + "step": 7950 + }, + { + "epoch": 0.796, + "grad_norm": 33.5301399230957, + "learning_rate": 8.41e-06, + "loss": 0.6733, + "step": 7960 + }, + { + "epoch": 0.797, + "grad_norm": 57.889190673828125, + "learning_rate": 8.408e-06, + "loss": 0.9723, + "step": 7970 + }, + { + "epoch": 0.798, + "grad_norm": 6.853766441345215, + "learning_rate": 8.406e-06, + "loss": 0.6781, + "step": 7980 + }, + { + "epoch": 0.799, + "grad_norm": 47.17698669433594, + "learning_rate": 8.404000000000001e-06, + "loss": 0.6879, + "step": 7990 + }, + { + "epoch": 0.8, + "grad_norm": 16.459867477416992, + "learning_rate": 8.402e-06, + "loss": 0.7155, + "step": 8000 + }, + { + "epoch": 0.801, + "grad_norm": 17.590360641479492, + "learning_rate": 8.400000000000001e-06, + "loss": 0.7116, + "step": 8010 + }, + { + "epoch": 0.802, + "grad_norm": 20.620521545410156, + "learning_rate": 8.398e-06, + "loss": 0.7005, + "step": 8020 + }, + { + "epoch": 0.803, + "grad_norm": 44.69348907470703, + "learning_rate": 8.396e-06, + "loss": 0.7358, + "step": 8030 + }, + { + "epoch": 0.804, + "grad_norm": 31.715669631958008, + "learning_rate": 8.394000000000001e-06, + "loss": 0.8132, + "step": 8040 + }, + { + "epoch": 0.805, + "grad_norm": 19.612058639526367, + "learning_rate": 8.392e-06, + "loss": 0.5062, + "step": 8050 + }, + { + "epoch": 0.806, + "grad_norm": 24.114898681640625, + "learning_rate": 8.390000000000001e-06, + "loss": 0.9493, + "step": 8060 + }, + { + "epoch": 0.807, + "grad_norm": 32.89696502685547, + "learning_rate": 8.388e-06, + "loss": 0.6696, + "step": 8070 + }, + { + "epoch": 0.808, + "grad_norm": 31.07137680053711, + "learning_rate": 8.386e-06, + "loss": 0.6975, + "step": 8080 + }, + { + "epoch": 0.809, + "grad_norm": 31.295612335205078, + "learning_rate": 8.384000000000001e-06, + "loss": 0.6132, + "step": 8090 + }, + { + "epoch": 0.81, + "grad_norm": 25.72049903869629, + "learning_rate": 8.382e-06, + "loss": 0.4594, + "step": 8100 + }, + { + "epoch": 0.811, + "grad_norm": 33.49985885620117, + "learning_rate": 8.380000000000001e-06, + "loss": 0.7982, + "step": 8110 + }, + { + "epoch": 0.812, + "grad_norm": 26.266956329345703, + "learning_rate": 8.378e-06, + "loss": 0.652, + "step": 8120 + }, + { + "epoch": 0.813, + "grad_norm": 8.741480827331543, + "learning_rate": 8.376e-06, + "loss": 0.6933, + "step": 8130 + }, + { + "epoch": 0.814, + "grad_norm": 60.22322082519531, + "learning_rate": 8.374000000000001e-06, + "loss": 0.9593, + "step": 8140 + }, + { + "epoch": 0.815, + "grad_norm": 20.960670471191406, + "learning_rate": 8.372e-06, + "loss": 0.5567, + "step": 8150 + }, + { + "epoch": 0.816, + "grad_norm": 24.88983726501465, + "learning_rate": 8.370000000000001e-06, + "loss": 0.5051, + "step": 8160 + }, + { + "epoch": 0.817, + "grad_norm": 32.92258834838867, + "learning_rate": 8.368e-06, + "loss": 0.7988, + "step": 8170 + }, + { + "epoch": 0.818, + "grad_norm": 39.90187072753906, + "learning_rate": 8.366000000000001e-06, + "loss": 0.7172, + "step": 8180 + }, + { + "epoch": 0.819, + "grad_norm": 24.0626163482666, + "learning_rate": 8.364000000000002e-06, + "loss": 0.5431, + "step": 8190 + }, + { + "epoch": 0.82, + "grad_norm": 53.17387771606445, + "learning_rate": 8.362e-06, + "loss": 0.8249, + "step": 8200 + }, + { + "epoch": 0.821, + "grad_norm": 12.743687629699707, + "learning_rate": 8.36e-06, + "loss": 0.5468, + "step": 8210 + }, + { + "epoch": 0.822, + "grad_norm": 23.088817596435547, + "learning_rate": 8.358e-06, + "loss": 0.648, + "step": 8220 + }, + { + "epoch": 0.823, + "grad_norm": 22.222497940063477, + "learning_rate": 8.356000000000001e-06, + "loss": 0.5813, + "step": 8230 + }, + { + "epoch": 0.824, + "grad_norm": 22.03820037841797, + "learning_rate": 8.354000000000002e-06, + "loss": 0.8584, + "step": 8240 + }, + { + "epoch": 0.825, + "grad_norm": 7.797426700592041, + "learning_rate": 8.352e-06, + "loss": 0.8026, + "step": 8250 + }, + { + "epoch": 0.826, + "grad_norm": 12.721060752868652, + "learning_rate": 8.35e-06, + "loss": 0.6768, + "step": 8260 + }, + { + "epoch": 0.827, + "grad_norm": 27.198408126831055, + "learning_rate": 8.348e-06, + "loss": 0.6818, + "step": 8270 + }, + { + "epoch": 0.828, + "grad_norm": 22.904878616333008, + "learning_rate": 8.346000000000001e-06, + "loss": 0.9425, + "step": 8280 + }, + { + "epoch": 0.829, + "grad_norm": 22.876659393310547, + "learning_rate": 8.344000000000002e-06, + "loss": 0.3033, + "step": 8290 + }, + { + "epoch": 0.83, + "grad_norm": 35.75187301635742, + "learning_rate": 8.342e-06, + "loss": 0.9064, + "step": 8300 + }, + { + "epoch": 0.831, + "grad_norm": 26.52242660522461, + "learning_rate": 8.34e-06, + "loss": 0.733, + "step": 8310 + }, + { + "epoch": 0.832, + "grad_norm": 0.9999659061431885, + "learning_rate": 8.338e-06, + "loss": 0.7176, + "step": 8320 + }, + { + "epoch": 0.833, + "grad_norm": 33.7828369140625, + "learning_rate": 8.336000000000001e-06, + "loss": 0.8047, + "step": 8330 + }, + { + "epoch": 0.834, + "grad_norm": 17.01401138305664, + "learning_rate": 8.334e-06, + "loss": 0.7676, + "step": 8340 + }, + { + "epoch": 0.835, + "grad_norm": 24.290319442749023, + "learning_rate": 8.332000000000001e-06, + "loss": 0.774, + "step": 8350 + }, + { + "epoch": 0.836, + "grad_norm": 24.34035301208496, + "learning_rate": 8.33e-06, + "loss": 0.7797, + "step": 8360 + }, + { + "epoch": 0.837, + "grad_norm": 19.819849014282227, + "learning_rate": 8.328e-06, + "loss": 0.6335, + "step": 8370 + }, + { + "epoch": 0.838, + "grad_norm": 33.246543884277344, + "learning_rate": 8.326000000000001e-06, + "loss": 0.6919, + "step": 8380 + }, + { + "epoch": 0.839, + "grad_norm": 23.2971134185791, + "learning_rate": 8.324e-06, + "loss": 0.7181, + "step": 8390 + }, + { + "epoch": 0.84, + "grad_norm": 24.520105361938477, + "learning_rate": 8.322000000000001e-06, + "loss": 0.7179, + "step": 8400 + }, + { + "epoch": 0.841, + "grad_norm": 11.137072563171387, + "learning_rate": 8.32e-06, + "loss": 0.6092, + "step": 8410 + }, + { + "epoch": 0.842, + "grad_norm": 15.37700366973877, + "learning_rate": 8.318e-06, + "loss": 0.6143, + "step": 8420 + }, + { + "epoch": 0.843, + "grad_norm": 19.72783660888672, + "learning_rate": 8.316000000000001e-06, + "loss": 0.7324, + "step": 8430 + }, + { + "epoch": 0.844, + "grad_norm": 7.460726737976074, + "learning_rate": 8.314e-06, + "loss": 0.8602, + "step": 8440 + }, + { + "epoch": 0.845, + "grad_norm": 18.754709243774414, + "learning_rate": 8.312000000000001e-06, + "loss": 0.7999, + "step": 8450 + }, + { + "epoch": 0.846, + "grad_norm": 19.63807487487793, + "learning_rate": 8.31e-06, + "loss": 0.9064, + "step": 8460 + }, + { + "epoch": 0.847, + "grad_norm": 19.692235946655273, + "learning_rate": 8.308e-06, + "loss": 0.5737, + "step": 8470 + }, + { + "epoch": 0.848, + "grad_norm": 53.53981018066406, + "learning_rate": 8.306000000000001e-06, + "loss": 0.7999, + "step": 8480 + }, + { + "epoch": 0.849, + "grad_norm": 20.587724685668945, + "learning_rate": 8.304e-06, + "loss": 0.5465, + "step": 8490 + }, + { + "epoch": 0.85, + "grad_norm": 35.91649627685547, + "learning_rate": 8.302000000000001e-06, + "loss": 0.7199, + "step": 8500 + }, + { + "epoch": 0.851, + "grad_norm": 31.062335968017578, + "learning_rate": 8.3e-06, + "loss": 0.6207, + "step": 8510 + }, + { + "epoch": 0.852, + "grad_norm": 28.461896896362305, + "learning_rate": 8.298000000000001e-06, + "loss": 0.8531, + "step": 8520 + }, + { + "epoch": 0.853, + "grad_norm": 42.22334671020508, + "learning_rate": 8.296000000000002e-06, + "loss": 0.7788, + "step": 8530 + }, + { + "epoch": 0.854, + "grad_norm": 27.250551223754883, + "learning_rate": 8.294e-06, + "loss": 0.499, + "step": 8540 + }, + { + "epoch": 0.855, + "grad_norm": 26.948123931884766, + "learning_rate": 8.292000000000001e-06, + "loss": 0.7913, + "step": 8550 + }, + { + "epoch": 0.856, + "grad_norm": 17.879419326782227, + "learning_rate": 8.29e-06, + "loss": 0.7423, + "step": 8560 + }, + { + "epoch": 0.857, + "grad_norm": 23.53091812133789, + "learning_rate": 8.288000000000001e-06, + "loss": 0.6809, + "step": 8570 + }, + { + "epoch": 0.858, + "grad_norm": 16.258769989013672, + "learning_rate": 8.286e-06, + "loss": 0.8998, + "step": 8580 + }, + { + "epoch": 0.859, + "grad_norm": 23.789386749267578, + "learning_rate": 8.284e-06, + "loss": 0.6641, + "step": 8590 + }, + { + "epoch": 0.86, + "grad_norm": 28.253061294555664, + "learning_rate": 8.282000000000001e-06, + "loss": 0.8821, + "step": 8600 + }, + { + "epoch": 0.861, + "grad_norm": 25.612363815307617, + "learning_rate": 8.28e-06, + "loss": 0.7803, + "step": 8610 + }, + { + "epoch": 0.862, + "grad_norm": 22.466379165649414, + "learning_rate": 8.278000000000001e-06, + "loss": 0.5856, + "step": 8620 + }, + { + "epoch": 0.863, + "grad_norm": 19.716615676879883, + "learning_rate": 8.276e-06, + "loss": 0.5913, + "step": 8630 + }, + { + "epoch": 0.864, + "grad_norm": 22.004531860351562, + "learning_rate": 8.274e-06, + "loss": 0.819, + "step": 8640 + }, + { + "epoch": 0.865, + "grad_norm": 29.626161575317383, + "learning_rate": 8.272000000000001e-06, + "loss": 0.47, + "step": 8650 + }, + { + "epoch": 0.866, + "grad_norm": 16.204734802246094, + "learning_rate": 8.27e-06, + "loss": 0.4727, + "step": 8660 + }, + { + "epoch": 0.867, + "grad_norm": 37.761837005615234, + "learning_rate": 8.268000000000001e-06, + "loss": 0.6507, + "step": 8670 + }, + { + "epoch": 0.868, + "grad_norm": 40.671695709228516, + "learning_rate": 8.266e-06, + "loss": 0.7335, + "step": 8680 + }, + { + "epoch": 0.869, + "grad_norm": 41.19644546508789, + "learning_rate": 8.264e-06, + "loss": 0.9103, + "step": 8690 + }, + { + "epoch": 0.87, + "grad_norm": 33.284019470214844, + "learning_rate": 8.262000000000002e-06, + "loss": 0.864, + "step": 8700 + }, + { + "epoch": 0.871, + "grad_norm": 20.35594940185547, + "learning_rate": 8.26e-06, + "loss": 0.7424, + "step": 8710 + }, + { + "epoch": 0.872, + "grad_norm": 17.536413192749023, + "learning_rate": 8.258000000000001e-06, + "loss": 0.7686, + "step": 8720 + }, + { + "epoch": 0.873, + "grad_norm": 25.731748580932617, + "learning_rate": 8.256e-06, + "loss": 0.7772, + "step": 8730 + }, + { + "epoch": 0.874, + "grad_norm": 13.456388473510742, + "learning_rate": 8.254000000000001e-06, + "loss": 0.6912, + "step": 8740 + }, + { + "epoch": 0.875, + "grad_norm": 24.622709274291992, + "learning_rate": 8.252000000000002e-06, + "loss": 0.5566, + "step": 8750 + }, + { + "epoch": 0.876, + "grad_norm": 29.366680145263672, + "learning_rate": 8.25e-06, + "loss": 0.8911, + "step": 8760 + }, + { + "epoch": 0.877, + "grad_norm": 31.854909896850586, + "learning_rate": 8.248e-06, + "loss": 0.7445, + "step": 8770 + }, + { + "epoch": 0.878, + "grad_norm": 22.391782760620117, + "learning_rate": 8.246e-06, + "loss": 0.8338, + "step": 8780 + }, + { + "epoch": 0.879, + "grad_norm": 23.707935333251953, + "learning_rate": 8.244000000000001e-06, + "loss": 0.5422, + "step": 8790 + }, + { + "epoch": 0.88, + "grad_norm": 32.04408645629883, + "learning_rate": 8.242000000000002e-06, + "loss": 0.8426, + "step": 8800 + }, + { + "epoch": 0.881, + "grad_norm": 14.735499382019043, + "learning_rate": 8.24e-06, + "loss": 0.7227, + "step": 8810 + }, + { + "epoch": 0.882, + "grad_norm": 27.44671630859375, + "learning_rate": 8.238e-06, + "loss": 0.7975, + "step": 8820 + }, + { + "epoch": 0.883, + "grad_norm": 15.00820255279541, + "learning_rate": 8.236e-06, + "loss": 0.6293, + "step": 8830 + }, + { + "epoch": 0.884, + "grad_norm": 23.110822677612305, + "learning_rate": 8.234000000000001e-06, + "loss": 0.5946, + "step": 8840 + }, + { + "epoch": 0.885, + "grad_norm": 22.80463218688965, + "learning_rate": 8.232000000000002e-06, + "loss": 0.6666, + "step": 8850 + }, + { + "epoch": 0.886, + "grad_norm": 33.523399353027344, + "learning_rate": 8.23e-06, + "loss": 0.8169, + "step": 8860 + }, + { + "epoch": 0.887, + "grad_norm": 19.332923889160156, + "learning_rate": 8.228e-06, + "loss": 0.6937, + "step": 8870 + }, + { + "epoch": 0.888, + "grad_norm": 12.804621696472168, + "learning_rate": 8.226e-06, + "loss": 0.6046, + "step": 8880 + }, + { + "epoch": 0.889, + "grad_norm": 18.660120010375977, + "learning_rate": 8.224000000000001e-06, + "loss": 0.7445, + "step": 8890 + }, + { + "epoch": 0.89, + "grad_norm": 12.0673246383667, + "learning_rate": 8.222000000000002e-06, + "loss": 0.8442, + "step": 8900 + }, + { + "epoch": 0.891, + "grad_norm": 22.311397552490234, + "learning_rate": 8.220000000000001e-06, + "loss": 0.6267, + "step": 8910 + }, + { + "epoch": 0.892, + "grad_norm": 35.950958251953125, + "learning_rate": 8.218e-06, + "loss": 0.6888, + "step": 8920 + }, + { + "epoch": 0.893, + "grad_norm": 19.454551696777344, + "learning_rate": 8.216e-06, + "loss": 0.7747, + "step": 8930 + }, + { + "epoch": 0.894, + "grad_norm": 24.78478240966797, + "learning_rate": 8.214000000000001e-06, + "loss": 0.6721, + "step": 8940 + }, + { + "epoch": 0.895, + "grad_norm": 21.27381706237793, + "learning_rate": 8.212e-06, + "loss": 0.7577, + "step": 8950 + }, + { + "epoch": 0.896, + "grad_norm": 24.68490982055664, + "learning_rate": 8.210000000000001e-06, + "loss": 0.7143, + "step": 8960 + }, + { + "epoch": 0.897, + "grad_norm": 22.952789306640625, + "learning_rate": 8.208e-06, + "loss": 0.6053, + "step": 8970 + }, + { + "epoch": 0.898, + "grad_norm": 29.25204849243164, + "learning_rate": 8.206e-06, + "loss": 0.5921, + "step": 8980 + }, + { + "epoch": 0.899, + "grad_norm": 26.693504333496094, + "learning_rate": 8.204000000000001e-06, + "loss": 0.6771, + "step": 8990 + }, + { + "epoch": 0.9, + "grad_norm": 30.661624908447266, + "learning_rate": 8.202e-06, + "loss": 0.6005, + "step": 9000 + }, + { + "epoch": 0.901, + "grad_norm": 36.014434814453125, + "learning_rate": 8.2e-06, + "loss": 0.6479, + "step": 9010 + }, + { + "epoch": 0.902, + "grad_norm": 27.50763511657715, + "learning_rate": 8.198e-06, + "loss": 0.6197, + "step": 9020 + }, + { + "epoch": 0.903, + "grad_norm": 27.790868759155273, + "learning_rate": 8.196e-06, + "loss": 0.7401, + "step": 9030 + }, + { + "epoch": 0.904, + "grad_norm": 25.92188262939453, + "learning_rate": 8.194000000000002e-06, + "loss": 0.6251, + "step": 9040 + }, + { + "epoch": 0.905, + "grad_norm": 12.84793758392334, + "learning_rate": 8.192e-06, + "loss": 0.693, + "step": 9050 + }, + { + "epoch": 0.906, + "grad_norm": 26.72030258178711, + "learning_rate": 8.19e-06, + "loss": 0.6391, + "step": 9060 + }, + { + "epoch": 0.907, + "grad_norm": 36.05525207519531, + "learning_rate": 8.188e-06, + "loss": 0.7277, + "step": 9070 + }, + { + "epoch": 0.908, + "grad_norm": 52.11860275268555, + "learning_rate": 8.186000000000001e-06, + "loss": 1.0863, + "step": 9080 + }, + { + "epoch": 0.909, + "grad_norm": 21.379451751708984, + "learning_rate": 8.184000000000002e-06, + "loss": 0.7588, + "step": 9090 + }, + { + "epoch": 0.91, + "grad_norm": 25.653507232666016, + "learning_rate": 8.182e-06, + "loss": 0.7232, + "step": 9100 + }, + { + "epoch": 0.911, + "grad_norm": 84.60053253173828, + "learning_rate": 8.18e-06, + "loss": 0.8049, + "step": 9110 + }, + { + "epoch": 0.912, + "grad_norm": 15.349387168884277, + "learning_rate": 8.178e-06, + "loss": 0.7509, + "step": 9120 + }, + { + "epoch": 0.913, + "grad_norm": 18.783823013305664, + "learning_rate": 8.176000000000001e-06, + "loss": 0.6542, + "step": 9130 + }, + { + "epoch": 0.914, + "grad_norm": 53.27655029296875, + "learning_rate": 8.174e-06, + "loss": 0.8797, + "step": 9140 + }, + { + "epoch": 0.915, + "grad_norm": 27.371829986572266, + "learning_rate": 8.172e-06, + "loss": 0.6392, + "step": 9150 + }, + { + "epoch": 0.916, + "grad_norm": 17.546588897705078, + "learning_rate": 8.17e-06, + "loss": 0.8283, + "step": 9160 + }, + { + "epoch": 0.917, + "grad_norm": 53.05363464355469, + "learning_rate": 8.168e-06, + "loss": 0.6169, + "step": 9170 + }, + { + "epoch": 0.918, + "grad_norm": 43.26979446411133, + "learning_rate": 8.166000000000001e-06, + "loss": 0.6253, + "step": 9180 + }, + { + "epoch": 0.919, + "grad_norm": 33.546607971191406, + "learning_rate": 8.164e-06, + "loss": 0.7871, + "step": 9190 + }, + { + "epoch": 0.92, + "grad_norm": 13.680317878723145, + "learning_rate": 8.162e-06, + "loss": 0.8808, + "step": 9200 + }, + { + "epoch": 0.921, + "grad_norm": 53.66688919067383, + "learning_rate": 8.1602e-06, + "loss": 0.5977, + "step": 9210 + }, + { + "epoch": 0.922, + "grad_norm": 14.099869728088379, + "learning_rate": 8.158200000000001e-06, + "loss": 0.973, + "step": 9220 + }, + { + "epoch": 0.923, + "grad_norm": 5.584235668182373, + "learning_rate": 8.1562e-06, + "loss": 0.464, + "step": 9230 + }, + { + "epoch": 0.924, + "grad_norm": 15.951348304748535, + "learning_rate": 8.1542e-06, + "loss": 0.5346, + "step": 9240 + }, + { + "epoch": 0.925, + "grad_norm": 31.42668342590332, + "learning_rate": 8.1522e-06, + "loss": 0.619, + "step": 9250 + }, + { + "epoch": 0.926, + "grad_norm": 18.869773864746094, + "learning_rate": 8.1502e-06, + "loss": 0.5971, + "step": 9260 + }, + { + "epoch": 0.927, + "grad_norm": 35.41682434082031, + "learning_rate": 8.148200000000001e-06, + "loss": 0.9642, + "step": 9270 + }, + { + "epoch": 0.928, + "grad_norm": 32.13036346435547, + "learning_rate": 8.1462e-06, + "loss": 0.7917, + "step": 9280 + }, + { + "epoch": 0.929, + "grad_norm": 19.31574249267578, + "learning_rate": 8.1442e-06, + "loss": 0.7301, + "step": 9290 + }, + { + "epoch": 0.93, + "grad_norm": 47.323856353759766, + "learning_rate": 8.142200000000001e-06, + "loss": 0.7374, + "step": 9300 + }, + { + "epoch": 0.931, + "grad_norm": 9.623072624206543, + "learning_rate": 8.1402e-06, + "loss": 0.5891, + "step": 9310 + }, + { + "epoch": 0.932, + "grad_norm": 31.580720901489258, + "learning_rate": 8.138200000000001e-06, + "loss": 0.7645, + "step": 9320 + }, + { + "epoch": 0.933, + "grad_norm": 23.051807403564453, + "learning_rate": 8.1362e-06, + "loss": 0.7925, + "step": 9330 + }, + { + "epoch": 0.934, + "grad_norm": 19.1823787689209, + "learning_rate": 8.1342e-06, + "loss": 0.8137, + "step": 9340 + }, + { + "epoch": 0.935, + "grad_norm": 40.923606872558594, + "learning_rate": 8.132200000000002e-06, + "loss": 0.7003, + "step": 9350 + }, + { + "epoch": 0.936, + "grad_norm": 18.946191787719727, + "learning_rate": 8.1302e-06, + "loss": 0.6805, + "step": 9360 + }, + { + "epoch": 0.937, + "grad_norm": 29.28422737121582, + "learning_rate": 8.128200000000001e-06, + "loss": 0.8437, + "step": 9370 + }, + { + "epoch": 0.938, + "grad_norm": 16.271589279174805, + "learning_rate": 8.1262e-06, + "loss": 0.5734, + "step": 9380 + }, + { + "epoch": 0.939, + "grad_norm": 31.045360565185547, + "learning_rate": 8.124200000000001e-06, + "loss": 0.6937, + "step": 9390 + }, + { + "epoch": 0.94, + "grad_norm": 25.518590927124023, + "learning_rate": 8.122200000000002e-06, + "loss": 0.7622, + "step": 9400 + }, + { + "epoch": 0.941, + "grad_norm": 14.280061721801758, + "learning_rate": 8.1202e-06, + "loss": 0.4491, + "step": 9410 + }, + { + "epoch": 0.942, + "grad_norm": 13.236858367919922, + "learning_rate": 8.118200000000001e-06, + "loss": 0.8063, + "step": 9420 + }, + { + "epoch": 0.943, + "grad_norm": 25.479963302612305, + "learning_rate": 8.1162e-06, + "loss": 0.847, + "step": 9430 + }, + { + "epoch": 0.944, + "grad_norm": 47.634483337402344, + "learning_rate": 8.114200000000001e-06, + "loss": 0.7497, + "step": 9440 + }, + { + "epoch": 0.945, + "grad_norm": 20.367294311523438, + "learning_rate": 8.112200000000002e-06, + "loss": 0.4785, + "step": 9450 + }, + { + "epoch": 0.946, + "grad_norm": 34.85844039916992, + "learning_rate": 8.1102e-06, + "loss": 0.7972, + "step": 9460 + }, + { + "epoch": 0.947, + "grad_norm": 30.255382537841797, + "learning_rate": 8.1082e-06, + "loss": 0.7756, + "step": 9470 + }, + { + "epoch": 0.948, + "grad_norm": 30.46112060546875, + "learning_rate": 8.1062e-06, + "loss": 0.8008, + "step": 9480 + }, + { + "epoch": 0.949, + "grad_norm": 15.27729606628418, + "learning_rate": 8.104200000000001e-06, + "loss": 0.5169, + "step": 9490 + }, + { + "epoch": 0.95, + "grad_norm": 81.02428436279297, + "learning_rate": 8.102200000000002e-06, + "loss": 0.7141, + "step": 9500 + }, + { + "epoch": 0.951, + "grad_norm": 22.240251541137695, + "learning_rate": 8.1002e-06, + "loss": 0.5889, + "step": 9510 + }, + { + "epoch": 0.952, + "grad_norm": 28.84646987915039, + "learning_rate": 8.0982e-06, + "loss": 0.6834, + "step": 9520 + }, + { + "epoch": 0.953, + "grad_norm": 35.79847717285156, + "learning_rate": 8.0962e-06, + "loss": 0.7559, + "step": 9530 + }, + { + "epoch": 0.954, + "grad_norm": 25.810239791870117, + "learning_rate": 8.094200000000001e-06, + "loss": 0.7268, + "step": 9540 + }, + { + "epoch": 0.955, + "grad_norm": 34.47658157348633, + "learning_rate": 8.092200000000002e-06, + "loss": 0.6295, + "step": 9550 + }, + { + "epoch": 0.956, + "grad_norm": 33.548397064208984, + "learning_rate": 8.090200000000001e-06, + "loss": 0.7543, + "step": 9560 + }, + { + "epoch": 0.957, + "grad_norm": 28.522300720214844, + "learning_rate": 8.0882e-06, + "loss": 0.6737, + "step": 9570 + }, + { + "epoch": 0.958, + "grad_norm": 4.9988226890563965, + "learning_rate": 8.0862e-06, + "loss": 0.6983, + "step": 9580 + }, + { + "epoch": 0.959, + "grad_norm": 4.651895523071289, + "learning_rate": 8.084200000000001e-06, + "loss": 0.6858, + "step": 9590 + }, + { + "epoch": 0.96, + "grad_norm": 32.256072998046875, + "learning_rate": 8.082200000000002e-06, + "loss": 0.7514, + "step": 9600 + }, + { + "epoch": 0.961, + "grad_norm": 21.96092987060547, + "learning_rate": 8.080200000000001e-06, + "loss": 0.6136, + "step": 9610 + }, + { + "epoch": 0.962, + "grad_norm": 11.350577354431152, + "learning_rate": 8.0782e-06, + "loss": 0.6035, + "step": 9620 + }, + { + "epoch": 0.963, + "grad_norm": 23.908449172973633, + "learning_rate": 8.0762e-06, + "loss": 0.515, + "step": 9630 + }, + { + "epoch": 0.964, + "grad_norm": 20.183002471923828, + "learning_rate": 8.074200000000001e-06, + "loss": 0.6201, + "step": 9640 + }, + { + "epoch": 0.965, + "grad_norm": 48.198543548583984, + "learning_rate": 8.0722e-06, + "loss": 0.6703, + "step": 9650 + }, + { + "epoch": 0.966, + "grad_norm": 57.94309616088867, + "learning_rate": 8.070200000000001e-06, + "loss": 0.952, + "step": 9660 + }, + { + "epoch": 0.967, + "grad_norm": 0.18479055166244507, + "learning_rate": 8.0682e-06, + "loss": 0.5543, + "step": 9670 + }, + { + "epoch": 0.968, + "grad_norm": 31.414304733276367, + "learning_rate": 8.0662e-06, + "loss": 0.7087, + "step": 9680 + }, + { + "epoch": 0.969, + "grad_norm": 21.823001861572266, + "learning_rate": 8.064200000000001e-06, + "loss": 0.8891, + "step": 9690 + }, + { + "epoch": 0.97, + "grad_norm": 20.574464797973633, + "learning_rate": 8.0622e-06, + "loss": 0.7342, + "step": 9700 + }, + { + "epoch": 0.971, + "grad_norm": 40.75229263305664, + "learning_rate": 8.0602e-06, + "loss": 0.6528, + "step": 9710 + }, + { + "epoch": 0.972, + "grad_norm": 31.854001998901367, + "learning_rate": 8.0582e-06, + "loss": 0.7637, + "step": 9720 + }, + { + "epoch": 0.973, + "grad_norm": 58.702945709228516, + "learning_rate": 8.056200000000001e-06, + "loss": 1.0291, + "step": 9730 + }, + { + "epoch": 0.974, + "grad_norm": 22.919151306152344, + "learning_rate": 8.054200000000002e-06, + "loss": 0.6746, + "step": 9740 + }, + { + "epoch": 0.975, + "grad_norm": 13.362497329711914, + "learning_rate": 8.0522e-06, + "loss": 0.8915, + "step": 9750 + }, + { + "epoch": 0.976, + "grad_norm": 24.370723724365234, + "learning_rate": 8.0502e-06, + "loss": 0.5496, + "step": 9760 + }, + { + "epoch": 0.977, + "grad_norm": 26.763219833374023, + "learning_rate": 8.0482e-06, + "loss": 0.8504, + "step": 9770 + }, + { + "epoch": 0.978, + "grad_norm": 23.564735412597656, + "learning_rate": 8.046200000000001e-06, + "loss": 0.5827, + "step": 9780 + }, + { + "epoch": 0.979, + "grad_norm": 34.036258697509766, + "learning_rate": 8.044200000000002e-06, + "loss": 0.7833, + "step": 9790 + }, + { + "epoch": 0.98, + "grad_norm": 23.769405364990234, + "learning_rate": 8.0422e-06, + "loss": 0.8223, + "step": 9800 + }, + { + "epoch": 0.981, + "grad_norm": 16.29641342163086, + "learning_rate": 8.0402e-06, + "loss": 0.8408, + "step": 9810 + }, + { + "epoch": 0.982, + "grad_norm": 6.285024642944336, + "learning_rate": 8.0382e-06, + "loss": 0.6709, + "step": 9820 + }, + { + "epoch": 0.983, + "grad_norm": 16.67902946472168, + "learning_rate": 8.036200000000001e-06, + "loss": 0.8057, + "step": 9830 + }, + { + "epoch": 0.984, + "grad_norm": 15.825904846191406, + "learning_rate": 8.0342e-06, + "loss": 0.5174, + "step": 9840 + }, + { + "epoch": 0.985, + "grad_norm": 22.62651824951172, + "learning_rate": 8.0322e-06, + "loss": 0.5081, + "step": 9850 + }, + { + "epoch": 0.986, + "grad_norm": 22.813600540161133, + "learning_rate": 8.0302e-06, + "loss": 0.6055, + "step": 9860 + }, + { + "epoch": 0.987, + "grad_norm": 39.27797317504883, + "learning_rate": 8.0282e-06, + "loss": 0.6965, + "step": 9870 + }, + { + "epoch": 0.988, + "grad_norm": 21.403615951538086, + "learning_rate": 8.026200000000001e-06, + "loss": 0.6509, + "step": 9880 + }, + { + "epoch": 0.989, + "grad_norm": 38.3160400390625, + "learning_rate": 8.0242e-06, + "loss": 0.4817, + "step": 9890 + }, + { + "epoch": 0.99, + "grad_norm": 4.218068599700928, + "learning_rate": 8.022200000000001e-06, + "loss": 0.8259, + "step": 9900 + }, + { + "epoch": 0.991, + "grad_norm": 38.34626388549805, + "learning_rate": 8.0202e-06, + "loss": 0.7534, + "step": 9910 + }, + { + "epoch": 0.992, + "grad_norm": 17.265363693237305, + "learning_rate": 8.0182e-06, + "loss": 0.807, + "step": 9920 + }, + { + "epoch": 0.993, + "grad_norm": 25.756587982177734, + "learning_rate": 8.016200000000001e-06, + "loss": 0.7307, + "step": 9930 + }, + { + "epoch": 0.994, + "grad_norm": 27.01302719116211, + "learning_rate": 8.0142e-06, + "loss": 0.7551, + "step": 9940 + }, + { + "epoch": 0.995, + "grad_norm": 22.30595588684082, + "learning_rate": 8.012200000000001e-06, + "loss": 0.6335, + "step": 9950 + }, + { + "epoch": 0.996, + "grad_norm": 29.83885955810547, + "learning_rate": 8.0102e-06, + "loss": 0.6483, + "step": 9960 + }, + { + "epoch": 0.997, + "grad_norm": 22.51917839050293, + "learning_rate": 8.0082e-06, + "loss": 0.7563, + "step": 9970 + }, + { + "epoch": 0.998, + "grad_norm": 28.65583038330078, + "learning_rate": 8.006200000000001e-06, + "loss": 0.5714, + "step": 9980 + }, + { + "epoch": 0.999, + "grad_norm": 40.359317779541016, + "learning_rate": 8.0042e-06, + "loss": 0.615, + "step": 9990 + }, + { + "epoch": 1.0, + "grad_norm": 48.593536376953125, + "learning_rate": 8.002200000000001e-06, + "loss": 0.6024, + "step": 10000 + }, + { + "epoch": 1.001, + "grad_norm": 32.619136810302734, + "learning_rate": 8.0002e-06, + "loss": 0.7511, + "step": 10010 + }, + { + "epoch": 1.002, + "grad_norm": 22.86610221862793, + "learning_rate": 7.9982e-06, + "loss": 0.7653, + "step": 10020 + }, + { + "epoch": 1.003, + "grad_norm": 36.1226692199707, + "learning_rate": 7.996200000000001e-06, + "loss": 0.7023, + "step": 10030 + }, + { + "epoch": 1.004, + "grad_norm": 48.830448150634766, + "learning_rate": 7.9942e-06, + "loss": 0.6821, + "step": 10040 + }, + { + "epoch": 1.005, + "grad_norm": 40.12712097167969, + "learning_rate": 7.992200000000001e-06, + "loss": 0.9133, + "step": 10050 + }, + { + "epoch": 1.006, + "grad_norm": 16.794452667236328, + "learning_rate": 7.9902e-06, + "loss": 0.6606, + "step": 10060 + }, + { + "epoch": 1.007, + "grad_norm": 22.306949615478516, + "learning_rate": 7.988200000000001e-06, + "loss": 0.5966, + "step": 10070 + }, + { + "epoch": 1.008, + "grad_norm": 10.76391887664795, + "learning_rate": 7.9862e-06, + "loss": 0.5588, + "step": 10080 + }, + { + "epoch": 1.009, + "grad_norm": 30.505229949951172, + "learning_rate": 7.9842e-06, + "loss": 0.7045, + "step": 10090 + }, + { + "epoch": 1.01, + "grad_norm": 25.643178939819336, + "learning_rate": 7.982200000000001e-06, + "loss": 0.6035, + "step": 10100 + }, + { + "epoch": 1.011, + "grad_norm": 29.607240676879883, + "learning_rate": 7.9802e-06, + "loss": 0.4267, + "step": 10110 + }, + { + "epoch": 1.012, + "grad_norm": 28.369211196899414, + "learning_rate": 7.978200000000001e-06, + "loss": 0.6734, + "step": 10120 + }, + { + "epoch": 1.013, + "grad_norm": 23.7529354095459, + "learning_rate": 7.9762e-06, + "loss": 0.632, + "step": 10130 + }, + { + "epoch": 1.014, + "grad_norm": 38.64054870605469, + "learning_rate": 7.9742e-06, + "loss": 0.7071, + "step": 10140 + }, + { + "epoch": 1.015, + "grad_norm": 24.465662002563477, + "learning_rate": 7.972200000000001e-06, + "loss": 0.7538, + "step": 10150 + }, + { + "epoch": 1.016, + "grad_norm": 40.59769821166992, + "learning_rate": 7.9702e-06, + "loss": 0.7346, + "step": 10160 + }, + { + "epoch": 1.017, + "grad_norm": 4.554698467254639, + "learning_rate": 7.968200000000001e-06, + "loss": 0.5459, + "step": 10170 + }, + { + "epoch": 1.018, + "grad_norm": 10.751768112182617, + "learning_rate": 7.9662e-06, + "loss": 0.4939, + "step": 10180 + }, + { + "epoch": 1.019, + "grad_norm": 2.6401894092559814, + "learning_rate": 7.9642e-06, + "loss": 0.5954, + "step": 10190 + }, + { + "epoch": 1.02, + "grad_norm": 16.48724937438965, + "learning_rate": 7.962200000000001e-06, + "loss": 0.6032, + "step": 10200 + }, + { + "epoch": 1.021, + "grad_norm": 27.149873733520508, + "learning_rate": 7.9602e-06, + "loss": 0.4046, + "step": 10210 + }, + { + "epoch": 1.022, + "grad_norm": 25.46242332458496, + "learning_rate": 7.958200000000001e-06, + "loss": 0.8915, + "step": 10220 + }, + { + "epoch": 1.023, + "grad_norm": 50.827213287353516, + "learning_rate": 7.9562e-06, + "loss": 0.6709, + "step": 10230 + }, + { + "epoch": 1.024, + "grad_norm": 21.56336212158203, + "learning_rate": 7.954200000000001e-06, + "loss": 0.573, + "step": 10240 + }, + { + "epoch": 1.025, + "grad_norm": 20.957317352294922, + "learning_rate": 7.952200000000002e-06, + "loss": 0.5161, + "step": 10250 + }, + { + "epoch": 1.026, + "grad_norm": 46.837120056152344, + "learning_rate": 7.9502e-06, + "loss": 0.7277, + "step": 10260 + }, + { + "epoch": 1.027, + "grad_norm": 75.76361846923828, + "learning_rate": 7.9482e-06, + "loss": 0.7414, + "step": 10270 + }, + { + "epoch": 1.028, + "grad_norm": 53.485206604003906, + "learning_rate": 7.9462e-06, + "loss": 1.0627, + "step": 10280 + }, + { + "epoch": 1.029, + "grad_norm": 21.28236961364746, + "learning_rate": 7.944200000000001e-06, + "loss": 0.5271, + "step": 10290 + }, + { + "epoch": 1.03, + "grad_norm": 13.299246788024902, + "learning_rate": 7.942200000000002e-06, + "loss": 0.6222, + "step": 10300 + }, + { + "epoch": 1.031, + "grad_norm": 46.65543746948242, + "learning_rate": 7.9402e-06, + "loss": 0.8833, + "step": 10310 + }, + { + "epoch": 1.032, + "grad_norm": 28.530805587768555, + "learning_rate": 7.9382e-06, + "loss": 0.9351, + "step": 10320 + }, + { + "epoch": 1.033, + "grad_norm": 21.32673454284668, + "learning_rate": 7.9362e-06, + "loss": 0.5873, + "step": 10330 + }, + { + "epoch": 1.034, + "grad_norm": 33.533451080322266, + "learning_rate": 7.934200000000001e-06, + "loss": 0.7576, + "step": 10340 + }, + { + "epoch": 1.035, + "grad_norm": 36.07699966430664, + "learning_rate": 7.932200000000002e-06, + "loss": 0.7857, + "step": 10350 + }, + { + "epoch": 1.036, + "grad_norm": 22.704744338989258, + "learning_rate": 7.9302e-06, + "loss": 0.7796, + "step": 10360 + }, + { + "epoch": 1.037, + "grad_norm": 21.681188583374023, + "learning_rate": 7.9282e-06, + "loss": 0.8053, + "step": 10370 + }, + { + "epoch": 1.038, + "grad_norm": 25.426740646362305, + "learning_rate": 7.9262e-06, + "loss": 0.7377, + "step": 10380 + }, + { + "epoch": 1.039, + "grad_norm": 34.104248046875, + "learning_rate": 7.924200000000001e-06, + "loss": 0.6537, + "step": 10390 + }, + { + "epoch": 1.04, + "grad_norm": 14.893959999084473, + "learning_rate": 7.922200000000002e-06, + "loss": 0.4739, + "step": 10400 + }, + { + "epoch": 1.041, + "grad_norm": 64.21956634521484, + "learning_rate": 7.9202e-06, + "loss": 0.8915, + "step": 10410 + }, + { + "epoch": 1.042, + "grad_norm": 35.314414978027344, + "learning_rate": 7.9182e-06, + "loss": 0.8416, + "step": 10420 + }, + { + "epoch": 1.043, + "grad_norm": 9.086963653564453, + "learning_rate": 7.9162e-06, + "loss": 0.7986, + "step": 10430 + }, + { + "epoch": 1.044, + "grad_norm": 20.71559715270996, + "learning_rate": 7.914200000000001e-06, + "loss": 0.5166, + "step": 10440 + }, + { + "epoch": 1.045, + "grad_norm": 32.6236572265625, + "learning_rate": 7.9122e-06, + "loss": 0.7166, + "step": 10450 + }, + { + "epoch": 1.046, + "grad_norm": 34.77704620361328, + "learning_rate": 7.910200000000001e-06, + "loss": 0.7366, + "step": 10460 + }, + { + "epoch": 1.047, + "grad_norm": 30.373966217041016, + "learning_rate": 7.9082e-06, + "loss": 0.5817, + "step": 10470 + }, + { + "epoch": 1.048, + "grad_norm": 21.460689544677734, + "learning_rate": 7.9062e-06, + "loss": 0.6465, + "step": 10480 + }, + { + "epoch": 1.049, + "grad_norm": 21.833818435668945, + "learning_rate": 7.904200000000001e-06, + "loss": 0.5789, + "step": 10490 + }, + { + "epoch": 1.05, + "grad_norm": 22.837757110595703, + "learning_rate": 7.9022e-06, + "loss": 0.8323, + "step": 10500 + }, + { + "epoch": 1.051, + "grad_norm": 14.228981018066406, + "learning_rate": 7.9002e-06, + "loss": 0.6262, + "step": 10510 + }, + { + "epoch": 1.052, + "grad_norm": 24.556312561035156, + "learning_rate": 7.8982e-06, + "loss": 0.7267, + "step": 10520 + }, + { + "epoch": 1.053, + "grad_norm": 54.67141342163086, + "learning_rate": 7.8962e-06, + "loss": 0.4909, + "step": 10530 + }, + { + "epoch": 1.054, + "grad_norm": 33.58781051635742, + "learning_rate": 7.894200000000001e-06, + "loss": 0.6022, + "step": 10540 + }, + { + "epoch": 1.055, + "grad_norm": 16.152503967285156, + "learning_rate": 7.8922e-06, + "loss": 0.5752, + "step": 10550 + }, + { + "epoch": 1.056, + "grad_norm": 50.25066375732422, + "learning_rate": 7.890200000000001e-06, + "loss": 0.7387, + "step": 10560 + }, + { + "epoch": 1.057, + "grad_norm": 23.39357566833496, + "learning_rate": 7.8882e-06, + "loss": 0.5076, + "step": 10570 + }, + { + "epoch": 1.058, + "grad_norm": 22.392078399658203, + "learning_rate": 7.8862e-06, + "loss": 0.961, + "step": 10580 + }, + { + "epoch": 1.059, + "grad_norm": 12.517849922180176, + "learning_rate": 7.884200000000002e-06, + "loss": 0.6681, + "step": 10590 + }, + { + "epoch": 1.06, + "grad_norm": 25.450721740722656, + "learning_rate": 7.8822e-06, + "loss": 0.7726, + "step": 10600 + }, + { + "epoch": 1.061, + "grad_norm": 37.7824821472168, + "learning_rate": 7.880200000000001e-06, + "loss": 0.4592, + "step": 10610 + }, + { + "epoch": 1.062, + "grad_norm": 33.96101379394531, + "learning_rate": 7.8782e-06, + "loss": 0.9868, + "step": 10620 + }, + { + "epoch": 1.063, + "grad_norm": 7.661472320556641, + "learning_rate": 7.876200000000001e-06, + "loss": 0.6406, + "step": 10630 + }, + { + "epoch": 1.064, + "grad_norm": 32.47237777709961, + "learning_rate": 7.8742e-06, + "loss": 0.6936, + "step": 10640 + }, + { + "epoch": 1.065, + "grad_norm": 54.157474517822266, + "learning_rate": 7.8722e-06, + "loss": 0.7992, + "step": 10650 + }, + { + "epoch": 1.066, + "grad_norm": 31.294652938842773, + "learning_rate": 7.870200000000001e-06, + "loss": 0.7245, + "step": 10660 + }, + { + "epoch": 1.067, + "grad_norm": 41.6014404296875, + "learning_rate": 7.8682e-06, + "loss": 0.6573, + "step": 10670 + }, + { + "epoch": 1.068, + "grad_norm": 19.699817657470703, + "learning_rate": 7.866200000000001e-06, + "loss": 0.6731, + "step": 10680 + }, + { + "epoch": 1.069, + "grad_norm": 41.20882034301758, + "learning_rate": 7.8642e-06, + "loss": 0.3784, + "step": 10690 + }, + { + "epoch": 1.07, + "grad_norm": 25.886396408081055, + "learning_rate": 7.8622e-06, + "loss": 0.509, + "step": 10700 + }, + { + "epoch": 1.071, + "grad_norm": 40.70294189453125, + "learning_rate": 7.860200000000001e-06, + "loss": 0.6683, + "step": 10710 + }, + { + "epoch": 1.072, + "grad_norm": 41.22813415527344, + "learning_rate": 7.8582e-06, + "loss": 0.7252, + "step": 10720 + }, + { + "epoch": 1.073, + "grad_norm": 21.7695255279541, + "learning_rate": 7.856200000000001e-06, + "loss": 0.6357, + "step": 10730 + }, + { + "epoch": 1.074, + "grad_norm": 41.75007247924805, + "learning_rate": 7.8542e-06, + "loss": 0.7398, + "step": 10740 + }, + { + "epoch": 1.075, + "grad_norm": 7.795687675476074, + "learning_rate": 7.8522e-06, + "loss": 0.8729, + "step": 10750 + }, + { + "epoch": 1.076, + "grad_norm": 17.92356300354004, + "learning_rate": 7.850200000000002e-06, + "loss": 0.6661, + "step": 10760 + }, + { + "epoch": 1.077, + "grad_norm": 15.715928077697754, + "learning_rate": 7.8482e-06, + "loss": 0.528, + "step": 10770 + }, + { + "epoch": 1.078, + "grad_norm": 27.70500946044922, + "learning_rate": 7.846200000000001e-06, + "loss": 0.8231, + "step": 10780 + }, + { + "epoch": 1.079, + "grad_norm": 16.61052894592285, + "learning_rate": 7.8442e-06, + "loss": 0.6257, + "step": 10790 + }, + { + "epoch": 1.08, + "grad_norm": 8.07752513885498, + "learning_rate": 7.842200000000001e-06, + "loss": 0.7334, + "step": 10800 + }, + { + "epoch": 1.081, + "grad_norm": 34.947757720947266, + "learning_rate": 7.840200000000002e-06, + "loss": 0.6238, + "step": 10810 + }, + { + "epoch": 1.082, + "grad_norm": 19.702144622802734, + "learning_rate": 7.8382e-06, + "loss": 0.5864, + "step": 10820 + }, + { + "epoch": 1.083, + "grad_norm": 35.57761764526367, + "learning_rate": 7.836200000000001e-06, + "loss": 0.747, + "step": 10830 + }, + { + "epoch": 1.084, + "grad_norm": 26.503231048583984, + "learning_rate": 7.8342e-06, + "loss": 0.73, + "step": 10840 + }, + { + "epoch": 1.085, + "grad_norm": 37.302093505859375, + "learning_rate": 7.832200000000001e-06, + "loss": 0.6326, + "step": 10850 + }, + { + "epoch": 1.086, + "grad_norm": 24.52855110168457, + "learning_rate": 7.830200000000002e-06, + "loss": 0.4165, + "step": 10860 + }, + { + "epoch": 1.087, + "grad_norm": 17.49752426147461, + "learning_rate": 7.8282e-06, + "loss": 0.5934, + "step": 10870 + }, + { + "epoch": 1.088, + "grad_norm": 50.52231979370117, + "learning_rate": 7.8262e-06, + "loss": 0.482, + "step": 10880 + }, + { + "epoch": 1.089, + "grad_norm": 23.141277313232422, + "learning_rate": 7.8242e-06, + "loss": 0.5202, + "step": 10890 + }, + { + "epoch": 1.09, + "grad_norm": 28.306123733520508, + "learning_rate": 7.822200000000001e-06, + "loss": 0.7558, + "step": 10900 + }, + { + "epoch": 1.091, + "grad_norm": 25.076086044311523, + "learning_rate": 7.820200000000002e-06, + "loss": 0.8782, + "step": 10910 + }, + { + "epoch": 1.092, + "grad_norm": 39.36845016479492, + "learning_rate": 7.8182e-06, + "loss": 0.6682, + "step": 10920 + }, + { + "epoch": 1.093, + "grad_norm": 7.6542067527771, + "learning_rate": 7.8162e-06, + "loss": 0.8071, + "step": 10930 + }, + { + "epoch": 1.094, + "grad_norm": 28.47787094116211, + "learning_rate": 7.8142e-06, + "loss": 0.8684, + "step": 10940 + }, + { + "epoch": 1.095, + "grad_norm": 32.25487518310547, + "learning_rate": 7.812200000000001e-06, + "loss": 0.5147, + "step": 10950 + }, + { + "epoch": 1.096, + "grad_norm": 29.876779556274414, + "learning_rate": 7.810200000000002e-06, + "loss": 0.7231, + "step": 10960 + }, + { + "epoch": 1.097, + "grad_norm": 15.228774070739746, + "learning_rate": 7.808200000000001e-06, + "loss": 0.7224, + "step": 10970 + }, + { + "epoch": 1.098, + "grad_norm": 18.995376586914062, + "learning_rate": 7.8062e-06, + "loss": 0.5264, + "step": 10980 + }, + { + "epoch": 1.099, + "grad_norm": 29.122920989990234, + "learning_rate": 7.8042e-06, + "loss": 0.5653, + "step": 10990 + }, + { + "epoch": 1.1, + "grad_norm": 47.868804931640625, + "learning_rate": 7.802200000000001e-06, + "loss": 0.6261, + "step": 11000 + }, + { + "epoch": 1.101, + "grad_norm": 30.663339614868164, + "learning_rate": 7.8002e-06, + "loss": 0.48, + "step": 11010 + }, + { + "epoch": 1.102, + "grad_norm": 8.724465370178223, + "learning_rate": 7.798200000000001e-06, + "loss": 0.4851, + "step": 11020 + }, + { + "epoch": 1.103, + "grad_norm": 52.42881393432617, + "learning_rate": 7.7962e-06, + "loss": 0.6022, + "step": 11030 + }, + { + "epoch": 1.104, + "grad_norm": 47.27696990966797, + "learning_rate": 7.7942e-06, + "loss": 0.8039, + "step": 11040 + }, + { + "epoch": 1.105, + "grad_norm": 51.4878044128418, + "learning_rate": 7.792200000000001e-06, + "loss": 0.7665, + "step": 11050 + }, + { + "epoch": 1.106, + "grad_norm": 35.13274383544922, + "learning_rate": 7.7902e-06, + "loss": 0.6382, + "step": 11060 + }, + { + "epoch": 1.107, + "grad_norm": 61.97047424316406, + "learning_rate": 7.7882e-06, + "loss": 0.4421, + "step": 11070 + }, + { + "epoch": 1.108, + "grad_norm": 54.35153579711914, + "learning_rate": 7.7862e-06, + "loss": 0.9091, + "step": 11080 + }, + { + "epoch": 1.109, + "grad_norm": 5.162842273712158, + "learning_rate": 7.7842e-06, + "loss": 0.42, + "step": 11090 + }, + { + "epoch": 1.11, + "grad_norm": 29.818588256835938, + "learning_rate": 7.782200000000001e-06, + "loss": 0.8216, + "step": 11100 + }, + { + "epoch": 1.111, + "grad_norm": 46.03116989135742, + "learning_rate": 7.7802e-06, + "loss": 0.9207, + "step": 11110 + }, + { + "epoch": 1.112, + "grad_norm": 40.337379455566406, + "learning_rate": 7.7782e-06, + "loss": 0.7755, + "step": 11120 + }, + { + "epoch": 1.113, + "grad_norm": 27.618555068969727, + "learning_rate": 7.7762e-06, + "loss": 0.5692, + "step": 11130 + }, + { + "epoch": 1.114, + "grad_norm": 39.48080062866211, + "learning_rate": 7.774200000000001e-06, + "loss": 0.7518, + "step": 11140 + }, + { + "epoch": 1.115, + "grad_norm": 26.62726402282715, + "learning_rate": 7.772200000000002e-06, + "loss": 0.6853, + "step": 11150 + }, + { + "epoch": 1.116, + "grad_norm": 20.606592178344727, + "learning_rate": 7.7702e-06, + "loss": 0.8051, + "step": 11160 + }, + { + "epoch": 1.117, + "grad_norm": 38.8145751953125, + "learning_rate": 7.7682e-06, + "loss": 0.6414, + "step": 11170 + }, + { + "epoch": 1.1179999999999999, + "grad_norm": 36.362327575683594, + "learning_rate": 7.7662e-06, + "loss": 0.6396, + "step": 11180 + }, + { + "epoch": 1.119, + "grad_norm": 15.188089370727539, + "learning_rate": 7.764200000000001e-06, + "loss": 0.7237, + "step": 11190 + }, + { + "epoch": 1.12, + "grad_norm": 43.664649963378906, + "learning_rate": 7.762200000000002e-06, + "loss": 0.806, + "step": 11200 + }, + { + "epoch": 1.121, + "grad_norm": 45.87483596801758, + "learning_rate": 7.7602e-06, + "loss": 0.6526, + "step": 11210 + }, + { + "epoch": 1.1219999999999999, + "grad_norm": 41.26497268676758, + "learning_rate": 7.7582e-06, + "loss": 0.4962, + "step": 11220 + }, + { + "epoch": 1.123, + "grad_norm": 23.14554786682129, + "learning_rate": 7.7562e-06, + "loss": 0.6414, + "step": 11230 + }, + { + "epoch": 1.124, + "grad_norm": 13.66366958618164, + "learning_rate": 7.754200000000001e-06, + "loss": 0.811, + "step": 11240 + }, + { + "epoch": 1.125, + "grad_norm": 43.1004638671875, + "learning_rate": 7.7522e-06, + "loss": 0.6546, + "step": 11250 + }, + { + "epoch": 1.126, + "grad_norm": 33.18350601196289, + "learning_rate": 7.7502e-06, + "loss": 0.7879, + "step": 11260 + }, + { + "epoch": 1.127, + "grad_norm": 11.210447311401367, + "learning_rate": 7.7482e-06, + "loss": 0.6429, + "step": 11270 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 50.47068405151367, + "learning_rate": 7.7462e-06, + "loss": 0.4329, + "step": 11280 + }, + { + "epoch": 1.129, + "grad_norm": 38.2538948059082, + "learning_rate": 7.744200000000001e-06, + "loss": 0.7161, + "step": 11290 + }, + { + "epoch": 1.13, + "grad_norm": 22.432058334350586, + "learning_rate": 7.7422e-06, + "loss": 0.535, + "step": 11300 + }, + { + "epoch": 1.131, + "grad_norm": 44.708797454833984, + "learning_rate": 7.740200000000001e-06, + "loss": 0.7098, + "step": 11310 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 27.494529724121094, + "learning_rate": 7.7382e-06, + "loss": 0.6044, + "step": 11320 + }, + { + "epoch": 1.133, + "grad_norm": 37.533042907714844, + "learning_rate": 7.7362e-06, + "loss": 0.944, + "step": 11330 + }, + { + "epoch": 1.134, + "grad_norm": 24.845661163330078, + "learning_rate": 7.734200000000001e-06, + "loss": 0.7786, + "step": 11340 + }, + { + "epoch": 1.135, + "grad_norm": 36.159950256347656, + "learning_rate": 7.7322e-06, + "loss": 0.5647, + "step": 11350 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 33.843299865722656, + "learning_rate": 7.730200000000001e-06, + "loss": 0.9112, + "step": 11360 + }, + { + "epoch": 1.137, + "grad_norm": 46.498435974121094, + "learning_rate": 7.7282e-06, + "loss": 1.0631, + "step": 11370 + }, + { + "epoch": 1.138, + "grad_norm": 24.42860221862793, + "learning_rate": 7.7262e-06, + "loss": 0.8096, + "step": 11380 + }, + { + "epoch": 1.139, + "grad_norm": 14.397269248962402, + "learning_rate": 7.724200000000001e-06, + "loss": 0.5918, + "step": 11390 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 14.705584526062012, + "learning_rate": 7.7222e-06, + "loss": 0.5131, + "step": 11400 + }, + { + "epoch": 1.141, + "grad_norm": 26.62394142150879, + "learning_rate": 7.720200000000001e-06, + "loss": 0.6488, + "step": 11410 + }, + { + "epoch": 1.142, + "grad_norm": 32.86175537109375, + "learning_rate": 7.7182e-06, + "loss": 0.5914, + "step": 11420 + }, + { + "epoch": 1.143, + "grad_norm": 20.663698196411133, + "learning_rate": 7.7162e-06, + "loss": 0.6432, + "step": 11430 + }, + { + "epoch": 1.144, + "grad_norm": 30.446578979492188, + "learning_rate": 7.7142e-06, + "loss": 0.7907, + "step": 11440 + }, + { + "epoch": 1.145, + "grad_norm": 34.20303726196289, + "learning_rate": 7.7122e-06, + "loss": 0.7505, + "step": 11450 + }, + { + "epoch": 1.146, + "grad_norm": 26.968290328979492, + "learning_rate": 7.710200000000001e-06, + "loss": 0.8869, + "step": 11460 + }, + { + "epoch": 1.147, + "grad_norm": 50.744842529296875, + "learning_rate": 7.7082e-06, + "loss": 0.87, + "step": 11470 + }, + { + "epoch": 1.148, + "grad_norm": 31.850502014160156, + "learning_rate": 7.706200000000001e-06, + "loss": 0.5349, + "step": 11480 + }, + { + "epoch": 1.149, + "grad_norm": 32.834651947021484, + "learning_rate": 7.7042e-06, + "loss": 0.5677, + "step": 11490 + }, + { + "epoch": 1.15, + "grad_norm": 21.380491256713867, + "learning_rate": 7.7022e-06, + "loss": 0.6704, + "step": 11500 + }, + { + "epoch": 1.151, + "grad_norm": 11.736533164978027, + "learning_rate": 7.700200000000001e-06, + "loss": 0.7262, + "step": 11510 + }, + { + "epoch": 1.152, + "grad_norm": 52.395469665527344, + "learning_rate": 7.6982e-06, + "loss": 0.6247, + "step": 11520 + }, + { + "epoch": 1.153, + "grad_norm": 39.82683181762695, + "learning_rate": 7.696200000000001e-06, + "loss": 0.7259, + "step": 11530 + }, + { + "epoch": 1.154, + "grad_norm": 23.57517433166504, + "learning_rate": 7.6942e-06, + "loss": 0.7778, + "step": 11540 + }, + { + "epoch": 1.155, + "grad_norm": 8.457917213439941, + "learning_rate": 7.6922e-06, + "loss": 0.835, + "step": 11550 + }, + { + "epoch": 1.156, + "grad_norm": 16.930097579956055, + "learning_rate": 7.690200000000001e-06, + "loss": 0.6884, + "step": 11560 + }, + { + "epoch": 1.157, + "grad_norm": 37.95302963256836, + "learning_rate": 7.6882e-06, + "loss": 0.8589, + "step": 11570 + }, + { + "epoch": 1.158, + "grad_norm": 26.018352508544922, + "learning_rate": 7.686200000000001e-06, + "loss": 0.6359, + "step": 11580 + }, + { + "epoch": 1.159, + "grad_norm": 16.526905059814453, + "learning_rate": 7.6842e-06, + "loss": 0.5665, + "step": 11590 + }, + { + "epoch": 1.16, + "grad_norm": 39.45627212524414, + "learning_rate": 7.6822e-06, + "loss": 0.845, + "step": 11600 + }, + { + "epoch": 1.161, + "grad_norm": 27.748611450195312, + "learning_rate": 7.680200000000001e-06, + "loss": 0.5258, + "step": 11610 + }, + { + "epoch": 1.162, + "grad_norm": 14.75036907196045, + "learning_rate": 7.6782e-06, + "loss": 0.7445, + "step": 11620 + }, + { + "epoch": 1.163, + "grad_norm": 32.02853775024414, + "learning_rate": 7.676200000000001e-06, + "loss": 0.5285, + "step": 11630 + }, + { + "epoch": 1.164, + "grad_norm": 23.34952163696289, + "learning_rate": 7.6742e-06, + "loss": 0.7296, + "step": 11640 + }, + { + "epoch": 1.165, + "grad_norm": 43.00461959838867, + "learning_rate": 7.6722e-06, + "loss": 0.7876, + "step": 11650 + }, + { + "epoch": 1.166, + "grad_norm": 33.411170959472656, + "learning_rate": 7.670200000000002e-06, + "loss": 0.5697, + "step": 11660 + }, + { + "epoch": 1.167, + "grad_norm": 32.586212158203125, + "learning_rate": 7.6682e-06, + "loss": 0.5464, + "step": 11670 + }, + { + "epoch": 1.168, + "grad_norm": 9.358975410461426, + "learning_rate": 7.6662e-06, + "loss": 0.515, + "step": 11680 + }, + { + "epoch": 1.169, + "grad_norm": 25.917869567871094, + "learning_rate": 7.6642e-06, + "loss": 0.8162, + "step": 11690 + }, + { + "epoch": 1.17, + "grad_norm": 20.3763427734375, + "learning_rate": 7.662200000000001e-06, + "loss": 0.4068, + "step": 11700 + }, + { + "epoch": 1.171, + "grad_norm": 32.39506912231445, + "learning_rate": 7.660200000000002e-06, + "loss": 0.6225, + "step": 11710 + }, + { + "epoch": 1.172, + "grad_norm": 24.119741439819336, + "learning_rate": 7.6582e-06, + "loss": 0.8509, + "step": 11720 + }, + { + "epoch": 1.173, + "grad_norm": 46.87370300292969, + "learning_rate": 7.6562e-06, + "loss": 0.828, + "step": 11730 + }, + { + "epoch": 1.174, + "grad_norm": 28.37071990966797, + "learning_rate": 7.6542e-06, + "loss": 0.9559, + "step": 11740 + }, + { + "epoch": 1.175, + "grad_norm": 38.91966247558594, + "learning_rate": 7.652200000000001e-06, + "loss": 0.7304, + "step": 11750 + }, + { + "epoch": 1.176, + "grad_norm": 42.96327590942383, + "learning_rate": 7.650200000000002e-06, + "loss": 0.7791, + "step": 11760 + }, + { + "epoch": 1.177, + "grad_norm": 23.778846740722656, + "learning_rate": 7.6482e-06, + "loss": 0.8178, + "step": 11770 + }, + { + "epoch": 1.178, + "grad_norm": 27.186412811279297, + "learning_rate": 7.6462e-06, + "loss": 0.7396, + "step": 11780 + }, + { + "epoch": 1.179, + "grad_norm": 32.69036865234375, + "learning_rate": 7.6442e-06, + "loss": 0.8204, + "step": 11790 + }, + { + "epoch": 1.18, + "grad_norm": 18.225202560424805, + "learning_rate": 7.642200000000001e-06, + "loss": 0.6291, + "step": 11800 + }, + { + "epoch": 1.181, + "grad_norm": 16.933101654052734, + "learning_rate": 7.6402e-06, + "loss": 0.6593, + "step": 11810 + }, + { + "epoch": 1.182, + "grad_norm": 32.985843658447266, + "learning_rate": 7.6382e-06, + "loss": 0.836, + "step": 11820 + }, + { + "epoch": 1.183, + "grad_norm": 26.52162742614746, + "learning_rate": 7.6362e-06, + "loss": 0.7246, + "step": 11830 + }, + { + "epoch": 1.184, + "grad_norm": 49.35344314575195, + "learning_rate": 7.6342e-06, + "loss": 0.6721, + "step": 11840 + }, + { + "epoch": 1.185, + "grad_norm": 12.153249740600586, + "learning_rate": 7.632200000000001e-06, + "loss": 0.8688, + "step": 11850 + }, + { + "epoch": 1.186, + "grad_norm": 33.72947311401367, + "learning_rate": 7.6302e-06, + "loss": 0.6242, + "step": 11860 + }, + { + "epoch": 1.187, + "grad_norm": 52.174407958984375, + "learning_rate": 7.628200000000001e-06, + "loss": 0.7162, + "step": 11870 + }, + { + "epoch": 1.188, + "grad_norm": 30.34244728088379, + "learning_rate": 7.6262e-06, + "loss": 0.7312, + "step": 11880 + }, + { + "epoch": 1.189, + "grad_norm": 34.500083923339844, + "learning_rate": 7.624200000000001e-06, + "loss": 0.7341, + "step": 11890 + }, + { + "epoch": 1.19, + "grad_norm": 29.85059356689453, + "learning_rate": 7.6222000000000005e-06, + "loss": 0.5486, + "step": 11900 + }, + { + "epoch": 1.191, + "grad_norm": 17.79989242553711, + "learning_rate": 7.620200000000001e-06, + "loss": 0.7282, + "step": 11910 + }, + { + "epoch": 1.192, + "grad_norm": 10.709245681762695, + "learning_rate": 7.618200000000001e-06, + "loss": 0.5993, + "step": 11920 + }, + { + "epoch": 1.193, + "grad_norm": 25.175249099731445, + "learning_rate": 7.6162e-06, + "loss": 0.7604, + "step": 11930 + }, + { + "epoch": 1.194, + "grad_norm": 26.545978546142578, + "learning_rate": 7.614200000000001e-06, + "loss": 0.6331, + "step": 11940 + }, + { + "epoch": 1.195, + "grad_norm": 39.96792984008789, + "learning_rate": 7.6122000000000006e-06, + "loss": 0.7824, + "step": 11950 + }, + { + "epoch": 1.196, + "grad_norm": 38.01614761352539, + "learning_rate": 7.610200000000001e-06, + "loss": 0.8789, + "step": 11960 + }, + { + "epoch": 1.197, + "grad_norm": 5.848873615264893, + "learning_rate": 7.608200000000001e-06, + "loss": 0.5995, + "step": 11970 + }, + { + "epoch": 1.198, + "grad_norm": 78.58563232421875, + "learning_rate": 7.6062e-06, + "loss": 0.6577, + "step": 11980 + }, + { + "epoch": 1.199, + "grad_norm": 18.236574172973633, + "learning_rate": 7.604200000000001e-06, + "loss": 0.5221, + "step": 11990 + }, + { + "epoch": 1.2, + "grad_norm": 19.23676872253418, + "learning_rate": 7.602200000000001e-06, + "loss": 0.4422, + "step": 12000 + }, + { + "epoch": 1.201, + "grad_norm": 29.161029815673828, + "learning_rate": 7.6002000000000005e-06, + "loss": 0.6526, + "step": 12010 + }, + { + "epoch": 1.202, + "grad_norm": 25.65962791442871, + "learning_rate": 7.598200000000001e-06, + "loss": 0.7309, + "step": 12020 + }, + { + "epoch": 1.203, + "grad_norm": 32.263160705566406, + "learning_rate": 7.5962e-06, + "loss": 0.7232, + "step": 12030 + }, + { + "epoch": 1.204, + "grad_norm": 103.59854125976562, + "learning_rate": 7.5942e-06, + "loss": 0.7941, + "step": 12040 + }, + { + "epoch": 1.205, + "grad_norm": 28.62127685546875, + "learning_rate": 7.592200000000001e-06, + "loss": 0.5728, + "step": 12050 + }, + { + "epoch": 1.206, + "grad_norm": 28.61754035949707, + "learning_rate": 7.590200000000001e-06, + "loss": 0.5754, + "step": 12060 + }, + { + "epoch": 1.207, + "grad_norm": 28.66469955444336, + "learning_rate": 7.588200000000001e-06, + "loss": 0.6362, + "step": 12070 + }, + { + "epoch": 1.208, + "grad_norm": 21.962121963500977, + "learning_rate": 7.5862e-06, + "loss": 0.5206, + "step": 12080 + }, + { + "epoch": 1.209, + "grad_norm": 10.966731071472168, + "learning_rate": 7.5842e-06, + "loss": 0.8621, + "step": 12090 + }, + { + "epoch": 1.21, + "grad_norm": 1.3472261428833008, + "learning_rate": 7.582200000000001e-06, + "loss": 0.6844, + "step": 12100 + }, + { + "epoch": 1.211, + "grad_norm": 51.0182991027832, + "learning_rate": 7.580200000000001e-06, + "loss": 0.6243, + "step": 12110 + }, + { + "epoch": 1.212, + "grad_norm": 13.555909156799316, + "learning_rate": 7.578200000000001e-06, + "loss": 0.5435, + "step": 12120 + }, + { + "epoch": 1.213, + "grad_norm": 43.063419342041016, + "learning_rate": 7.5762e-06, + "loss": 0.72, + "step": 12130 + }, + { + "epoch": 1.214, + "grad_norm": 24.662275314331055, + "learning_rate": 7.5742e-06, + "loss": 0.4902, + "step": 12140 + }, + { + "epoch": 1.215, + "grad_norm": 41.610626220703125, + "learning_rate": 7.572200000000001e-06, + "loss": 0.6616, + "step": 12150 + }, + { + "epoch": 1.216, + "grad_norm": 44.63747787475586, + "learning_rate": 7.570200000000001e-06, + "loss": 0.8976, + "step": 12160 + }, + { + "epoch": 1.217, + "grad_norm": 4.047074794769287, + "learning_rate": 7.5682000000000015e-06, + "loss": 0.9258, + "step": 12170 + }, + { + "epoch": 1.218, + "grad_norm": 52.99892044067383, + "learning_rate": 7.5662000000000005e-06, + "loss": 0.7729, + "step": 12180 + }, + { + "epoch": 1.219, + "grad_norm": 42.149688720703125, + "learning_rate": 7.5642e-06, + "loss": 0.7279, + "step": 12190 + }, + { + "epoch": 1.22, + "grad_norm": 17.185766220092773, + "learning_rate": 7.562200000000001e-06, + "loss": 0.7866, + "step": 12200 + }, + { + "epoch": 1.221, + "grad_norm": 10.956640243530273, + "learning_rate": 7.560200000000001e-06, + "loss": 0.5585, + "step": 12210 + }, + { + "epoch": 1.222, + "grad_norm": 19.674837112426758, + "learning_rate": 7.558200000000001e-06, + "loss": 0.6338, + "step": 12220 + }, + { + "epoch": 1.223, + "grad_norm": 10.953132629394531, + "learning_rate": 7.556200000000001e-06, + "loss": 0.5947, + "step": 12230 + }, + { + "epoch": 1.224, + "grad_norm": 33.46485900878906, + "learning_rate": 7.5542000000000005e-06, + "loss": 0.7458, + "step": 12240 + }, + { + "epoch": 1.225, + "grad_norm": 9.161663055419922, + "learning_rate": 7.5522e-06, + "loss": 0.5293, + "step": 12250 + }, + { + "epoch": 1.226, + "grad_norm": 36.02226257324219, + "learning_rate": 7.550200000000001e-06, + "loss": 0.6799, + "step": 12260 + }, + { + "epoch": 1.227, + "grad_norm": 31.329750061035156, + "learning_rate": 7.548200000000001e-06, + "loss": 0.57, + "step": 12270 + }, + { + "epoch": 1.228, + "grad_norm": 17.420188903808594, + "learning_rate": 7.5462e-06, + "loss": 0.7771, + "step": 12280 + }, + { + "epoch": 1.229, + "grad_norm": 20.56215476989746, + "learning_rate": 7.5442000000000005e-06, + "loss": 0.743, + "step": 12290 + }, + { + "epoch": 1.23, + "grad_norm": 19.782087326049805, + "learning_rate": 7.5422e-06, + "loss": 0.7162, + "step": 12300 + }, + { + "epoch": 1.231, + "grad_norm": 12.472650527954102, + "learning_rate": 7.540200000000001e-06, + "loss": 0.583, + "step": 12310 + }, + { + "epoch": 1.232, + "grad_norm": 32.87686538696289, + "learning_rate": 7.538200000000001e-06, + "loss": 0.6691, + "step": 12320 + }, + { + "epoch": 1.233, + "grad_norm": 4.495699405670166, + "learning_rate": 7.5362e-06, + "loss": 0.4486, + "step": 12330 + }, + { + "epoch": 1.234, + "grad_norm": 21.480024337768555, + "learning_rate": 7.534200000000001e-06, + "loss": 0.7801, + "step": 12340 + }, + { + "epoch": 1.2349999999999999, + "grad_norm": 14.798094749450684, + "learning_rate": 7.5322000000000005e-06, + "loss": 0.5866, + "step": 12350 + }, + { + "epoch": 1.236, + "grad_norm": 32.35955810546875, + "learning_rate": 7.530200000000001e-06, + "loss": 0.743, + "step": 12360 + }, + { + "epoch": 1.237, + "grad_norm": 44.85438537597656, + "learning_rate": 7.528200000000001e-06, + "loss": 0.7529, + "step": 12370 + }, + { + "epoch": 1.238, + "grad_norm": 22.723072052001953, + "learning_rate": 7.5262e-06, + "loss": 0.8058, + "step": 12380 + }, + { + "epoch": 1.2389999999999999, + "grad_norm": 34.42487716674805, + "learning_rate": 7.524200000000001e-06, + "loss": 0.7757, + "step": 12390 + }, + { + "epoch": 1.24, + "grad_norm": 45.39601135253906, + "learning_rate": 7.522200000000001e-06, + "loss": 0.7571, + "step": 12400 + }, + { + "epoch": 1.241, + "grad_norm": 34.43247604370117, + "learning_rate": 7.5202000000000004e-06, + "loss": 0.7506, + "step": 12410 + }, + { + "epoch": 1.242, + "grad_norm": 41.917396545410156, + "learning_rate": 7.518200000000001e-06, + "loss": 0.8379, + "step": 12420 + }, + { + "epoch": 1.2429999999999999, + "grad_norm": 38.676753997802734, + "learning_rate": 7.5162e-06, + "loss": 0.6781, + "step": 12430 + }, + { + "epoch": 1.244, + "grad_norm": 12.698298454284668, + "learning_rate": 7.5142e-06, + "loss": 0.6937, + "step": 12440 + }, + { + "epoch": 1.245, + "grad_norm": 11.609149932861328, + "learning_rate": 7.512200000000001e-06, + "loss": 0.7535, + "step": 12450 + }, + { + "epoch": 1.246, + "grad_norm": 16.510948181152344, + "learning_rate": 7.5102000000000005e-06, + "loss": 0.7634, + "step": 12460 + }, + { + "epoch": 1.2469999999999999, + "grad_norm": 25.841136932373047, + "learning_rate": 7.508200000000001e-06, + "loss": 0.5595, + "step": 12470 + }, + { + "epoch": 1.248, + "grad_norm": 44.224754333496094, + "learning_rate": 7.5062e-06, + "loss": 0.9578, + "step": 12480 + }, + { + "epoch": 1.249, + "grad_norm": 32.64692687988281, + "learning_rate": 7.5042e-06, + "loss": 0.5536, + "step": 12490 + }, + { + "epoch": 1.25, + "grad_norm": 19.11054039001465, + "learning_rate": 7.502200000000001e-06, + "loss": 0.6679, + "step": 12500 + }, + { + "epoch": 1.251, + "grad_norm": 20.72027015686035, + "learning_rate": 7.500200000000001e-06, + "loss": 0.8676, + "step": 12510 + }, + { + "epoch": 1.252, + "grad_norm": 10.526891708374023, + "learning_rate": 7.498200000000001e-06, + "loss": 0.6176, + "step": 12520 + }, + { + "epoch": 1.2530000000000001, + "grad_norm": 20.656282424926758, + "learning_rate": 7.4962e-06, + "loss": 0.7816, + "step": 12530 + }, + { + "epoch": 1.254, + "grad_norm": 57.771263122558594, + "learning_rate": 7.4942e-06, + "loss": 0.5708, + "step": 12540 + }, + { + "epoch": 1.255, + "grad_norm": 10.302445411682129, + "learning_rate": 7.492200000000001e-06, + "loss": 0.7098, + "step": 12550 + }, + { + "epoch": 1.256, + "grad_norm": 70.08512115478516, + "learning_rate": 7.490200000000001e-06, + "loss": 0.8798, + "step": 12560 + }, + { + "epoch": 1.2570000000000001, + "grad_norm": 21.335031509399414, + "learning_rate": 7.488200000000001e-06, + "loss": 0.6085, + "step": 12570 + }, + { + "epoch": 1.258, + "grad_norm": 25.092174530029297, + "learning_rate": 7.4862000000000004e-06, + "loss": 0.6516, + "step": 12580 + }, + { + "epoch": 1.259, + "grad_norm": 11.581510543823242, + "learning_rate": 7.4842e-06, + "loss": 0.5533, + "step": 12590 + }, + { + "epoch": 1.26, + "grad_norm": 15.392578125, + "learning_rate": 7.482200000000001e-06, + "loss": 0.6309, + "step": 12600 + }, + { + "epoch": 1.2610000000000001, + "grad_norm": 46.0922737121582, + "learning_rate": 7.480200000000001e-06, + "loss": 0.7325, + "step": 12610 + }, + { + "epoch": 1.262, + "grad_norm": 18.586734771728516, + "learning_rate": 7.478200000000001e-06, + "loss": 0.7337, + "step": 12620 + }, + { + "epoch": 1.263, + "grad_norm": 5.978358745574951, + "learning_rate": 7.4762000000000005e-06, + "loss": 0.5667, + "step": 12630 + }, + { + "epoch": 1.264, + "grad_norm": 37.30823516845703, + "learning_rate": 7.4742e-06, + "loss": 0.7755, + "step": 12640 + }, + { + "epoch": 1.2650000000000001, + "grad_norm": 24.3037052154541, + "learning_rate": 7.4722e-06, + "loss": 0.7101, + "step": 12650 + }, + { + "epoch": 1.266, + "grad_norm": 27.828401565551758, + "learning_rate": 7.470200000000001e-06, + "loss": 0.5885, + "step": 12660 + }, + { + "epoch": 1.267, + "grad_norm": 23.27979278564453, + "learning_rate": 7.468200000000001e-06, + "loss": 0.6291, + "step": 12670 + }, + { + "epoch": 1.268, + "grad_norm": 22.77558708190918, + "learning_rate": 7.4662e-06, + "loss": 0.7666, + "step": 12680 + }, + { + "epoch": 1.2690000000000001, + "grad_norm": 23.251815795898438, + "learning_rate": 7.4642000000000005e-06, + "loss": 0.5757, + "step": 12690 + }, + { + "epoch": 1.27, + "grad_norm": 25.934316635131836, + "learning_rate": 7.4622e-06, + "loss": 0.632, + "step": 12700 + }, + { + "epoch": 1.271, + "grad_norm": 25.416095733642578, + "learning_rate": 7.460200000000001e-06, + "loss": 0.7164, + "step": 12710 + }, + { + "epoch": 1.272, + "grad_norm": 25.40232276916504, + "learning_rate": 7.458200000000001e-06, + "loss": 0.6782, + "step": 12720 + }, + { + "epoch": 1.2730000000000001, + "grad_norm": 48.11226272583008, + "learning_rate": 7.4562e-06, + "loss": 0.6615, + "step": 12730 + }, + { + "epoch": 1.274, + "grad_norm": 8.603801727294922, + "learning_rate": 7.4542000000000006e-06, + "loss": 0.7099, + "step": 12740 + }, + { + "epoch": 1.275, + "grad_norm": 18.267532348632812, + "learning_rate": 7.4522e-06, + "loss": 0.7334, + "step": 12750 + }, + { + "epoch": 1.276, + "grad_norm": 45.3951530456543, + "learning_rate": 7.450200000000001e-06, + "loss": 0.6886, + "step": 12760 + }, + { + "epoch": 1.2770000000000001, + "grad_norm": 235.92100524902344, + "learning_rate": 7.448200000000001e-06, + "loss": 0.5436, + "step": 12770 + }, + { + "epoch": 1.278, + "grad_norm": 15.398147583007812, + "learning_rate": 7.4462e-06, + "loss": 0.7053, + "step": 12780 + }, + { + "epoch": 1.279, + "grad_norm": 48.43658447265625, + "learning_rate": 7.444200000000001e-06, + "loss": 0.5759, + "step": 12790 + }, + { + "epoch": 1.28, + "grad_norm": 39.65992736816406, + "learning_rate": 7.4422000000000005e-06, + "loss": 0.699, + "step": 12800 + }, + { + "epoch": 1.2810000000000001, + "grad_norm": 20.03291893005371, + "learning_rate": 7.4402e-06, + "loss": 0.6684, + "step": 12810 + }, + { + "epoch": 1.282, + "grad_norm": 15.064152717590332, + "learning_rate": 7.438200000000001e-06, + "loss": 0.7231, + "step": 12820 + }, + { + "epoch": 1.283, + "grad_norm": 35.2615966796875, + "learning_rate": 7.4362e-06, + "loss": 0.7014, + "step": 12830 + }, + { + "epoch": 1.284, + "grad_norm": 19.830982208251953, + "learning_rate": 7.4342e-06, + "loss": 0.6552, + "step": 12840 + }, + { + "epoch": 1.285, + "grad_norm": 9.698526382446289, + "learning_rate": 7.432200000000001e-06, + "loss": 0.7807, + "step": 12850 + }, + { + "epoch": 1.286, + "grad_norm": 25.383045196533203, + "learning_rate": 7.4302000000000005e-06, + "loss": 0.6524, + "step": 12860 + }, + { + "epoch": 1.287, + "grad_norm": 26.713987350463867, + "learning_rate": 7.428200000000001e-06, + "loss": 0.6857, + "step": 12870 + }, + { + "epoch": 1.288, + "grad_norm": 8.724356651306152, + "learning_rate": 7.4262e-06, + "loss": 0.4876, + "step": 12880 + }, + { + "epoch": 1.289, + "grad_norm": 22.239267349243164, + "learning_rate": 7.4242e-06, + "loss": 0.6277, + "step": 12890 + }, + { + "epoch": 1.29, + "grad_norm": 28.160232543945312, + "learning_rate": 7.422200000000001e-06, + "loss": 0.6837, + "step": 12900 + }, + { + "epoch": 1.291, + "grad_norm": 30.82021713256836, + "learning_rate": 7.4202000000000005e-06, + "loss": 0.5708, + "step": 12910 + }, + { + "epoch": 1.292, + "grad_norm": 24.322532653808594, + "learning_rate": 7.418200000000001e-06, + "loss": 0.5548, + "step": 12920 + }, + { + "epoch": 1.293, + "grad_norm": 29.292064666748047, + "learning_rate": 7.4162e-06, + "loss": 0.8325, + "step": 12930 + }, + { + "epoch": 1.294, + "grad_norm": 33.295448303222656, + "learning_rate": 7.4142e-06, + "loss": 0.7986, + "step": 12940 + }, + { + "epoch": 1.295, + "grad_norm": 18.731597900390625, + "learning_rate": 7.412200000000001e-06, + "loss": 1.0747, + "step": 12950 + }, + { + "epoch": 1.296, + "grad_norm": 22.99925994873047, + "learning_rate": 7.410200000000001e-06, + "loss": 0.6548, + "step": 12960 + }, + { + "epoch": 1.297, + "grad_norm": 25.831560134887695, + "learning_rate": 7.408200000000001e-06, + "loss": 0.7995, + "step": 12970 + }, + { + "epoch": 1.298, + "grad_norm": 21.3696231842041, + "learning_rate": 7.4062e-06, + "loss": 0.8104, + "step": 12980 + }, + { + "epoch": 1.299, + "grad_norm": 29.745773315429688, + "learning_rate": 7.4042e-06, + "loss": 0.6324, + "step": 12990 + }, + { + "epoch": 1.3, + "grad_norm": 25.20359992980957, + "learning_rate": 7.402200000000001e-06, + "loss": 0.6775, + "step": 13000 + }, + { + "epoch": 1.301, + "grad_norm": 22.265050888061523, + "learning_rate": 7.400200000000001e-06, + "loss": 0.5683, + "step": 13010 + }, + { + "epoch": 1.302, + "grad_norm": 26.836889266967773, + "learning_rate": 7.398200000000001e-06, + "loss": 0.4955, + "step": 13020 + }, + { + "epoch": 1.303, + "grad_norm": 20.822649002075195, + "learning_rate": 7.396200000000001e-06, + "loss": 0.594, + "step": 13030 + }, + { + "epoch": 1.304, + "grad_norm": 5.273739814758301, + "learning_rate": 7.3942e-06, + "loss": 0.4596, + "step": 13040 + }, + { + "epoch": 1.305, + "grad_norm": 41.83501052856445, + "learning_rate": 7.3922e-06, + "loss": 0.5578, + "step": 13050 + }, + { + "epoch": 1.306, + "grad_norm": 20.89312171936035, + "learning_rate": 7.390200000000001e-06, + "loss": 0.5613, + "step": 13060 + }, + { + "epoch": 1.307, + "grad_norm": 39.89366149902344, + "learning_rate": 7.388200000000001e-06, + "loss": 0.9497, + "step": 13070 + }, + { + "epoch": 1.308, + "grad_norm": 37.19102478027344, + "learning_rate": 7.386200000000001e-06, + "loss": 0.5325, + "step": 13080 + }, + { + "epoch": 1.309, + "grad_norm": 3.879763603210449, + "learning_rate": 7.3842e-06, + "loss": 0.52, + "step": 13090 + }, + { + "epoch": 1.31, + "grad_norm": 21.704940795898438, + "learning_rate": 7.3822e-06, + "loss": 0.4688, + "step": 13100 + }, + { + "epoch": 1.311, + "grad_norm": 23.836936950683594, + "learning_rate": 7.380200000000001e-06, + "loss": 0.6162, + "step": 13110 + }, + { + "epoch": 1.312, + "grad_norm": 40.60661697387695, + "learning_rate": 7.378200000000001e-06, + "loss": 0.6149, + "step": 13120 + }, + { + "epoch": 1.313, + "grad_norm": 23.078330993652344, + "learning_rate": 7.3762000000000015e-06, + "loss": 0.6401, + "step": 13130 + }, + { + "epoch": 1.314, + "grad_norm": 26.6516170501709, + "learning_rate": 7.3742000000000005e-06, + "loss": 0.7671, + "step": 13140 + }, + { + "epoch": 1.315, + "grad_norm": 35.08804702758789, + "learning_rate": 7.3722e-06, + "loss": 0.5731, + "step": 13150 + }, + { + "epoch": 1.316, + "grad_norm": 32.42262268066406, + "learning_rate": 7.370200000000001e-06, + "loss": 0.9451, + "step": 13160 + }, + { + "epoch": 1.317, + "grad_norm": 11.495752334594727, + "learning_rate": 7.368200000000001e-06, + "loss": 0.8786, + "step": 13170 + }, + { + "epoch": 1.318, + "grad_norm": 22.15064811706543, + "learning_rate": 7.366200000000001e-06, + "loss": 0.5448, + "step": 13180 + }, + { + "epoch": 1.319, + "grad_norm": 28.169740676879883, + "learning_rate": 7.3642000000000006e-06, + "loss": 0.8506, + "step": 13190 + }, + { + "epoch": 1.32, + "grad_norm": 19.906099319458008, + "learning_rate": 7.3622000000000004e-06, + "loss": 0.8319, + "step": 13200 + }, + { + "epoch": 1.321, + "grad_norm": 30.913488388061523, + "learning_rate": 7.3602e-06, + "loss": 0.7243, + "step": 13210 + }, + { + "epoch": 1.322, + "grad_norm": 41.180477142333984, + "learning_rate": 7.358200000000001e-06, + "loss": 0.8648, + "step": 13220 + }, + { + "epoch": 1.323, + "grad_norm": 29.278974533081055, + "learning_rate": 7.356200000000001e-06, + "loss": 0.6343, + "step": 13230 + }, + { + "epoch": 1.324, + "grad_norm": 8.021432876586914, + "learning_rate": 7.3542e-06, + "loss": 0.5605, + "step": 13240 + }, + { + "epoch": 1.325, + "grad_norm": 32.5282096862793, + "learning_rate": 7.3522000000000005e-06, + "loss": 0.8554, + "step": 13250 + }, + { + "epoch": 1.326, + "grad_norm": 26.73030662536621, + "learning_rate": 7.3502e-06, + "loss": 0.6036, + "step": 13260 + }, + { + "epoch": 1.327, + "grad_norm": 35.48610305786133, + "learning_rate": 7.348200000000001e-06, + "loss": 0.786, + "step": 13270 + }, + { + "epoch": 1.328, + "grad_norm": 19.208602905273438, + "learning_rate": 7.346200000000001e-06, + "loss": 0.6422, + "step": 13280 + }, + { + "epoch": 1.329, + "grad_norm": 13.82779598236084, + "learning_rate": 7.3442e-06, + "loss": 0.6407, + "step": 13290 + }, + { + "epoch": 1.33, + "grad_norm": 33.58254623413086, + "learning_rate": 7.342200000000001e-06, + "loss": 0.7012, + "step": 13300 + }, + { + "epoch": 1.331, + "grad_norm": 16.39531707763672, + "learning_rate": 7.3402000000000005e-06, + "loss": 0.6333, + "step": 13310 + }, + { + "epoch": 1.332, + "grad_norm": 35.258243560791016, + "learning_rate": 7.338200000000001e-06, + "loss": 0.5802, + "step": 13320 + }, + { + "epoch": 1.333, + "grad_norm": 23.17426872253418, + "learning_rate": 7.336200000000001e-06, + "loss": 0.6976, + "step": 13330 + }, + { + "epoch": 1.334, + "grad_norm": 47.89413070678711, + "learning_rate": 7.3342e-06, + "loss": 0.6137, + "step": 13340 + }, + { + "epoch": 1.335, + "grad_norm": 26.017189025878906, + "learning_rate": 7.332200000000001e-06, + "loss": 0.7701, + "step": 13350 + }, + { + "epoch": 1.336, + "grad_norm": 6.929594039916992, + "learning_rate": 7.3302000000000006e-06, + "loss": 0.6573, + "step": 13360 + }, + { + "epoch": 1.337, + "grad_norm": 9.463047981262207, + "learning_rate": 7.328200000000001e-06, + "loss": 0.6186, + "step": 13370 + }, + { + "epoch": 1.338, + "grad_norm": 29.972517013549805, + "learning_rate": 7.326200000000001e-06, + "loss": 0.5866, + "step": 13380 + }, + { + "epoch": 1.339, + "grad_norm": 22.74357032775879, + "learning_rate": 7.3242e-06, + "loss": 0.8169, + "step": 13390 + }, + { + "epoch": 1.34, + "grad_norm": 28.037187576293945, + "learning_rate": 7.322200000000001e-06, + "loss": 0.566, + "step": 13400 + }, + { + "epoch": 1.341, + "grad_norm": 19.294530868530273, + "learning_rate": 7.320200000000001e-06, + "loss": 0.5276, + "step": 13410 + }, + { + "epoch": 1.342, + "grad_norm": 34.14975357055664, + "learning_rate": 7.3182000000000005e-06, + "loss": 0.96, + "step": 13420 + }, + { + "epoch": 1.343, + "grad_norm": 28.102813720703125, + "learning_rate": 7.316200000000001e-06, + "loss": 0.7187, + "step": 13430 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 24.984050750732422, + "learning_rate": 7.3142e-06, + "loss": 0.7285, + "step": 13440 + }, + { + "epoch": 1.345, + "grad_norm": 8.587176322937012, + "learning_rate": 7.3122e-06, + "loss": 0.6251, + "step": 13450 + }, + { + "epoch": 1.346, + "grad_norm": 48.03047561645508, + "learning_rate": 7.310200000000001e-06, + "loss": 0.6534, + "step": 13460 + }, + { + "epoch": 1.347, + "grad_norm": 41.015052795410156, + "learning_rate": 7.308200000000001e-06, + "loss": 0.638, + "step": 13470 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 53.580875396728516, + "learning_rate": 7.306200000000001e-06, + "loss": 0.5571, + "step": 13480 + }, + { + "epoch": 1.349, + "grad_norm": 41.472984313964844, + "learning_rate": 7.3042e-06, + "loss": 0.7513, + "step": 13490 + }, + { + "epoch": 1.35, + "grad_norm": 23.2569580078125, + "learning_rate": 7.3022e-06, + "loss": 0.5893, + "step": 13500 + }, + { + "epoch": 1.351, + "grad_norm": 39.171356201171875, + "learning_rate": 7.300200000000001e-06, + "loss": 0.7546, + "step": 13510 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 23.469484329223633, + "learning_rate": 7.298200000000001e-06, + "loss": 0.5874, + "step": 13520 + }, + { + "epoch": 1.353, + "grad_norm": 22.72773551940918, + "learning_rate": 7.296200000000001e-06, + "loss": 0.5501, + "step": 13530 + }, + { + "epoch": 1.354, + "grad_norm": 27.632083892822266, + "learning_rate": 7.2942e-06, + "loss": 0.6415, + "step": 13540 + }, + { + "epoch": 1.355, + "grad_norm": 14.760106086730957, + "learning_rate": 7.2922e-06, + "loss": 0.5344, + "step": 13550 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 48.3216667175293, + "learning_rate": 7.290200000000001e-06, + "loss": 0.6311, + "step": 13560 + }, + { + "epoch": 1.357, + "grad_norm": 34.64678192138672, + "learning_rate": 7.288200000000001e-06, + "loss": 0.6622, + "step": 13570 + }, + { + "epoch": 1.358, + "grad_norm": 16.70772361755371, + "learning_rate": 7.286200000000001e-06, + "loss": 0.6337, + "step": 13580 + }, + { + "epoch": 1.359, + "grad_norm": 10.013886451721191, + "learning_rate": 7.2842000000000005e-06, + "loss": 0.5636, + "step": 13590 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 57.176029205322266, + "learning_rate": 7.2822e-06, + "loss": 0.5612, + "step": 13600 + }, + { + "epoch": 1.361, + "grad_norm": 56.01203536987305, + "learning_rate": 7.2802e-06, + "loss": 0.6763, + "step": 13610 + }, + { + "epoch": 1.362, + "grad_norm": 20.71746253967285, + "learning_rate": 7.278200000000001e-06, + "loss": 0.8182, + "step": 13620 + }, + { + "epoch": 1.363, + "grad_norm": 23.728199005126953, + "learning_rate": 7.276200000000001e-06, + "loss": 0.6961, + "step": 13630 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 39.34861755371094, + "learning_rate": 7.2742e-06, + "loss": 0.4997, + "step": 13640 + }, + { + "epoch": 1.365, + "grad_norm": 35.87208557128906, + "learning_rate": 7.2722000000000004e-06, + "loss": 0.8257, + "step": 13650 + }, + { + "epoch": 1.366, + "grad_norm": 13.711315155029297, + "learning_rate": 7.2702e-06, + "loss": 0.6184, + "step": 13660 + }, + { + "epoch": 1.367, + "grad_norm": 22.51475715637207, + "learning_rate": 7.268200000000001e-06, + "loss": 0.8108, + "step": 13670 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 33.50455093383789, + "learning_rate": 7.266200000000001e-06, + "loss": 0.8212, + "step": 13680 + }, + { + "epoch": 1.369, + "grad_norm": 19.336406707763672, + "learning_rate": 7.2642e-06, + "loss": 0.4771, + "step": 13690 + }, + { + "epoch": 1.37, + "grad_norm": 24.16560935974121, + "learning_rate": 7.2622000000000005e-06, + "loss": 0.4932, + "step": 13700 + }, + { + "epoch": 1.371, + "grad_norm": 16.50635528564453, + "learning_rate": 7.2602e-06, + "loss": 0.7304, + "step": 13710 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 18.5286865234375, + "learning_rate": 7.258200000000001e-06, + "loss": 0.5905, + "step": 13720 + }, + { + "epoch": 1.373, + "grad_norm": 23.307600021362305, + "learning_rate": 7.256200000000001e-06, + "loss": 0.8059, + "step": 13730 + }, + { + "epoch": 1.374, + "grad_norm": 43.4228515625, + "learning_rate": 7.2542e-06, + "loss": 0.6437, + "step": 13740 + }, + { + "epoch": 1.375, + "grad_norm": 25.997020721435547, + "learning_rate": 7.252200000000001e-06, + "loss": 0.7291, + "step": 13750 + }, + { + "epoch": 1.376, + "grad_norm": 9.636469841003418, + "learning_rate": 7.2502000000000005e-06, + "loss": 0.6815, + "step": 13760 + }, + { + "epoch": 1.377, + "grad_norm": 19.01958465576172, + "learning_rate": 7.248200000000001e-06, + "loss": 0.6909, + "step": 13770 + }, + { + "epoch": 1.3780000000000001, + "grad_norm": 12.587032318115234, + "learning_rate": 7.246200000000001e-06, + "loss": 0.4865, + "step": 13780 + }, + { + "epoch": 1.379, + "grad_norm": 14.733444213867188, + "learning_rate": 7.2442e-06, + "loss": 0.5581, + "step": 13790 + }, + { + "epoch": 1.38, + "grad_norm": 13.286008834838867, + "learning_rate": 7.242200000000001e-06, + "loss": 0.7601, + "step": 13800 + }, + { + "epoch": 1.381, + "grad_norm": 55.66062927246094, + "learning_rate": 7.240200000000001e-06, + "loss": 0.7174, + "step": 13810 + }, + { + "epoch": 1.3820000000000001, + "grad_norm": 12.686264991760254, + "learning_rate": 7.2382000000000004e-06, + "loss": 0.7169, + "step": 13820 + }, + { + "epoch": 1.383, + "grad_norm": 33.9859619140625, + "learning_rate": 7.236200000000001e-06, + "loss": 0.6755, + "step": 13830 + }, + { + "epoch": 1.384, + "grad_norm": 24.982358932495117, + "learning_rate": 7.2342e-06, + "loss": 0.631, + "step": 13840 + }, + { + "epoch": 1.385, + "grad_norm": 29.842527389526367, + "learning_rate": 7.2322e-06, + "loss": 0.7629, + "step": 13850 + }, + { + "epoch": 1.3860000000000001, + "grad_norm": 36.99466323852539, + "learning_rate": 7.230200000000001e-06, + "loss": 0.978, + "step": 13860 + }, + { + "epoch": 1.387, + "grad_norm": 30.951824188232422, + "learning_rate": 7.2282000000000005e-06, + "loss": 0.5872, + "step": 13870 + }, + { + "epoch": 1.388, + "grad_norm": 2.009528398513794, + "learning_rate": 7.226200000000001e-06, + "loss": 0.5371, + "step": 13880 + }, + { + "epoch": 1.389, + "grad_norm": 21.285736083984375, + "learning_rate": 7.2242e-06, + "loss": 0.6345, + "step": 13890 + }, + { + "epoch": 1.3900000000000001, + "grad_norm": 29.036205291748047, + "learning_rate": 7.2222e-06, + "loss": 0.6506, + "step": 13900 + }, + { + "epoch": 1.391, + "grad_norm": 32.15199661254883, + "learning_rate": 7.220200000000001e-06, + "loss": 0.7098, + "step": 13910 + }, + { + "epoch": 1.392, + "grad_norm": 56.528621673583984, + "learning_rate": 7.218200000000001e-06, + "loss": 0.7811, + "step": 13920 + }, + { + "epoch": 1.393, + "grad_norm": 32.981815338134766, + "learning_rate": 7.216200000000001e-06, + "loss": 0.6597, + "step": 13930 + }, + { + "epoch": 1.3940000000000001, + "grad_norm": 26.504575729370117, + "learning_rate": 7.2142e-06, + "loss": 0.5865, + "step": 13940 + }, + { + "epoch": 1.395, + "grad_norm": 43.711788177490234, + "learning_rate": 7.2122e-06, + "loss": 0.734, + "step": 13950 + }, + { + "epoch": 1.396, + "grad_norm": 18.780986785888672, + "learning_rate": 7.210200000000001e-06, + "loss": 0.4507, + "step": 13960 + }, + { + "epoch": 1.397, + "grad_norm": 31.044410705566406, + "learning_rate": 7.208200000000001e-06, + "loss": 0.3073, + "step": 13970 + }, + { + "epoch": 1.3980000000000001, + "grad_norm": 23.003955841064453, + "learning_rate": 7.2062000000000006e-06, + "loss": 0.3738, + "step": 13980 + }, + { + "epoch": 1.399, + "grad_norm": 39.76530075073242, + "learning_rate": 7.2042e-06, + "loss": 0.6692, + "step": 13990 + }, + { + "epoch": 1.4, + "grad_norm": 18.235315322875977, + "learning_rate": 7.2022e-06, + "loss": 0.6609, + "step": 14000 + }, + { + "epoch": 1.401, + "grad_norm": 38.41657257080078, + "learning_rate": 7.2002e-06, + "loss": 0.8216, + "step": 14010 + }, + { + "epoch": 1.4020000000000001, + "grad_norm": 3.7093570232391357, + "learning_rate": 7.198200000000001e-06, + "loss": 0.4728, + "step": 14020 + }, + { + "epoch": 1.403, + "grad_norm": 7.972429275512695, + "learning_rate": 7.196200000000001e-06, + "loss": 0.788, + "step": 14030 + }, + { + "epoch": 1.404, + "grad_norm": 30.618633270263672, + "learning_rate": 7.1942e-06, + "loss": 0.7229, + "step": 14040 + }, + { + "epoch": 1.405, + "grad_norm": 20.31015396118164, + "learning_rate": 7.1922e-06, + "loss": 0.679, + "step": 14050 + }, + { + "epoch": 1.4060000000000001, + "grad_norm": 33.285003662109375, + "learning_rate": 7.1902e-06, + "loss": 0.6957, + "step": 14060 + }, + { + "epoch": 1.407, + "grad_norm": 17.131729125976562, + "learning_rate": 7.188200000000001e-06, + "loss": 0.4287, + "step": 14070 + }, + { + "epoch": 1.408, + "grad_norm": 37.19723892211914, + "learning_rate": 7.186200000000001e-06, + "loss": 0.7653, + "step": 14080 + }, + { + "epoch": 1.409, + "grad_norm": 24.82241439819336, + "learning_rate": 7.1842e-06, + "loss": 0.7721, + "step": 14090 + }, + { + "epoch": 1.41, + "grad_norm": 41.999305725097656, + "learning_rate": 7.1822000000000005e-06, + "loss": 0.3842, + "step": 14100 + }, + { + "epoch": 1.411, + "grad_norm": 26.453805923461914, + "learning_rate": 7.1802e-06, + "loss": 0.8213, + "step": 14110 + }, + { + "epoch": 1.412, + "grad_norm": 29.831308364868164, + "learning_rate": 7.178200000000001e-06, + "loss": 0.965, + "step": 14120 + }, + { + "epoch": 1.413, + "grad_norm": 35.91419219970703, + "learning_rate": 7.176200000000001e-06, + "loss": 0.863, + "step": 14130 + }, + { + "epoch": 1.414, + "grad_norm": 17.73390769958496, + "learning_rate": 7.1742e-06, + "loss": 0.5709, + "step": 14140 + }, + { + "epoch": 1.415, + "grad_norm": 7.556557655334473, + "learning_rate": 7.1722000000000006e-06, + "loss": 0.4825, + "step": 14150 + }, + { + "epoch": 1.416, + "grad_norm": 31.779680252075195, + "learning_rate": 7.1702e-06, + "loss": 0.8153, + "step": 14160 + }, + { + "epoch": 1.417, + "grad_norm": 10.960797309875488, + "learning_rate": 7.168200000000001e-06, + "loss": 0.595, + "step": 14170 + }, + { + "epoch": 1.418, + "grad_norm": 8.104966163635254, + "learning_rate": 7.166400000000001e-06, + "loss": 0.6693, + "step": 14180 + }, + { + "epoch": 1.419, + "grad_norm": 26.086509704589844, + "learning_rate": 7.1644e-06, + "loss": 0.6027, + "step": 14190 + }, + { + "epoch": 1.42, + "grad_norm": 25.449806213378906, + "learning_rate": 7.1624e-06, + "loss": 0.557, + "step": 14200 + }, + { + "epoch": 1.421, + "grad_norm": 32.112892150878906, + "learning_rate": 7.160400000000001e-06, + "loss": 0.7366, + "step": 14210 + }, + { + "epoch": 1.422, + "grad_norm": 28.12934684753418, + "learning_rate": 7.158400000000001e-06, + "loss": 0.8341, + "step": 14220 + }, + { + "epoch": 1.423, + "grad_norm": 30.7032413482666, + "learning_rate": 7.1564000000000015e-06, + "loss": 0.6367, + "step": 14230 + }, + { + "epoch": 1.424, + "grad_norm": 16.595033645629883, + "learning_rate": 7.1544000000000005e-06, + "loss": 0.6789, + "step": 14240 + }, + { + "epoch": 1.425, + "grad_norm": 31.310976028442383, + "learning_rate": 7.1524e-06, + "loss": 0.6617, + "step": 14250 + }, + { + "epoch": 1.426, + "grad_norm": 19.14084815979004, + "learning_rate": 7.150400000000001e-06, + "loss": 0.5745, + "step": 14260 + }, + { + "epoch": 1.427, + "grad_norm": 49.51873779296875, + "learning_rate": 7.148400000000001e-06, + "loss": 0.8873, + "step": 14270 + }, + { + "epoch": 1.428, + "grad_norm": 34.733726501464844, + "learning_rate": 7.146400000000001e-06, + "loss": 0.6153, + "step": 14280 + }, + { + "epoch": 1.429, + "grad_norm": 38.35405731201172, + "learning_rate": 7.144400000000001e-06, + "loss": 0.3432, + "step": 14290 + }, + { + "epoch": 1.43, + "grad_norm": 28.54090118408203, + "learning_rate": 7.1424000000000004e-06, + "loss": 0.6794, + "step": 14300 + }, + { + "epoch": 1.431, + "grad_norm": 156.63278198242188, + "learning_rate": 7.1404e-06, + "loss": 0.6149, + "step": 14310 + }, + { + "epoch": 1.432, + "grad_norm": 44.73384094238281, + "learning_rate": 7.138400000000001e-06, + "loss": 0.7077, + "step": 14320 + }, + { + "epoch": 1.433, + "grad_norm": 32.4652099609375, + "learning_rate": 7.136400000000001e-06, + "loss": 0.5943, + "step": 14330 + }, + { + "epoch": 1.434, + "grad_norm": 15.920811653137207, + "learning_rate": 7.1344e-06, + "loss": 0.5748, + "step": 14340 + }, + { + "epoch": 1.435, + "grad_norm": 14.366812705993652, + "learning_rate": 7.1324000000000005e-06, + "loss": 0.7162, + "step": 14350 + }, + { + "epoch": 1.436, + "grad_norm": 20.413108825683594, + "learning_rate": 7.1304e-06, + "loss": 0.7665, + "step": 14360 + }, + { + "epoch": 1.437, + "grad_norm": 35.08794021606445, + "learning_rate": 7.128400000000001e-06, + "loss": 0.7175, + "step": 14370 + }, + { + "epoch": 1.438, + "grad_norm": 58.03242492675781, + "learning_rate": 7.126400000000001e-06, + "loss": 0.952, + "step": 14380 + }, + { + "epoch": 1.439, + "grad_norm": 24.53120231628418, + "learning_rate": 7.1244e-06, + "loss": 0.7022, + "step": 14390 + }, + { + "epoch": 1.44, + "grad_norm": 33.467071533203125, + "learning_rate": 7.122400000000001e-06, + "loss": 0.6428, + "step": 14400 + }, + { + "epoch": 1.441, + "grad_norm": 37.66570281982422, + "learning_rate": 7.1204000000000005e-06, + "loss": 0.8671, + "step": 14410 + }, + { + "epoch": 1.442, + "grad_norm": 31.30205535888672, + "learning_rate": 7.118400000000001e-06, + "loss": 0.5898, + "step": 14420 + }, + { + "epoch": 1.443, + "grad_norm": 31.95399284362793, + "learning_rate": 7.116400000000001e-06, + "loss": 0.6795, + "step": 14430 + }, + { + "epoch": 1.444, + "grad_norm": 32.41059494018555, + "learning_rate": 7.1144e-06, + "loss": 0.4456, + "step": 14440 + }, + { + "epoch": 1.445, + "grad_norm": 25.59587287902832, + "learning_rate": 7.112400000000001e-06, + "loss": 0.7705, + "step": 14450 + }, + { + "epoch": 1.446, + "grad_norm": 37.86859893798828, + "learning_rate": 7.1104000000000006e-06, + "loss": 0.6855, + "step": 14460 + }, + { + "epoch": 1.447, + "grad_norm": 7.653895378112793, + "learning_rate": 7.108400000000001e-06, + "loss": 0.7096, + "step": 14470 + }, + { + "epoch": 1.448, + "grad_norm": 34.88923263549805, + "learning_rate": 7.106400000000001e-06, + "loss": 0.6658, + "step": 14480 + }, + { + "epoch": 1.449, + "grad_norm": 27.103458404541016, + "learning_rate": 7.1044e-06, + "loss": 0.7682, + "step": 14490 + }, + { + "epoch": 1.45, + "grad_norm": 34.40449905395508, + "learning_rate": 7.102400000000001e-06, + "loss": 0.8501, + "step": 14500 + }, + { + "epoch": 1.451, + "grad_norm": 52.43901824951172, + "learning_rate": 7.100400000000001e-06, + "loss": 0.5298, + "step": 14510 + }, + { + "epoch": 1.452, + "grad_norm": 36.47237014770508, + "learning_rate": 7.0984000000000005e-06, + "loss": 0.9262, + "step": 14520 + }, + { + "epoch": 1.453, + "grad_norm": 33.24127197265625, + "learning_rate": 7.096400000000001e-06, + "loss": 0.7582, + "step": 14530 + }, + { + "epoch": 1.454, + "grad_norm": 27.120197296142578, + "learning_rate": 7.0944e-06, + "loss": 0.7854, + "step": 14540 + }, + { + "epoch": 1.455, + "grad_norm": 9.86540699005127, + "learning_rate": 7.0924e-06, + "loss": 0.8409, + "step": 14550 + }, + { + "epoch": 1.456, + "grad_norm": 17.383777618408203, + "learning_rate": 7.090400000000001e-06, + "loss": 0.4384, + "step": 14560 + }, + { + "epoch": 1.457, + "grad_norm": 28.551664352416992, + "learning_rate": 7.088400000000001e-06, + "loss": 0.8198, + "step": 14570 + }, + { + "epoch": 1.458, + "grad_norm": 27.755903244018555, + "learning_rate": 7.086400000000001e-06, + "loss": 0.6956, + "step": 14580 + }, + { + "epoch": 1.459, + "grad_norm": 16.9943904876709, + "learning_rate": 7.0844e-06, + "loss": 0.6524, + "step": 14590 + }, + { + "epoch": 1.46, + "grad_norm": 19.76603889465332, + "learning_rate": 7.0824e-06, + "loss": 0.5919, + "step": 14600 + }, + { + "epoch": 1.461, + "grad_norm": 4.806751728057861, + "learning_rate": 7.080400000000001e-06, + "loss": 0.6876, + "step": 14610 + }, + { + "epoch": 1.462, + "grad_norm": 18.323942184448242, + "learning_rate": 7.078400000000001e-06, + "loss": 0.8455, + "step": 14620 + }, + { + "epoch": 1.463, + "grad_norm": 23.91664695739746, + "learning_rate": 7.076400000000001e-06, + "loss": 0.7989, + "step": 14630 + }, + { + "epoch": 1.464, + "grad_norm": 12.201299667358398, + "learning_rate": 7.0744e-06, + "loss": 0.7662, + "step": 14640 + }, + { + "epoch": 1.465, + "grad_norm": 20.28821563720703, + "learning_rate": 7.0724e-06, + "loss": 0.6595, + "step": 14650 + }, + { + "epoch": 1.466, + "grad_norm": 48.507930755615234, + "learning_rate": 7.070400000000001e-06, + "loss": 0.4343, + "step": 14660 + }, + { + "epoch": 1.467, + "grad_norm": 54.569313049316406, + "learning_rate": 7.068400000000001e-06, + "loss": 0.615, + "step": 14670 + }, + { + "epoch": 1.468, + "grad_norm": 22.862001419067383, + "learning_rate": 7.066400000000001e-06, + "loss": 0.7374, + "step": 14680 + }, + { + "epoch": 1.4689999999999999, + "grad_norm": 23.220779418945312, + "learning_rate": 7.0644000000000005e-06, + "loss": 0.7556, + "step": 14690 + }, + { + "epoch": 1.47, + "grad_norm": 16.430978775024414, + "learning_rate": 7.0624e-06, + "loss": 0.6466, + "step": 14700 + }, + { + "epoch": 1.471, + "grad_norm": 40.130470275878906, + "learning_rate": 7.0604e-06, + "loss": 0.798, + "step": 14710 + }, + { + "epoch": 1.472, + "grad_norm": 31.032583236694336, + "learning_rate": 7.058400000000001e-06, + "loss": 0.7023, + "step": 14720 + }, + { + "epoch": 1.4729999999999999, + "grad_norm": 33.947750091552734, + "learning_rate": 7.056400000000001e-06, + "loss": 0.7844, + "step": 14730 + }, + { + "epoch": 1.474, + "grad_norm": 20.618391036987305, + "learning_rate": 7.0544e-06, + "loss": 0.8526, + "step": 14740 + }, + { + "epoch": 1.475, + "grad_norm": 30.07219886779785, + "learning_rate": 7.0524000000000004e-06, + "loss": 0.6605, + "step": 14750 + }, + { + "epoch": 1.476, + "grad_norm": 37.258087158203125, + "learning_rate": 7.0504e-06, + "loss": 0.6999, + "step": 14760 + }, + { + "epoch": 1.4769999999999999, + "grad_norm": 36.73057556152344, + "learning_rate": 7.048400000000001e-06, + "loss": 0.5431, + "step": 14770 + }, + { + "epoch": 1.478, + "grad_norm": 45.83023452758789, + "learning_rate": 7.046400000000001e-06, + "loss": 1.0078, + "step": 14780 + }, + { + "epoch": 1.479, + "grad_norm": 29.41172981262207, + "learning_rate": 7.0444e-06, + "loss": 0.5701, + "step": 14790 + }, + { + "epoch": 1.48, + "grad_norm": 3.402508497238159, + "learning_rate": 7.0424000000000005e-06, + "loss": 0.4707, + "step": 14800 + }, + { + "epoch": 1.4809999999999999, + "grad_norm": 24.081668853759766, + "learning_rate": 7.0404e-06, + "loss": 0.6248, + "step": 14810 + }, + { + "epoch": 1.482, + "grad_norm": 43.570091247558594, + "learning_rate": 7.038400000000001e-06, + "loss": 0.676, + "step": 14820 + }, + { + "epoch": 1.483, + "grad_norm": 36.47454833984375, + "learning_rate": 7.036400000000001e-06, + "loss": 0.6858, + "step": 14830 + }, + { + "epoch": 1.484, + "grad_norm": 29.922746658325195, + "learning_rate": 7.0344e-06, + "loss": 0.5933, + "step": 14840 + }, + { + "epoch": 1.4849999999999999, + "grad_norm": 8.109268188476562, + "learning_rate": 7.032400000000001e-06, + "loss": 0.6108, + "step": 14850 + }, + { + "epoch": 1.486, + "grad_norm": 7.4250264167785645, + "learning_rate": 7.0304000000000005e-06, + "loss": 0.6123, + "step": 14860 + }, + { + "epoch": 1.487, + "grad_norm": 9.724357604980469, + "learning_rate": 7.028400000000001e-06, + "loss": 0.6254, + "step": 14870 + }, + { + "epoch": 1.488, + "grad_norm": 7.028298377990723, + "learning_rate": 7.026400000000001e-06, + "loss": 0.6052, + "step": 14880 + }, + { + "epoch": 1.4889999999999999, + "grad_norm": 31.92440414428711, + "learning_rate": 7.0244e-06, + "loss": 0.8187, + "step": 14890 + }, + { + "epoch": 1.49, + "grad_norm": 15.212546348571777, + "learning_rate": 7.022400000000001e-06, + "loss": 0.7831, + "step": 14900 + }, + { + "epoch": 1.491, + "grad_norm": 41.331748962402344, + "learning_rate": 7.020400000000001e-06, + "loss": 0.6803, + "step": 14910 + }, + { + "epoch": 1.492, + "grad_norm": 29.76875877380371, + "learning_rate": 7.0184000000000004e-06, + "loss": 0.9259, + "step": 14920 + }, + { + "epoch": 1.4929999999999999, + "grad_norm": 23.765771865844727, + "learning_rate": 7.016400000000001e-06, + "loss": 0.7033, + "step": 14930 + }, + { + "epoch": 1.494, + "grad_norm": 42.97665786743164, + "learning_rate": 7.0144e-06, + "loss": 0.8351, + "step": 14940 + }, + { + "epoch": 1.495, + "grad_norm": 22.49999237060547, + "learning_rate": 7.0124e-06, + "loss": 0.9859, + "step": 14950 + }, + { + "epoch": 1.496, + "grad_norm": 25.587005615234375, + "learning_rate": 7.010400000000001e-06, + "loss": 0.73, + "step": 14960 + }, + { + "epoch": 1.4969999999999999, + "grad_norm": 24.11408805847168, + "learning_rate": 7.0084000000000005e-06, + "loss": 0.6629, + "step": 14970 + }, + { + "epoch": 1.498, + "grad_norm": 33.97454833984375, + "learning_rate": 7.006400000000001e-06, + "loss": 0.7687, + "step": 14980 + }, + { + "epoch": 1.499, + "grad_norm": 27.10246467590332, + "learning_rate": 7.0044e-06, + "loss": 0.5656, + "step": 14990 + }, + { + "epoch": 1.5, + "grad_norm": 31.772153854370117, + "learning_rate": 7.0024e-06, + "loss": 0.5767, + "step": 15000 + }, + { + "epoch": 1.501, + "grad_norm": 32.298919677734375, + "learning_rate": 7.000400000000001e-06, + "loss": 0.6371, + "step": 15010 + }, + { + "epoch": 1.502, + "grad_norm": 41.70977020263672, + "learning_rate": 6.998400000000001e-06, + "loss": 0.8802, + "step": 15020 + }, + { + "epoch": 1.5030000000000001, + "grad_norm": 5.668227672576904, + "learning_rate": 6.996400000000001e-06, + "loss": 0.5482, + "step": 15030 + }, + { + "epoch": 1.504, + "grad_norm": 30.462928771972656, + "learning_rate": 6.9944e-06, + "loss": 0.5476, + "step": 15040 + }, + { + "epoch": 1.505, + "grad_norm": 17.560972213745117, + "learning_rate": 6.9924e-06, + "loss": 0.473, + "step": 15050 + }, + { + "epoch": 1.506, + "grad_norm": 7.06756067276001, + "learning_rate": 6.990400000000001e-06, + "loss": 0.4892, + "step": 15060 + }, + { + "epoch": 1.5070000000000001, + "grad_norm": 14.195466995239258, + "learning_rate": 6.988400000000001e-06, + "loss": 0.6969, + "step": 15070 + }, + { + "epoch": 1.508, + "grad_norm": 32.51763153076172, + "learning_rate": 6.9864000000000006e-06, + "loss": 0.6478, + "step": 15080 + }, + { + "epoch": 1.509, + "grad_norm": 34.5938606262207, + "learning_rate": 6.9844e-06, + "loss": 0.7144, + "step": 15090 + }, + { + "epoch": 1.51, + "grad_norm": 29.059711456298828, + "learning_rate": 6.9824e-06, + "loss": 0.3797, + "step": 15100 + }, + { + "epoch": 1.5110000000000001, + "grad_norm": 42.22810363769531, + "learning_rate": 6.9804e-06, + "loss": 0.7841, + "step": 15110 + }, + { + "epoch": 1.512, + "grad_norm": 15.14452075958252, + "learning_rate": 6.978400000000001e-06, + "loss": 0.9623, + "step": 15120 + }, + { + "epoch": 1.513, + "grad_norm": 26.8512020111084, + "learning_rate": 6.976400000000001e-06, + "loss": 0.5488, + "step": 15130 + }, + { + "epoch": 1.514, + "grad_norm": 12.91208267211914, + "learning_rate": 6.9744e-06, + "loss": 0.6394, + "step": 15140 + }, + { + "epoch": 1.5150000000000001, + "grad_norm": 19.62507438659668, + "learning_rate": 6.9724e-06, + "loss": 0.4828, + "step": 15150 + }, + { + "epoch": 1.516, + "grad_norm": 40.00102615356445, + "learning_rate": 6.9704e-06, + "loss": 0.4805, + "step": 15160 + }, + { + "epoch": 1.517, + "grad_norm": 68.10365295410156, + "learning_rate": 6.968400000000001e-06, + "loss": 0.583, + "step": 15170 + }, + { + "epoch": 1.518, + "grad_norm": 22.018186569213867, + "learning_rate": 6.966400000000001e-06, + "loss": 0.8632, + "step": 15180 + }, + { + "epoch": 1.5190000000000001, + "grad_norm": 18.3768253326416, + "learning_rate": 6.9644e-06, + "loss": 0.4432, + "step": 15190 + }, + { + "epoch": 1.52, + "grad_norm": 18.584394454956055, + "learning_rate": 6.9624000000000005e-06, + "loss": 0.4675, + "step": 15200 + }, + { + "epoch": 1.521, + "grad_norm": 40.143768310546875, + "learning_rate": 6.9604e-06, + "loss": 0.9586, + "step": 15210 + }, + { + "epoch": 1.522, + "grad_norm": 18.579164505004883, + "learning_rate": 6.958400000000001e-06, + "loss": 0.3957, + "step": 15220 + }, + { + "epoch": 1.5230000000000001, + "grad_norm": 18.15229034423828, + "learning_rate": 6.956400000000001e-06, + "loss": 0.7275, + "step": 15230 + }, + { + "epoch": 1.524, + "grad_norm": 17.880271911621094, + "learning_rate": 6.9544e-06, + "loss": 0.7367, + "step": 15240 + }, + { + "epoch": 1.525, + "grad_norm": 5.8904266357421875, + "learning_rate": 6.9524000000000006e-06, + "loss": 0.7471, + "step": 15250 + }, + { + "epoch": 1.526, + "grad_norm": 28.13979721069336, + "learning_rate": 6.9504e-06, + "loss": 0.8176, + "step": 15260 + }, + { + "epoch": 1.5270000000000001, + "grad_norm": 13.52975082397461, + "learning_rate": 6.948400000000001e-06, + "loss": 0.587, + "step": 15270 + }, + { + "epoch": 1.528, + "grad_norm": 34.50688934326172, + "learning_rate": 6.946400000000001e-06, + "loss": 0.3854, + "step": 15280 + }, + { + "epoch": 1.529, + "grad_norm": 26.554487228393555, + "learning_rate": 6.9444e-06, + "loss": 0.5158, + "step": 15290 + }, + { + "epoch": 1.53, + "grad_norm": 4.872733116149902, + "learning_rate": 6.942400000000001e-06, + "loss": 0.5621, + "step": 15300 + }, + { + "epoch": 1.5310000000000001, + "grad_norm": 47.18104934692383, + "learning_rate": 6.9404000000000005e-06, + "loss": 0.5125, + "step": 15310 + }, + { + "epoch": 1.532, + "grad_norm": 26.80068588256836, + "learning_rate": 6.9384e-06, + "loss": 0.5913, + "step": 15320 + }, + { + "epoch": 1.533, + "grad_norm": 39.93467712402344, + "learning_rate": 6.936400000000001e-06, + "loss": 0.5561, + "step": 15330 + }, + { + "epoch": 1.534, + "grad_norm": 3.488877534866333, + "learning_rate": 6.9344e-06, + "loss": 0.6594, + "step": 15340 + }, + { + "epoch": 1.5350000000000001, + "grad_norm": 19.96973419189453, + "learning_rate": 6.9324e-06, + "loss": 0.7096, + "step": 15350 + }, + { + "epoch": 1.536, + "grad_norm": 32.5767936706543, + "learning_rate": 6.930600000000001e-06, + "loss": 0.6651, + "step": 15360 + }, + { + "epoch": 1.537, + "grad_norm": 22.27090835571289, + "learning_rate": 6.928600000000001e-06, + "loss": 0.6513, + "step": 15370 + }, + { + "epoch": 1.538, + "grad_norm": 57.235572814941406, + "learning_rate": 6.926600000000001e-06, + "loss": 0.8128, + "step": 15380 + }, + { + "epoch": 1.5390000000000001, + "grad_norm": 26.9422664642334, + "learning_rate": 6.924600000000001e-06, + "loss": 0.4483, + "step": 15390 + }, + { + "epoch": 1.54, + "grad_norm": 10.947314262390137, + "learning_rate": 6.9226000000000004e-06, + "loss": 0.6516, + "step": 15400 + }, + { + "epoch": 1.541, + "grad_norm": 48.50078201293945, + "learning_rate": 6.9206e-06, + "loss": 0.8598, + "step": 15410 + }, + { + "epoch": 1.542, + "grad_norm": 26.725366592407227, + "learning_rate": 6.918600000000001e-06, + "loss": 0.7719, + "step": 15420 + }, + { + "epoch": 1.5430000000000001, + "grad_norm": 23.61861228942871, + "learning_rate": 6.916600000000001e-06, + "loss": 0.9362, + "step": 15430 + }, + { + "epoch": 1.544, + "grad_norm": 24.58664894104004, + "learning_rate": 6.9146e-06, + "loss": 0.7702, + "step": 15440 + }, + { + "epoch": 1.545, + "grad_norm": 16.35002899169922, + "learning_rate": 6.9126000000000005e-06, + "loss": 0.5286, + "step": 15450 + }, + { + "epoch": 1.546, + "grad_norm": 7.408541202545166, + "learning_rate": 6.9106e-06, + "loss": 0.6786, + "step": 15460 + }, + { + "epoch": 1.5470000000000002, + "grad_norm": 29.840011596679688, + "learning_rate": 6.908600000000001e-06, + "loss": 0.7321, + "step": 15470 + }, + { + "epoch": 1.548, + "grad_norm": 24.454252243041992, + "learning_rate": 6.906600000000001e-06, + "loss": 0.7347, + "step": 15480 + }, + { + "epoch": 1.549, + "grad_norm": 8.857321739196777, + "learning_rate": 6.9046e-06, + "loss": 0.7328, + "step": 15490 + }, + { + "epoch": 1.55, + "grad_norm": 21.619884490966797, + "learning_rate": 6.902600000000001e-06, + "loss": 0.7709, + "step": 15500 + }, + { + "epoch": 1.5510000000000002, + "grad_norm": 26.981609344482422, + "learning_rate": 6.9006000000000005e-06, + "loss": 0.7383, + "step": 15510 + }, + { + "epoch": 1.552, + "grad_norm": 24.25237274169922, + "learning_rate": 6.898600000000001e-06, + "loss": 0.582, + "step": 15520 + }, + { + "epoch": 1.553, + "grad_norm": 20.094484329223633, + "learning_rate": 6.896600000000001e-06, + "loss": 0.8337, + "step": 15530 + }, + { + "epoch": 1.554, + "grad_norm": 33.374324798583984, + "learning_rate": 6.8946e-06, + "loss": 0.634, + "step": 15540 + }, + { + "epoch": 1.5550000000000002, + "grad_norm": 23.03681182861328, + "learning_rate": 6.892600000000001e-06, + "loss": 0.4505, + "step": 15550 + }, + { + "epoch": 1.556, + "grad_norm": 36.90275955200195, + "learning_rate": 6.8906000000000006e-06, + "loss": 0.8079, + "step": 15560 + }, + { + "epoch": 1.557, + "grad_norm": 21.511165618896484, + "learning_rate": 6.8886e-06, + "loss": 0.6231, + "step": 15570 + }, + { + "epoch": 1.558, + "grad_norm": 27.871326446533203, + "learning_rate": 6.886600000000001e-06, + "loss": 0.5137, + "step": 15580 + }, + { + "epoch": 1.5590000000000002, + "grad_norm": 32.82701873779297, + "learning_rate": 6.8846e-06, + "loss": 0.8568, + "step": 15590 + }, + { + "epoch": 1.56, + "grad_norm": 7.283423900604248, + "learning_rate": 6.8826e-06, + "loss": 0.6777, + "step": 15600 + }, + { + "epoch": 1.561, + "grad_norm": 37.66361999511719, + "learning_rate": 6.880600000000001e-06, + "loss": 0.3779, + "step": 15610 + }, + { + "epoch": 1.562, + "grad_norm": 29.52036476135254, + "learning_rate": 6.8786000000000005e-06, + "loss": 0.6852, + "step": 15620 + }, + { + "epoch": 1.563, + "grad_norm": 8.317204475402832, + "learning_rate": 6.876600000000001e-06, + "loss": 0.5511, + "step": 15630 + }, + { + "epoch": 1.564, + "grad_norm": 41.191375732421875, + "learning_rate": 6.8746e-06, + "loss": 0.8747, + "step": 15640 + }, + { + "epoch": 1.565, + "grad_norm": 39.559818267822266, + "learning_rate": 6.8726e-06, + "loss": 0.8863, + "step": 15650 + }, + { + "epoch": 1.5659999999999998, + "grad_norm": 30.73655891418457, + "learning_rate": 6.870600000000001e-06, + "loss": 0.7081, + "step": 15660 + }, + { + "epoch": 1.567, + "grad_norm": 30.881059646606445, + "learning_rate": 6.868600000000001e-06, + "loss": 0.7772, + "step": 15670 + }, + { + "epoch": 1.568, + "grad_norm": 31.458768844604492, + "learning_rate": 6.866600000000001e-06, + "loss": 0.6283, + "step": 15680 + }, + { + "epoch": 1.569, + "grad_norm": 18.553871154785156, + "learning_rate": 6.8646e-06, + "loss": 0.5786, + "step": 15690 + }, + { + "epoch": 1.5699999999999998, + "grad_norm": 4.743809223175049, + "learning_rate": 6.8626e-06, + "loss": 0.7825, + "step": 15700 + }, + { + "epoch": 1.571, + "grad_norm": 46.41438293457031, + "learning_rate": 6.860600000000001e-06, + "loss": 0.4837, + "step": 15710 + }, + { + "epoch": 1.572, + "grad_norm": 17.342342376708984, + "learning_rate": 6.858600000000001e-06, + "loss": 0.4237, + "step": 15720 + }, + { + "epoch": 1.573, + "grad_norm": 37.21664047241211, + "learning_rate": 6.856600000000001e-06, + "loss": 0.6188, + "step": 15730 + }, + { + "epoch": 1.5739999999999998, + "grad_norm": 45.30168914794922, + "learning_rate": 6.8546e-06, + "loss": 0.9802, + "step": 15740 + }, + { + "epoch": 1.575, + "grad_norm": 38.685813903808594, + "learning_rate": 6.8526e-06, + "loss": 0.7087, + "step": 15750 + }, + { + "epoch": 1.576, + "grad_norm": 23.965330123901367, + "learning_rate": 6.850600000000001e-06, + "loss": 0.5126, + "step": 15760 + }, + { + "epoch": 1.577, + "grad_norm": 43.43236541748047, + "learning_rate": 6.848600000000001e-06, + "loss": 0.6661, + "step": 15770 + }, + { + "epoch": 1.5779999999999998, + "grad_norm": 29.220190048217773, + "learning_rate": 6.846600000000001e-06, + "loss": 0.7872, + "step": 15780 + }, + { + "epoch": 1.579, + "grad_norm": 4.1027021408081055, + "learning_rate": 6.8446000000000005e-06, + "loss": 0.6425, + "step": 15790 + }, + { + "epoch": 1.58, + "grad_norm": 20.48824119567871, + "learning_rate": 6.8426e-06, + "loss": 0.8026, + "step": 15800 + }, + { + "epoch": 1.581, + "grad_norm": 16.290855407714844, + "learning_rate": 6.8406e-06, + "loss": 0.8203, + "step": 15810 + }, + { + "epoch": 1.5819999999999999, + "grad_norm": 14.963516235351562, + "learning_rate": 6.838600000000001e-06, + "loss": 0.6832, + "step": 15820 + }, + { + "epoch": 1.583, + "grad_norm": 37.97867965698242, + "learning_rate": 6.836600000000001e-06, + "loss": 0.6167, + "step": 15830 + }, + { + "epoch": 1.584, + "grad_norm": 29.7786865234375, + "learning_rate": 6.8346e-06, + "loss": 0.8014, + "step": 15840 + }, + { + "epoch": 1.585, + "grad_norm": 4.787508487701416, + "learning_rate": 6.8326000000000004e-06, + "loss": 0.7712, + "step": 15850 + }, + { + "epoch": 1.5859999999999999, + "grad_norm": 39.955650329589844, + "learning_rate": 6.8306e-06, + "loss": 0.5754, + "step": 15860 + }, + { + "epoch": 1.587, + "grad_norm": 9.246562957763672, + "learning_rate": 6.828600000000001e-06, + "loss": 0.7853, + "step": 15870 + }, + { + "epoch": 1.588, + "grad_norm": 16.79322052001953, + "learning_rate": 6.826600000000001e-06, + "loss": 1.0043, + "step": 15880 + }, + { + "epoch": 1.589, + "grad_norm": 46.77156448364258, + "learning_rate": 6.8246e-06, + "loss": 0.5613, + "step": 15890 + }, + { + "epoch": 1.5899999999999999, + "grad_norm": 30.43254852294922, + "learning_rate": 6.8226000000000005e-06, + "loss": 0.5192, + "step": 15900 + }, + { + "epoch": 1.591, + "grad_norm": 24.879961013793945, + "learning_rate": 6.8206e-06, + "loss": 0.6208, + "step": 15910 + }, + { + "epoch": 1.592, + "grad_norm": 31.248703002929688, + "learning_rate": 6.818600000000001e-06, + "loss": 0.5469, + "step": 15920 + }, + { + "epoch": 1.593, + "grad_norm": 43.610923767089844, + "learning_rate": 6.816600000000001e-06, + "loss": 0.8376, + "step": 15930 + }, + { + "epoch": 1.5939999999999999, + "grad_norm": 27.251039505004883, + "learning_rate": 6.8146e-06, + "loss": 0.8027, + "step": 15940 + }, + { + "epoch": 1.595, + "grad_norm": 52.440940856933594, + "learning_rate": 6.812600000000001e-06, + "loss": 0.6246, + "step": 15950 + }, + { + "epoch": 1.596, + "grad_norm": 52.91548156738281, + "learning_rate": 6.8106000000000005e-06, + "loss": 0.6263, + "step": 15960 + }, + { + "epoch": 1.597, + "grad_norm": 30.92323112487793, + "learning_rate": 6.808600000000001e-06, + "loss": 0.6969, + "step": 15970 + }, + { + "epoch": 1.5979999999999999, + "grad_norm": 76.39839172363281, + "learning_rate": 6.806600000000001e-06, + "loss": 1.0139, + "step": 15980 + }, + { + "epoch": 1.599, + "grad_norm": 21.792797088623047, + "learning_rate": 6.8046e-06, + "loss": 0.8526, + "step": 15990 + }, + { + "epoch": 1.6, + "grad_norm": 6.62412166595459, + "learning_rate": 6.802600000000001e-06, + "loss": 0.6768, + "step": 16000 + }, + { + "epoch": 1.601, + "grad_norm": 26.514087677001953, + "learning_rate": 6.800600000000001e-06, + "loss": 0.6667, + "step": 16010 + }, + { + "epoch": 1.6019999999999999, + "grad_norm": 8.58350658416748, + "learning_rate": 6.7986000000000004e-06, + "loss": 0.3468, + "step": 16020 + }, + { + "epoch": 1.603, + "grad_norm": 6.864509582519531, + "learning_rate": 6.796600000000001e-06, + "loss": 0.6009, + "step": 16030 + }, + { + "epoch": 1.604, + "grad_norm": 38.441192626953125, + "learning_rate": 6.7946e-06, + "loss": 0.9751, + "step": 16040 + }, + { + "epoch": 1.605, + "grad_norm": 14.290736198425293, + "learning_rate": 6.7926e-06, + "loss": 0.5905, + "step": 16050 + }, + { + "epoch": 1.6059999999999999, + "grad_norm": 21.486791610717773, + "learning_rate": 6.790600000000001e-06, + "loss": 0.7175, + "step": 16060 + }, + { + "epoch": 1.607, + "grad_norm": 85.47362518310547, + "learning_rate": 6.7886000000000005e-06, + "loss": 0.7642, + "step": 16070 + }, + { + "epoch": 1.608, + "grad_norm": 38.84056091308594, + "learning_rate": 6.786600000000001e-06, + "loss": 0.6609, + "step": 16080 + }, + { + "epoch": 1.609, + "grad_norm": 5.32833194732666, + "learning_rate": 6.7846e-06, + "loss": 0.5081, + "step": 16090 + }, + { + "epoch": 1.6099999999999999, + "grad_norm": 23.446685791015625, + "learning_rate": 6.7826e-06, + "loss": 0.7069, + "step": 16100 + }, + { + "epoch": 1.611, + "grad_norm": 65.45818328857422, + "learning_rate": 6.780600000000001e-06, + "loss": 0.7685, + "step": 16110 + }, + { + "epoch": 1.612, + "grad_norm": 20.963775634765625, + "learning_rate": 6.778600000000001e-06, + "loss": 0.6136, + "step": 16120 + }, + { + "epoch": 1.613, + "grad_norm": 52.342918395996094, + "learning_rate": 6.776600000000001e-06, + "loss": 0.8376, + "step": 16130 + }, + { + "epoch": 1.6139999999999999, + "grad_norm": 48.46240234375, + "learning_rate": 6.7746e-06, + "loss": 0.6365, + "step": 16140 + }, + { + "epoch": 1.615, + "grad_norm": 44.245235443115234, + "learning_rate": 6.7726e-06, + "loss": 0.8167, + "step": 16150 + }, + { + "epoch": 1.616, + "grad_norm": 20.52388572692871, + "learning_rate": 6.770600000000001e-06, + "loss": 0.5238, + "step": 16160 + }, + { + "epoch": 1.617, + "grad_norm": 41.138763427734375, + "learning_rate": 6.768600000000001e-06, + "loss": 0.7287, + "step": 16170 + }, + { + "epoch": 1.6179999999999999, + "grad_norm": 26.4044132232666, + "learning_rate": 6.7666000000000006e-06, + "loss": 0.638, + "step": 16180 + }, + { + "epoch": 1.619, + "grad_norm": 28.66182518005371, + "learning_rate": 6.7646e-06, + "loss": 0.7564, + "step": 16190 + }, + { + "epoch": 1.62, + "grad_norm": 148.91444396972656, + "learning_rate": 6.7626e-06, + "loss": 0.5724, + "step": 16200 + }, + { + "epoch": 1.621, + "grad_norm": 27.070476531982422, + "learning_rate": 6.7606e-06, + "loss": 0.5099, + "step": 16210 + }, + { + "epoch": 1.6219999999999999, + "grad_norm": 29.128446578979492, + "learning_rate": 6.758600000000001e-06, + "loss": 0.7376, + "step": 16220 + }, + { + "epoch": 1.623, + "grad_norm": 40.75493240356445, + "learning_rate": 6.756600000000001e-06, + "loss": 0.8102, + "step": 16230 + }, + { + "epoch": 1.624, + "grad_norm": 13.366748809814453, + "learning_rate": 6.7546e-06, + "loss": 0.7062, + "step": 16240 + }, + { + "epoch": 1.625, + "grad_norm": 26.001070022583008, + "learning_rate": 6.7526e-06, + "loss": 0.7237, + "step": 16250 + }, + { + "epoch": 1.626, + "grad_norm": 22.817955017089844, + "learning_rate": 6.7506e-06, + "loss": 0.9075, + "step": 16260 + }, + { + "epoch": 1.627, + "grad_norm": 33.775001525878906, + "learning_rate": 6.748600000000001e-06, + "loss": 0.4802, + "step": 16270 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 26.016061782836914, + "learning_rate": 6.746600000000001e-06, + "loss": 0.7084, + "step": 16280 + }, + { + "epoch": 1.629, + "grad_norm": 29.101972579956055, + "learning_rate": 6.7446e-06, + "loss": 0.8943, + "step": 16290 + }, + { + "epoch": 1.63, + "grad_norm": 40.83861541748047, + "learning_rate": 6.7426000000000005e-06, + "loss": 0.5705, + "step": 16300 + }, + { + "epoch": 1.631, + "grad_norm": 46.490352630615234, + "learning_rate": 6.7406e-06, + "loss": 0.523, + "step": 16310 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 2.7267861366271973, + "learning_rate": 6.738600000000001e-06, + "loss": 0.5519, + "step": 16320 + }, + { + "epoch": 1.633, + "grad_norm": 35.80474853515625, + "learning_rate": 6.736600000000001e-06, + "loss": 0.721, + "step": 16330 + }, + { + "epoch": 1.634, + "grad_norm": 27.28360939025879, + "learning_rate": 6.7346e-06, + "loss": 0.7681, + "step": 16340 + }, + { + "epoch": 1.635, + "grad_norm": 27.563488006591797, + "learning_rate": 6.7326000000000006e-06, + "loss": 0.6124, + "step": 16350 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 42.63063430786133, + "learning_rate": 6.7306e-06, + "loss": 0.7241, + "step": 16360 + }, + { + "epoch": 1.637, + "grad_norm": 21.946151733398438, + "learning_rate": 6.728600000000001e-06, + "loss": 0.6195, + "step": 16370 + }, + { + "epoch": 1.638, + "grad_norm": 49.707210540771484, + "learning_rate": 6.726600000000001e-06, + "loss": 0.7359, + "step": 16380 + }, + { + "epoch": 1.639, + "grad_norm": 66.19535827636719, + "learning_rate": 6.7246e-06, + "loss": 0.3962, + "step": 16390 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 26.122718811035156, + "learning_rate": 6.722600000000001e-06, + "loss": 0.7547, + "step": 16400 + }, + { + "epoch": 1.641, + "grad_norm": 13.014486312866211, + "learning_rate": 6.7206000000000005e-06, + "loss": 0.5428, + "step": 16410 + }, + { + "epoch": 1.642, + "grad_norm": 48.25709915161133, + "learning_rate": 6.7186e-06, + "loss": 0.551, + "step": 16420 + }, + { + "epoch": 1.643, + "grad_norm": 20.289514541625977, + "learning_rate": 6.716600000000001e-06, + "loss": 0.4724, + "step": 16430 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 33.06206130981445, + "learning_rate": 6.7146e-06, + "loss": 0.8169, + "step": 16440 + }, + { + "epoch": 1.645, + "grad_norm": 29.4952392578125, + "learning_rate": 6.7126e-06, + "loss": 0.4904, + "step": 16450 + }, + { + "epoch": 1.646, + "grad_norm": 19.41499900817871, + "learning_rate": 6.710600000000001e-06, + "loss": 0.3842, + "step": 16460 + }, + { + "epoch": 1.647, + "grad_norm": 51.508609771728516, + "learning_rate": 6.7086000000000004e-06, + "loss": 0.74, + "step": 16470 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 46.98064422607422, + "learning_rate": 6.706600000000001e-06, + "loss": 0.8286, + "step": 16480 + }, + { + "epoch": 1.649, + "grad_norm": 21.764245986938477, + "learning_rate": 6.7046e-06, + "loss": 0.9053, + "step": 16490 + }, + { + "epoch": 1.65, + "grad_norm": 8.565622329711914, + "learning_rate": 6.7026e-06, + "loss": 0.6417, + "step": 16500 + }, + { + "epoch": 1.651, + "grad_norm": 42.838314056396484, + "learning_rate": 6.700600000000001e-06, + "loss": 0.4717, + "step": 16510 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 33.5409049987793, + "learning_rate": 6.6986000000000005e-06, + "loss": 0.544, + "step": 16520 + }, + { + "epoch": 1.653, + "grad_norm": 27.133085250854492, + "learning_rate": 6.696600000000001e-06, + "loss": 0.6666, + "step": 16530 + }, + { + "epoch": 1.654, + "grad_norm": 25.304351806640625, + "learning_rate": 6.694600000000001e-06, + "loss": 0.7644, + "step": 16540 + }, + { + "epoch": 1.655, + "grad_norm": 49.61137008666992, + "learning_rate": 6.6926e-06, + "loss": 0.5898, + "step": 16550 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 18.069475173950195, + "learning_rate": 6.690600000000001e-06, + "loss": 0.7118, + "step": 16560 + }, + { + "epoch": 1.657, + "grad_norm": 56.22249984741211, + "learning_rate": 6.688600000000001e-06, + "loss": 0.728, + "step": 16570 + }, + { + "epoch": 1.658, + "grad_norm": 26.706298828125, + "learning_rate": 6.6866000000000005e-06, + "loss": 0.9413, + "step": 16580 + }, + { + "epoch": 1.659, + "grad_norm": 34.80197525024414, + "learning_rate": 6.684600000000001e-06, + "loss": 0.6705, + "step": 16590 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 34.63695526123047, + "learning_rate": 6.6826e-06, + "loss": 0.7658, + "step": 16600 + }, + { + "epoch": 1.661, + "grad_norm": 39.2309684753418, + "learning_rate": 6.6806e-06, + "loss": 0.6774, + "step": 16610 + }, + { + "epoch": 1.662, + "grad_norm": 59.59306335449219, + "learning_rate": 6.678600000000001e-06, + "loss": 0.7997, + "step": 16620 + }, + { + "epoch": 1.663, + "grad_norm": 13.926924705505371, + "learning_rate": 6.676600000000001e-06, + "loss": 0.5904, + "step": 16630 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 47.741844177246094, + "learning_rate": 6.674600000000001e-06, + "loss": 0.6835, + "step": 16640 + }, + { + "epoch": 1.665, + "grad_norm": 19.617435455322266, + "learning_rate": 6.6726e-06, + "loss": 0.7402, + "step": 16650 + }, + { + "epoch": 1.666, + "grad_norm": 11.972737312316895, + "learning_rate": 6.6706e-06, + "loss": 0.6702, + "step": 16660 + }, + { + "epoch": 1.667, + "grad_norm": 18.987348556518555, + "learning_rate": 6.668600000000001e-06, + "loss": 0.4913, + "step": 16670 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 31.206674575805664, + "learning_rate": 6.666600000000001e-06, + "loss": 0.5087, + "step": 16680 + }, + { + "epoch": 1.669, + "grad_norm": 35.57680892944336, + "learning_rate": 6.664600000000001e-06, + "loss": 0.6203, + "step": 16690 + }, + { + "epoch": 1.67, + "grad_norm": 23.097082138061523, + "learning_rate": 6.6626e-06, + "loss": 0.6263, + "step": 16700 + }, + { + "epoch": 1.671, + "grad_norm": 2.384589195251465, + "learning_rate": 6.6606e-06, + "loss": 0.7431, + "step": 16710 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 38.233177185058594, + "learning_rate": 6.658600000000001e-06, + "loss": 0.6329, + "step": 16720 + }, + { + "epoch": 1.673, + "grad_norm": 47.652183532714844, + "learning_rate": 6.656600000000001e-06, + "loss": 0.715, + "step": 16730 + }, + { + "epoch": 1.674, + "grad_norm": 14.567292213439941, + "learning_rate": 6.6546000000000015e-06, + "loss": 0.3611, + "step": 16740 + }, + { + "epoch": 1.675, + "grad_norm": 31.54159164428711, + "learning_rate": 6.6526000000000005e-06, + "loss": 0.55, + "step": 16750 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 47.42003631591797, + "learning_rate": 6.6506e-06, + "loss": 0.7384, + "step": 16760 + }, + { + "epoch": 1.677, + "grad_norm": 25.71277618408203, + "learning_rate": 6.648600000000001e-06, + "loss": 0.4905, + "step": 16770 + }, + { + "epoch": 1.678, + "grad_norm": 39.28430938720703, + "learning_rate": 6.646600000000001e-06, + "loss": 0.7597, + "step": 16780 + }, + { + "epoch": 1.679, + "grad_norm": 29.059282302856445, + "learning_rate": 6.644600000000001e-06, + "loss": 0.8572, + "step": 16790 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 45.53642654418945, + "learning_rate": 6.6426000000000006e-06, + "loss": 0.6979, + "step": 16800 + }, + { + "epoch": 1.681, + "grad_norm": 34.41741943359375, + "learning_rate": 6.6406e-06, + "loss": 0.5413, + "step": 16810 + }, + { + "epoch": 1.682, + "grad_norm": 24.35959815979004, + "learning_rate": 6.6386e-06, + "loss": 0.6513, + "step": 16820 + }, + { + "epoch": 1.683, + "grad_norm": 37.356407165527344, + "learning_rate": 6.636600000000001e-06, + "loss": 0.6982, + "step": 16830 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 41.323490142822266, + "learning_rate": 6.634600000000001e-06, + "loss": 0.8573, + "step": 16840 + }, + { + "epoch": 1.685, + "grad_norm": 25.554746627807617, + "learning_rate": 6.6326e-06, + "loss": 0.43, + "step": 16850 + }, + { + "epoch": 1.686, + "grad_norm": 18.734989166259766, + "learning_rate": 6.6306000000000005e-06, + "loss": 0.6716, + "step": 16860 + }, + { + "epoch": 1.687, + "grad_norm": 20.203542709350586, + "learning_rate": 6.6286e-06, + "loss": 0.5459, + "step": 16870 + }, + { + "epoch": 1.688, + "grad_norm": 35.660457611083984, + "learning_rate": 6.626600000000001e-06, + "loss": 0.7359, + "step": 16880 + }, + { + "epoch": 1.689, + "grad_norm": 7.850064277648926, + "learning_rate": 6.624600000000001e-06, + "loss": 0.5398, + "step": 16890 + }, + { + "epoch": 1.69, + "grad_norm": 2.7819433212280273, + "learning_rate": 6.6226e-06, + "loss": 0.4942, + "step": 16900 + }, + { + "epoch": 1.6909999999999998, + "grad_norm": 39.97076416015625, + "learning_rate": 6.620600000000001e-06, + "loss": 1.0417, + "step": 16910 + }, + { + "epoch": 1.692, + "grad_norm": 38.29095458984375, + "learning_rate": 6.6186000000000005e-06, + "loss": 0.8314, + "step": 16920 + }, + { + "epoch": 1.693, + "grad_norm": 30.200014114379883, + "learning_rate": 6.616600000000001e-06, + "loss": 0.8102, + "step": 16930 + }, + { + "epoch": 1.694, + "grad_norm": 15.969015121459961, + "learning_rate": 6.614600000000001e-06, + "loss": 0.2612, + "step": 16940 + }, + { + "epoch": 1.6949999999999998, + "grad_norm": 23.59886932373047, + "learning_rate": 6.6126e-06, + "loss": 0.6885, + "step": 16950 + }, + { + "epoch": 1.696, + "grad_norm": 1.8917691707611084, + "learning_rate": 6.610600000000001e-06, + "loss": 0.6052, + "step": 16960 + }, + { + "epoch": 1.697, + "grad_norm": 62.720245361328125, + "learning_rate": 6.6086000000000006e-06, + "loss": 0.5954, + "step": 16970 + }, + { + "epoch": 1.698, + "grad_norm": 5.497705459594727, + "learning_rate": 6.6066e-06, + "loss": 0.6399, + "step": 16980 + }, + { + "epoch": 1.6989999999999998, + "grad_norm": 16.05719566345215, + "learning_rate": 6.604600000000001e-06, + "loss": 0.5922, + "step": 16990 + }, + { + "epoch": 1.7, + "grad_norm": 1.563225507736206, + "learning_rate": 6.6026e-06, + "loss": 0.4885, + "step": 17000 + }, + { + "epoch": 1.701, + "grad_norm": 41.32305145263672, + "learning_rate": 6.6006e-06, + "loss": 0.8704, + "step": 17010 + }, + { + "epoch": 1.702, + "grad_norm": 50.50960159301758, + "learning_rate": 6.598600000000001e-06, + "loss": 0.6659, + "step": 17020 + }, + { + "epoch": 1.7029999999999998, + "grad_norm": 7.299249649047852, + "learning_rate": 6.5966000000000005e-06, + "loss": 0.6752, + "step": 17030 + }, + { + "epoch": 1.704, + "grad_norm": 26.67925262451172, + "learning_rate": 6.594600000000001e-06, + "loss": 0.7089, + "step": 17040 + }, + { + "epoch": 1.705, + "grad_norm": 43.11434555053711, + "learning_rate": 6.5926e-06, + "loss": 0.4218, + "step": 17050 + }, + { + "epoch": 1.706, + "grad_norm": 30.153173446655273, + "learning_rate": 6.5906e-06, + "loss": 0.9292, + "step": 17060 + }, + { + "epoch": 1.7069999999999999, + "grad_norm": 26.88372230529785, + "learning_rate": 6.588600000000001e-06, + "loss": 0.9461, + "step": 17070 + }, + { + "epoch": 1.708, + "grad_norm": 12.442044258117676, + "learning_rate": 6.586600000000001e-06, + "loss": 0.4348, + "step": 17080 + }, + { + "epoch": 1.709, + "grad_norm": 42.74766540527344, + "learning_rate": 6.584600000000001e-06, + "loss": 0.6239, + "step": 17090 + }, + { + "epoch": 1.71, + "grad_norm": 30.965904235839844, + "learning_rate": 6.5826e-06, + "loss": 0.9168, + "step": 17100 + }, + { + "epoch": 1.7109999999999999, + "grad_norm": 27.245014190673828, + "learning_rate": 6.5806e-06, + "loss": 0.6842, + "step": 17110 + }, + { + "epoch": 1.712, + "grad_norm": 17.746694564819336, + "learning_rate": 6.578600000000001e-06, + "loss": 0.8224, + "step": 17120 + }, + { + "epoch": 1.713, + "grad_norm": 11.387202262878418, + "learning_rate": 6.576600000000001e-06, + "loss": 0.5531, + "step": 17130 + }, + { + "epoch": 1.714, + "grad_norm": 30.879100799560547, + "learning_rate": 6.574600000000001e-06, + "loss": 0.6209, + "step": 17140 + }, + { + "epoch": 1.7149999999999999, + "grad_norm": 9.201401710510254, + "learning_rate": 6.5726e-06, + "loss": 0.6358, + "step": 17150 + }, + { + "epoch": 1.716, + "grad_norm": 32.12496566772461, + "learning_rate": 6.5706e-06, + "loss": 0.6251, + "step": 17160 + }, + { + "epoch": 1.717, + "grad_norm": 7.410712242126465, + "learning_rate": 6.568600000000001e-06, + "loss": 0.6895, + "step": 17170 + }, + { + "epoch": 1.718, + "grad_norm": 40.6132926940918, + "learning_rate": 6.566600000000001e-06, + "loss": 0.681, + "step": 17180 + }, + { + "epoch": 1.7189999999999999, + "grad_norm": 18.442171096801758, + "learning_rate": 6.564600000000001e-06, + "loss": 0.5489, + "step": 17190 + }, + { + "epoch": 1.72, + "grad_norm": 11.694601058959961, + "learning_rate": 6.5626000000000005e-06, + "loss": 0.6267, + "step": 17200 + }, + { + "epoch": 1.721, + "grad_norm": 17.21006202697754, + "learning_rate": 6.5606e-06, + "loss": 1.1118, + "step": 17210 + }, + { + "epoch": 1.722, + "grad_norm": 14.537155151367188, + "learning_rate": 6.5586e-06, + "loss": 0.8024, + "step": 17220 + }, + { + "epoch": 1.7229999999999999, + "grad_norm": 32.91739273071289, + "learning_rate": 6.556600000000001e-06, + "loss": 0.7176, + "step": 17230 + }, + { + "epoch": 1.724, + "grad_norm": 21.398927688598633, + "learning_rate": 6.554600000000001e-06, + "loss": 0.6832, + "step": 17240 + }, + { + "epoch": 1.725, + "grad_norm": 37.23515701293945, + "learning_rate": 6.5526e-06, + "loss": 0.6286, + "step": 17250 + }, + { + "epoch": 1.726, + "grad_norm": 30.678726196289062, + "learning_rate": 6.5506000000000004e-06, + "loss": 0.6981, + "step": 17260 + }, + { + "epoch": 1.7269999999999999, + "grad_norm": 28.972732543945312, + "learning_rate": 6.5486e-06, + "loss": 0.6119, + "step": 17270 + }, + { + "epoch": 1.728, + "grad_norm": 37.57304000854492, + "learning_rate": 6.546600000000001e-06, + "loss": 0.7778, + "step": 17280 + }, + { + "epoch": 1.729, + "grad_norm": 16.991241455078125, + "learning_rate": 6.544600000000001e-06, + "loss": 0.6761, + "step": 17290 + }, + { + "epoch": 1.73, + "grad_norm": 24.713104248046875, + "learning_rate": 6.5426e-06, + "loss": 0.8166, + "step": 17300 + }, + { + "epoch": 1.7309999999999999, + "grad_norm": 18.130779266357422, + "learning_rate": 6.5406000000000005e-06, + "loss": 0.5623, + "step": 17310 + }, + { + "epoch": 1.732, + "grad_norm": 6.897360324859619, + "learning_rate": 6.5386e-06, + "loss": 0.4677, + "step": 17320 + }, + { + "epoch": 1.733, + "grad_norm": 15.79992961883545, + "learning_rate": 6.536600000000001e-06, + "loss": 0.6267, + "step": 17330 + }, + { + "epoch": 1.734, + "grad_norm": 27.68032455444336, + "learning_rate": 6.534600000000001e-06, + "loss": 0.6454, + "step": 17340 + }, + { + "epoch": 1.7349999999999999, + "grad_norm": 28.13858413696289, + "learning_rate": 6.5326e-06, + "loss": 0.7075, + "step": 17350 + }, + { + "epoch": 1.736, + "grad_norm": 7.756270408630371, + "learning_rate": 6.530600000000001e-06, + "loss": 0.5348, + "step": 17360 + }, + { + "epoch": 1.737, + "grad_norm": 35.861351013183594, + "learning_rate": 6.5286000000000005e-06, + "loss": 0.8423, + "step": 17370 + }, + { + "epoch": 1.738, + "grad_norm": 14.177804946899414, + "learning_rate": 6.5266e-06, + "loss": 0.5516, + "step": 17380 + }, + { + "epoch": 1.7389999999999999, + "grad_norm": 23.096698760986328, + "learning_rate": 6.524600000000001e-06, + "loss": 0.8236, + "step": 17390 + }, + { + "epoch": 1.74, + "grad_norm": 38.693511962890625, + "learning_rate": 6.5226e-06, + "loss": 0.7141, + "step": 17400 + }, + { + "epoch": 1.741, + "grad_norm": 25.841833114624023, + "learning_rate": 6.5206e-06, + "loss": 0.7845, + "step": 17410 + }, + { + "epoch": 1.742, + "grad_norm": 24.260513305664062, + "learning_rate": 6.5186000000000006e-06, + "loss": 0.6659, + "step": 17420 + }, + { + "epoch": 1.7429999999999999, + "grad_norm": 24.908824920654297, + "learning_rate": 6.5166000000000004e-06, + "loss": 0.5314, + "step": 17430 + }, + { + "epoch": 1.744, + "grad_norm": 25.634967803955078, + "learning_rate": 6.514600000000001e-06, + "loss": 0.7284, + "step": 17440 + }, + { + "epoch": 1.745, + "grad_norm": 15.970552444458008, + "learning_rate": 6.5126e-06, + "loss": 0.9723, + "step": 17450 + }, + { + "epoch": 1.746, + "grad_norm": 28.712669372558594, + "learning_rate": 6.5106e-06, + "loss": 0.4265, + "step": 17460 + }, + { + "epoch": 1.7469999999999999, + "grad_norm": 27.592082977294922, + "learning_rate": 6.508600000000001e-06, + "loss": 0.5927, + "step": 17470 + }, + { + "epoch": 1.748, + "grad_norm": 42.75637435913086, + "learning_rate": 6.5066000000000005e-06, + "loss": 0.7495, + "step": 17480 + }, + { + "epoch": 1.749, + "grad_norm": 39.701114654541016, + "learning_rate": 6.504600000000001e-06, + "loss": 0.5361, + "step": 17490 + }, + { + "epoch": 1.75, + "grad_norm": 19.57805633544922, + "learning_rate": 6.5026e-06, + "loss": 0.9434, + "step": 17500 + }, + { + "epoch": 1.751, + "grad_norm": 24.80259132385254, + "learning_rate": 6.5006e-06, + "loss": 0.533, + "step": 17510 + }, + { + "epoch": 1.752, + "grad_norm": 30.908178329467773, + "learning_rate": 6.498600000000001e-06, + "loss": 0.9068, + "step": 17520 + }, + { + "epoch": 1.7530000000000001, + "grad_norm": 48.78810119628906, + "learning_rate": 6.496600000000001e-06, + "loss": 0.8086, + "step": 17530 + }, + { + "epoch": 1.754, + "grad_norm": 36.677059173583984, + "learning_rate": 6.494600000000001e-06, + "loss": 0.7572, + "step": 17540 + }, + { + "epoch": 1.755, + "grad_norm": 22.354658126831055, + "learning_rate": 6.4926e-06, + "loss": 0.6053, + "step": 17550 + }, + { + "epoch": 1.756, + "grad_norm": 32.97988510131836, + "learning_rate": 6.4906e-06, + "loss": 0.5435, + "step": 17560 + }, + { + "epoch": 1.7570000000000001, + "grad_norm": 18.22273063659668, + "learning_rate": 6.488600000000001e-06, + "loss": 0.456, + "step": 17570 + }, + { + "epoch": 1.758, + "grad_norm": 24.372108459472656, + "learning_rate": 6.486600000000001e-06, + "loss": 0.5838, + "step": 17580 + }, + { + "epoch": 1.759, + "grad_norm": 12.150777816772461, + "learning_rate": 6.4846000000000006e-06, + "loss": 0.4555, + "step": 17590 + }, + { + "epoch": 1.76, + "grad_norm": 3.295907497406006, + "learning_rate": 6.4826e-06, + "loss": 0.4124, + "step": 17600 + }, + { + "epoch": 1.7610000000000001, + "grad_norm": 28.266719818115234, + "learning_rate": 6.4806e-06, + "loss": 0.7032, + "step": 17610 + }, + { + "epoch": 1.762, + "grad_norm": 51.440616607666016, + "learning_rate": 6.4786e-06, + "loss": 0.6094, + "step": 17620 + }, + { + "epoch": 1.763, + "grad_norm": 16.108505249023438, + "learning_rate": 6.476600000000001e-06, + "loss": 0.5252, + "step": 17630 + }, + { + "epoch": 1.764, + "grad_norm": 42.28464889526367, + "learning_rate": 6.474600000000001e-06, + "loss": 0.7524, + "step": 17640 + }, + { + "epoch": 1.7650000000000001, + "grad_norm": 10.865596771240234, + "learning_rate": 6.4726e-06, + "loss": 0.6395, + "step": 17650 + }, + { + "epoch": 1.766, + "grad_norm": 22.09259033203125, + "learning_rate": 6.4706e-06, + "loss": 0.472, + "step": 17660 + }, + { + "epoch": 1.767, + "grad_norm": 10.556814193725586, + "learning_rate": 6.4686e-06, + "loss": 0.3279, + "step": 17670 + }, + { + "epoch": 1.768, + "grad_norm": 69.1151351928711, + "learning_rate": 6.466600000000001e-06, + "loss": 0.8119, + "step": 17680 + }, + { + "epoch": 1.7690000000000001, + "grad_norm": 49.826446533203125, + "learning_rate": 6.464600000000001e-06, + "loss": 0.7345, + "step": 17690 + }, + { + "epoch": 1.77, + "grad_norm": 37.7425537109375, + "learning_rate": 6.4626e-06, + "loss": 0.7113, + "step": 17700 + }, + { + "epoch": 1.771, + "grad_norm": 25.57686996459961, + "learning_rate": 6.4606000000000005e-06, + "loss": 0.7251, + "step": 17710 + }, + { + "epoch": 1.772, + "grad_norm": 18.124256134033203, + "learning_rate": 6.4586e-06, + "loss": 0.797, + "step": 17720 + }, + { + "epoch": 1.7730000000000001, + "grad_norm": 65.94688415527344, + "learning_rate": 6.456600000000001e-06, + "loss": 0.6839, + "step": 17730 + }, + { + "epoch": 1.774, + "grad_norm": 30.200220108032227, + "learning_rate": 6.454600000000001e-06, + "loss": 0.4463, + "step": 17740 + }, + { + "epoch": 1.775, + "grad_norm": 23.35943031311035, + "learning_rate": 6.4526e-06, + "loss": 0.5262, + "step": 17750 + }, + { + "epoch": 1.776, + "grad_norm": 41.625030517578125, + "learning_rate": 6.4506000000000005e-06, + "loss": 0.5537, + "step": 17760 + }, + { + "epoch": 1.7770000000000001, + "grad_norm": 31.092355728149414, + "learning_rate": 6.4486e-06, + "loss": 0.6805, + "step": 17770 + }, + { + "epoch": 1.778, + "grad_norm": 6.774554252624512, + "learning_rate": 6.4466e-06, + "loss": 0.6299, + "step": 17780 + }, + { + "epoch": 1.779, + "grad_norm": 22.500181198120117, + "learning_rate": 6.444600000000001e-06, + "loss": 0.5055, + "step": 17790 + }, + { + "epoch": 1.78, + "grad_norm": 40.5450553894043, + "learning_rate": 6.442600000000001e-06, + "loss": 0.5327, + "step": 17800 + }, + { + "epoch": 1.7810000000000001, + "grad_norm": 16.847360610961914, + "learning_rate": 6.4406e-06, + "loss": 0.6484, + "step": 17810 + }, + { + "epoch": 1.782, + "grad_norm": 25.066572189331055, + "learning_rate": 6.4386000000000005e-06, + "loss": 0.4881, + "step": 17820 + }, + { + "epoch": 1.783, + "grad_norm": 31.86751365661621, + "learning_rate": 6.4366e-06, + "loss": 0.4612, + "step": 17830 + }, + { + "epoch": 1.784, + "grad_norm": 27.578292846679688, + "learning_rate": 6.434600000000001e-06, + "loss": 0.8895, + "step": 17840 + }, + { + "epoch": 1.7850000000000001, + "grad_norm": 29.506324768066406, + "learning_rate": 6.432600000000001e-06, + "loss": 0.7933, + "step": 17850 + }, + { + "epoch": 1.786, + "grad_norm": 4.340175628662109, + "learning_rate": 6.4306e-06, + "loss": 0.5756, + "step": 17860 + }, + { + "epoch": 1.787, + "grad_norm": 20.72159767150879, + "learning_rate": 6.428600000000001e-06, + "loss": 0.695, + "step": 17870 + }, + { + "epoch": 1.788, + "grad_norm": 22.929832458496094, + "learning_rate": 6.4266000000000004e-06, + "loss": 0.4567, + "step": 17880 + }, + { + "epoch": 1.7890000000000001, + "grad_norm": 13.51591968536377, + "learning_rate": 6.424600000000001e-06, + "loss": 0.6086, + "step": 17890 + }, + { + "epoch": 1.79, + "grad_norm": 48.332122802734375, + "learning_rate": 6.422600000000001e-06, + "loss": 0.6551, + "step": 17900 + }, + { + "epoch": 1.791, + "grad_norm": 22.90260124206543, + "learning_rate": 6.4206e-06, + "loss": 0.6961, + "step": 17910 + }, + { + "epoch": 1.792, + "grad_norm": 16.833646774291992, + "learning_rate": 6.418600000000001e-06, + "loss": 0.4337, + "step": 17920 + }, + { + "epoch": 1.7930000000000001, + "grad_norm": 35.251670837402344, + "learning_rate": 6.4166000000000005e-06, + "loss": 0.7702, + "step": 17930 + }, + { + "epoch": 1.794, + "grad_norm": 34.65640640258789, + "learning_rate": 6.414600000000001e-06, + "loss": 0.7296, + "step": 17940 + }, + { + "epoch": 1.795, + "grad_norm": 53.794921875, + "learning_rate": 6.412600000000001e-06, + "loss": 0.7949, + "step": 17950 + }, + { + "epoch": 1.796, + "grad_norm": 36.07053756713867, + "learning_rate": 6.4106e-06, + "loss": 0.7731, + "step": 17960 + }, + { + "epoch": 1.7970000000000002, + "grad_norm": 50.78809356689453, + "learning_rate": 6.408600000000001e-06, + "loss": 0.5625, + "step": 17970 + }, + { + "epoch": 1.798, + "grad_norm": 28.448030471801758, + "learning_rate": 6.406600000000001e-06, + "loss": 0.555, + "step": 17980 + }, + { + "epoch": 1.799, + "grad_norm": 22.320802688598633, + "learning_rate": 6.4046000000000005e-06, + "loss": 0.6709, + "step": 17990 + }, + { + "epoch": 1.8, + "grad_norm": 4.228553771972656, + "learning_rate": 6.402600000000001e-06, + "loss": 0.6453, + "step": 18000 + }, + { + "epoch": 1.8010000000000002, + "grad_norm": 36.92367172241211, + "learning_rate": 6.4006e-06, + "loss": 0.9075, + "step": 18010 + }, + { + "epoch": 1.802, + "grad_norm": 30.44162368774414, + "learning_rate": 6.3986e-06, + "loss": 0.6903, + "step": 18020 + }, + { + "epoch": 1.803, + "grad_norm": 39.92036056518555, + "learning_rate": 6.396600000000001e-06, + "loss": 0.8592, + "step": 18030 + }, + { + "epoch": 1.804, + "grad_norm": 44.56736373901367, + "learning_rate": 6.3946000000000006e-06, + "loss": 0.4859, + "step": 18040 + }, + { + "epoch": 1.8050000000000002, + "grad_norm": 25.35263442993164, + "learning_rate": 6.392600000000001e-06, + "loss": 0.6489, + "step": 18050 + }, + { + "epoch": 1.806, + "grad_norm": 73.92831420898438, + "learning_rate": 6.3906e-06, + "loss": 0.7088, + "step": 18060 + }, + { + "epoch": 1.807, + "grad_norm": 75.07518768310547, + "learning_rate": 6.3886e-06, + "loss": 0.7994, + "step": 18070 + }, + { + "epoch": 1.808, + "grad_norm": 35.053138732910156, + "learning_rate": 6.386600000000001e-06, + "loss": 1.0469, + "step": 18080 + }, + { + "epoch": 1.8090000000000002, + "grad_norm": 21.79502296447754, + "learning_rate": 6.384600000000001e-06, + "loss": 0.7169, + "step": 18090 + }, + { + "epoch": 1.81, + "grad_norm": 6.465487003326416, + "learning_rate": 6.382600000000001e-06, + "loss": 0.9225, + "step": 18100 + }, + { + "epoch": 1.811, + "grad_norm": 45.01616287231445, + "learning_rate": 6.3806e-06, + "loss": 0.5866, + "step": 18110 + }, + { + "epoch": 1.812, + "grad_norm": 59.63362503051758, + "learning_rate": 6.3786e-06, + "loss": 0.8389, + "step": 18120 + }, + { + "epoch": 1.813, + "grad_norm": 5.388458728790283, + "learning_rate": 6.376600000000001e-06, + "loss": 0.4796, + "step": 18130 + }, + { + "epoch": 1.814, + "grad_norm": 2.917116165161133, + "learning_rate": 6.374600000000001e-06, + "loss": 0.4393, + "step": 18140 + }, + { + "epoch": 1.815, + "grad_norm": 18.18976593017578, + "learning_rate": 6.372600000000001e-06, + "loss": 0.8059, + "step": 18150 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 30.708412170410156, + "learning_rate": 6.3706000000000005e-06, + "loss": 0.6153, + "step": 18160 + }, + { + "epoch": 1.817, + "grad_norm": 29.721654891967773, + "learning_rate": 6.3686e-06, + "loss": 0.5348, + "step": 18170 + }, + { + "epoch": 1.818, + "grad_norm": 29.426971435546875, + "learning_rate": 6.3666e-06, + "loss": 0.564, + "step": 18180 + }, + { + "epoch": 1.819, + "grad_norm": 10.837160110473633, + "learning_rate": 6.364600000000001e-06, + "loss": 0.5361, + "step": 18190 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 51.735557556152344, + "learning_rate": 6.362600000000001e-06, + "loss": 0.6383, + "step": 18200 + }, + { + "epoch": 1.821, + "grad_norm": 28.849987030029297, + "learning_rate": 6.3606e-06, + "loss": 0.5701, + "step": 18210 + }, + { + "epoch": 1.822, + "grad_norm": 0.3885299265384674, + "learning_rate": 6.3586e-06, + "loss": 0.2929, + "step": 18220 + }, + { + "epoch": 1.823, + "grad_norm": 41.65828323364258, + "learning_rate": 6.3566e-06, + "loss": 0.8778, + "step": 18230 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 46.33951187133789, + "learning_rate": 6.354600000000001e-06, + "loss": 0.9669, + "step": 18240 + }, + { + "epoch": 1.825, + "grad_norm": 116.5798568725586, + "learning_rate": 6.352600000000001e-06, + "loss": 0.5901, + "step": 18250 + }, + { + "epoch": 1.826, + "grad_norm": 11.925887107849121, + "learning_rate": 6.3506e-06, + "loss": 0.5316, + "step": 18260 + }, + { + "epoch": 1.827, + "grad_norm": 17.010337829589844, + "learning_rate": 6.3486000000000005e-06, + "loss": 0.7052, + "step": 18270 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 15.04338264465332, + "learning_rate": 6.3466e-06, + "loss": 0.5075, + "step": 18280 + }, + { + "epoch": 1.829, + "grad_norm": 53.3408203125, + "learning_rate": 6.344600000000001e-06, + "loss": 0.5449, + "step": 18290 + }, + { + "epoch": 1.83, + "grad_norm": 15.94815444946289, + "learning_rate": 6.342600000000001e-06, + "loss": 0.6648, + "step": 18300 + }, + { + "epoch": 1.831, + "grad_norm": 12.17188549041748, + "learning_rate": 6.3406e-06, + "loss": 0.6536, + "step": 18310 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 39.76250457763672, + "learning_rate": 6.338600000000001e-06, + "loss": 0.7032, + "step": 18320 + }, + { + "epoch": 1.833, + "grad_norm": 40.960411071777344, + "learning_rate": 6.3366000000000005e-06, + "loss": 0.6843, + "step": 18330 + }, + { + "epoch": 1.834, + "grad_norm": 57.23612976074219, + "learning_rate": 6.334600000000001e-06, + "loss": 0.6186, + "step": 18340 + }, + { + "epoch": 1.835, + "grad_norm": 32.05949020385742, + "learning_rate": 6.332600000000001e-06, + "loss": 0.5321, + "step": 18350 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 27.393653869628906, + "learning_rate": 6.3306e-06, + "loss": 0.8525, + "step": 18360 + }, + { + "epoch": 1.837, + "grad_norm": 17.125869750976562, + "learning_rate": 6.328600000000001e-06, + "loss": 0.4895, + "step": 18370 + }, + { + "epoch": 1.838, + "grad_norm": 54.76142883300781, + "learning_rate": 6.3266000000000005e-06, + "loss": 0.7369, + "step": 18380 + }, + { + "epoch": 1.839, + "grad_norm": 41.17182159423828, + "learning_rate": 6.3246e-06, + "loss": 0.7504, + "step": 18390 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 18.484394073486328, + "learning_rate": 6.322600000000001e-06, + "loss": 0.7947, + "step": 18400 + }, + { + "epoch": 1.841, + "grad_norm": 10.778761863708496, + "learning_rate": 6.3206e-06, + "loss": 0.5847, + "step": 18410 + }, + { + "epoch": 1.842, + "grad_norm": 36.59711837768555, + "learning_rate": 6.3186e-06, + "loss": 0.5724, + "step": 18420 + }, + { + "epoch": 1.843, + "grad_norm": 21.72235870361328, + "learning_rate": 6.316600000000001e-06, + "loss": 0.5684, + "step": 18430 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 28.071380615234375, + "learning_rate": 6.3146000000000005e-06, + "loss": 0.6841, + "step": 18440 + }, + { + "epoch": 1.845, + "grad_norm": 15.306550025939941, + "learning_rate": 6.312600000000001e-06, + "loss": 0.5565, + "step": 18450 + }, + { + "epoch": 1.846, + "grad_norm": 49.12186813354492, + "learning_rate": 6.3106e-06, + "loss": 0.7218, + "step": 18460 + }, + { + "epoch": 1.847, + "grad_norm": 27.92928695678711, + "learning_rate": 6.3086e-06, + "loss": 0.7262, + "step": 18470 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 30.815195083618164, + "learning_rate": 6.306600000000001e-06, + "loss": 0.6921, + "step": 18480 + }, + { + "epoch": 1.849, + "grad_norm": 9.238826751708984, + "learning_rate": 6.304600000000001e-06, + "loss": 0.6324, + "step": 18490 + }, + { + "epoch": 1.85, + "grad_norm": 33.87525177001953, + "learning_rate": 6.302600000000001e-06, + "loss": 0.9252, + "step": 18500 + }, + { + "epoch": 1.851, + "grad_norm": 23.98631477355957, + "learning_rate": 6.3006e-06, + "loss": 0.6777, + "step": 18510 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 55.89791488647461, + "learning_rate": 6.2986e-06, + "loss": 0.9501, + "step": 18520 + }, + { + "epoch": 1.853, + "grad_norm": 49.16728591918945, + "learning_rate": 6.296600000000001e-06, + "loss": 0.6926, + "step": 18530 + }, + { + "epoch": 1.854, + "grad_norm": 49.215492248535156, + "learning_rate": 6.294600000000001e-06, + "loss": 0.7731, + "step": 18540 + }, + { + "epoch": 1.855, + "grad_norm": 20.07518196105957, + "learning_rate": 6.2926000000000005e-06, + "loss": 0.6389, + "step": 18550 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 11.670626640319824, + "learning_rate": 6.2906e-06, + "loss": 0.5868, + "step": 18560 + }, + { + "epoch": 1.857, + "grad_norm": 37.362640380859375, + "learning_rate": 6.2886e-06, + "loss": 0.6444, + "step": 18570 + }, + { + "epoch": 1.858, + "grad_norm": 34.82246017456055, + "learning_rate": 6.2866e-06, + "loss": 0.4443, + "step": 18580 + }, + { + "epoch": 1.859, + "grad_norm": 39.475189208984375, + "learning_rate": 6.284600000000001e-06, + "loss": 0.4061, + "step": 18590 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 44.0338249206543, + "learning_rate": 6.282600000000001e-06, + "loss": 0.7411, + "step": 18600 + }, + { + "epoch": 1.861, + "grad_norm": 37.729103088378906, + "learning_rate": 6.2806e-06, + "loss": 0.799, + "step": 18610 + }, + { + "epoch": 1.862, + "grad_norm": 12.825973510742188, + "learning_rate": 6.2786e-06, + "loss": 0.3664, + "step": 18620 + }, + { + "epoch": 1.863, + "grad_norm": 30.061628341674805, + "learning_rate": 6.2766e-06, + "loss": 0.88, + "step": 18630 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 6.273592948913574, + "learning_rate": 6.274600000000001e-06, + "loss": 0.526, + "step": 18640 + }, + { + "epoch": 1.865, + "grad_norm": 28.17704200744629, + "learning_rate": 6.272600000000001e-06, + "loss": 0.7294, + "step": 18650 + }, + { + "epoch": 1.866, + "grad_norm": 45.9304084777832, + "learning_rate": 6.2706e-06, + "loss": 0.6401, + "step": 18660 + }, + { + "epoch": 1.867, + "grad_norm": 5.665647506713867, + "learning_rate": 6.2686000000000004e-06, + "loss": 0.5935, + "step": 18670 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 11.19158935546875, + "learning_rate": 6.2666e-06, + "loss": 0.4248, + "step": 18680 + }, + { + "epoch": 1.869, + "grad_norm": 22.568464279174805, + "learning_rate": 6.264600000000001e-06, + "loss": 0.6085, + "step": 18690 + }, + { + "epoch": 1.87, + "grad_norm": 58.15055847167969, + "learning_rate": 6.262600000000001e-06, + "loss": 0.5467, + "step": 18700 + }, + { + "epoch": 1.871, + "grad_norm": 28.386558532714844, + "learning_rate": 6.2606e-06, + "loss": 0.6753, + "step": 18710 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 44.6679573059082, + "learning_rate": 6.2586000000000005e-06, + "loss": 0.9636, + "step": 18720 + }, + { + "epoch": 1.873, + "grad_norm": 27.022314071655273, + "learning_rate": 6.2566e-06, + "loss": 0.6403, + "step": 18730 + }, + { + "epoch": 1.874, + "grad_norm": 36.570796966552734, + "learning_rate": 6.254600000000001e-06, + "loss": 0.7745, + "step": 18740 + }, + { + "epoch": 1.875, + "grad_norm": 34.11301040649414, + "learning_rate": 6.252600000000001e-06, + "loss": 0.6801, + "step": 18750 + }, + { + "epoch": 1.876, + "grad_norm": 66.19660186767578, + "learning_rate": 6.2506e-06, + "loss": 0.6163, + "step": 18760 + }, + { + "epoch": 1.877, + "grad_norm": 9.71645736694336, + "learning_rate": 6.248600000000001e-06, + "loss": 0.4776, + "step": 18770 + }, + { + "epoch": 1.8780000000000001, + "grad_norm": 23.817113876342773, + "learning_rate": 6.2466000000000005e-06, + "loss": 0.587, + "step": 18780 + }, + { + "epoch": 1.879, + "grad_norm": 64.90528869628906, + "learning_rate": 6.2446e-06, + "loss": 0.7563, + "step": 18790 + }, + { + "epoch": 1.88, + "grad_norm": 31.6854190826416, + "learning_rate": 6.242600000000001e-06, + "loss": 0.6346, + "step": 18800 + }, + { + "epoch": 1.881, + "grad_norm": 38.89235305786133, + "learning_rate": 6.2406e-06, + "loss": 0.7503, + "step": 18810 + }, + { + "epoch": 1.8820000000000001, + "grad_norm": 24.65252685546875, + "learning_rate": 6.2386e-06, + "loss": 0.7953, + "step": 18820 + }, + { + "epoch": 1.883, + "grad_norm": 46.37358474731445, + "learning_rate": 6.2366000000000006e-06, + "loss": 0.7451, + "step": 18830 + }, + { + "epoch": 1.884, + "grad_norm": 80.3861083984375, + "learning_rate": 6.2346e-06, + "loss": 0.8484, + "step": 18840 + }, + { + "epoch": 1.885, + "grad_norm": 36.111209869384766, + "learning_rate": 6.232600000000001e-06, + "loss": 0.7848, + "step": 18850 + }, + { + "epoch": 1.8860000000000001, + "grad_norm": 31.149621963500977, + "learning_rate": 6.2306e-06, + "loss": 0.8536, + "step": 18860 + }, + { + "epoch": 1.887, + "grad_norm": 31.306196212768555, + "learning_rate": 6.2286e-06, + "loss": 0.6689, + "step": 18870 + }, + { + "epoch": 1.888, + "grad_norm": 46.65980911254883, + "learning_rate": 6.226600000000001e-06, + "loss": 0.656, + "step": 18880 + }, + { + "epoch": 1.889, + "grad_norm": 37.753135681152344, + "learning_rate": 6.2246000000000005e-06, + "loss": 0.7444, + "step": 18890 + }, + { + "epoch": 1.8900000000000001, + "grad_norm": 36.70481491088867, + "learning_rate": 6.222600000000001e-06, + "loss": 0.5093, + "step": 18900 + }, + { + "epoch": 1.891, + "grad_norm": 25.8837833404541, + "learning_rate": 6.2206e-06, + "loss": 0.7465, + "step": 18910 + }, + { + "epoch": 1.892, + "grad_norm": 16.100866317749023, + "learning_rate": 6.2186e-06, + "loss": 0.5148, + "step": 18920 + }, + { + "epoch": 1.893, + "grad_norm": 37.50735092163086, + "learning_rate": 6.216600000000001e-06, + "loss": 0.5519, + "step": 18930 + }, + { + "epoch": 1.8940000000000001, + "grad_norm": 37.880340576171875, + "learning_rate": 6.214600000000001e-06, + "loss": 0.6402, + "step": 18940 + }, + { + "epoch": 1.895, + "grad_norm": 19.375473022460938, + "learning_rate": 6.2126000000000005e-06, + "loss": 0.4894, + "step": 18950 + }, + { + "epoch": 1.896, + "grad_norm": 24.145061492919922, + "learning_rate": 6.2106e-06, + "loss": 0.766, + "step": 18960 + }, + { + "epoch": 1.897, + "grad_norm": 9.419266700744629, + "learning_rate": 6.2086e-06, + "loss": 0.8279, + "step": 18970 + }, + { + "epoch": 1.8980000000000001, + "grad_norm": 19.154048919677734, + "learning_rate": 6.2066e-06, + "loss": 0.6761, + "step": 18980 + }, + { + "epoch": 1.899, + "grad_norm": 33.68185043334961, + "learning_rate": 6.204600000000001e-06, + "loss": 0.5217, + "step": 18990 + }, + { + "epoch": 1.9, + "grad_norm": 23.5676212310791, + "learning_rate": 6.2026000000000006e-06, + "loss": 0.619, + "step": 19000 + }, + { + "epoch": 1.901, + "grad_norm": 28.72123146057129, + "learning_rate": 6.2005999999999996e-06, + "loss": 0.8742, + "step": 19010 + }, + { + "epoch": 1.9020000000000001, + "grad_norm": 9.565010070800781, + "learning_rate": 6.1986e-06, + "loss": 0.5942, + "step": 19020 + }, + { + "epoch": 1.903, + "grad_norm": 20.975059509277344, + "learning_rate": 6.1966e-06, + "loss": 0.5888, + "step": 19030 + }, + { + "epoch": 1.904, + "grad_norm": 15.559560775756836, + "learning_rate": 6.194600000000001e-06, + "loss": 0.5595, + "step": 19040 + }, + { + "epoch": 1.905, + "grad_norm": 31.841747283935547, + "learning_rate": 6.192600000000001e-06, + "loss": 1.067, + "step": 19050 + }, + { + "epoch": 1.9060000000000001, + "grad_norm": 7.151304244995117, + "learning_rate": 6.190600000000001e-06, + "loss": 0.6589, + "step": 19060 + }, + { + "epoch": 1.907, + "grad_norm": 42.31881332397461, + "learning_rate": 6.1886e-06, + "loss": 0.7246, + "step": 19070 + }, + { + "epoch": 1.908, + "grad_norm": 10.477072715759277, + "learning_rate": 6.1866e-06, + "loss": 0.5011, + "step": 19080 + }, + { + "epoch": 1.909, + "grad_norm": 24.97276496887207, + "learning_rate": 6.184600000000001e-06, + "loss": 0.712, + "step": 19090 + }, + { + "epoch": 1.9100000000000001, + "grad_norm": 56.96651840209961, + "learning_rate": 6.182600000000001e-06, + "loss": 0.7178, + "step": 19100 + }, + { + "epoch": 1.911, + "grad_norm": 39.4375, + "learning_rate": 6.1806000000000014e-06, + "loss": 0.666, + "step": 19110 + }, + { + "epoch": 1.912, + "grad_norm": 66.98754119873047, + "learning_rate": 6.1786000000000004e-06, + "loss": 1.0206, + "step": 19120 + }, + { + "epoch": 1.913, + "grad_norm": 29.384794235229492, + "learning_rate": 6.1766e-06, + "loss": 0.8413, + "step": 19130 + }, + { + "epoch": 1.9140000000000001, + "grad_norm": 22.646846771240234, + "learning_rate": 6.174600000000001e-06, + "loss": 0.7687, + "step": 19140 + }, + { + "epoch": 1.915, + "grad_norm": 32.5865592956543, + "learning_rate": 6.172600000000001e-06, + "loss": 0.7559, + "step": 19150 + }, + { + "epoch": 1.916, + "grad_norm": 14.892390251159668, + "learning_rate": 6.170600000000001e-06, + "loss": 0.7594, + "step": 19160 + }, + { + "epoch": 1.917, + "grad_norm": 32.36692810058594, + "learning_rate": 6.1686000000000005e-06, + "loss": 0.6105, + "step": 19170 + }, + { + "epoch": 1.9180000000000001, + "grad_norm": 22.858366012573242, + "learning_rate": 6.1666e-06, + "loss": 0.6479, + "step": 19180 + }, + { + "epoch": 1.919, + "grad_norm": 34.3131217956543, + "learning_rate": 6.1646e-06, + "loss": 0.6657, + "step": 19190 + }, + { + "epoch": 1.92, + "grad_norm": 14.601320266723633, + "learning_rate": 6.162600000000001e-06, + "loss": 0.6711, + "step": 19200 + }, + { + "epoch": 1.921, + "grad_norm": 27.481473922729492, + "learning_rate": 6.160600000000001e-06, + "loss": 0.6777, + "step": 19210 + }, + { + "epoch": 1.9220000000000002, + "grad_norm": 19.15306282043457, + "learning_rate": 6.1586e-06, + "loss": 0.8164, + "step": 19220 + }, + { + "epoch": 1.923, + "grad_norm": 18.720388412475586, + "learning_rate": 6.1566000000000005e-06, + "loss": 0.5016, + "step": 19230 + }, + { + "epoch": 1.924, + "grad_norm": 21.875688552856445, + "learning_rate": 6.1546e-06, + "loss": 0.6871, + "step": 19240 + }, + { + "epoch": 1.925, + "grad_norm": 19.90581512451172, + "learning_rate": 6.152600000000001e-06, + "loss": 0.8132, + "step": 19250 + }, + { + "epoch": 1.9260000000000002, + "grad_norm": 38.14246368408203, + "learning_rate": 6.150600000000001e-06, + "loss": 0.6122, + "step": 19260 + }, + { + "epoch": 1.927, + "grad_norm": 16.996580123901367, + "learning_rate": 6.1486e-06, + "loss": 0.561, + "step": 19270 + }, + { + "epoch": 1.928, + "grad_norm": 15.28274154663086, + "learning_rate": 6.146600000000001e-06, + "loss": 0.4495, + "step": 19280 + }, + { + "epoch": 1.929, + "grad_norm": 32.38314437866211, + "learning_rate": 6.1446000000000004e-06, + "loss": 0.6292, + "step": 19290 + }, + { + "epoch": 1.9300000000000002, + "grad_norm": 19.988807678222656, + "learning_rate": 6.142600000000001e-06, + "loss": 0.7398, + "step": 19300 + }, + { + "epoch": 1.931, + "grad_norm": 40.12685012817383, + "learning_rate": 6.140600000000001e-06, + "loss": 0.8673, + "step": 19310 + }, + { + "epoch": 1.932, + "grad_norm": 23.791723251342773, + "learning_rate": 6.1386e-06, + "loss": 0.7318, + "step": 19320 + }, + { + "epoch": 1.933, + "grad_norm": 5.818343162536621, + "learning_rate": 6.136600000000001e-06, + "loss": 0.4892, + "step": 19330 + }, + { + "epoch": 1.9340000000000002, + "grad_norm": 32.458740234375, + "learning_rate": 6.1346000000000005e-06, + "loss": 0.4111, + "step": 19340 + }, + { + "epoch": 1.935, + "grad_norm": 23.6914119720459, + "learning_rate": 6.1326e-06, + "loss": 0.417, + "step": 19350 + }, + { + "epoch": 1.936, + "grad_norm": 39.17317199707031, + "learning_rate": 6.130600000000001e-06, + "loss": 0.8256, + "step": 19360 + }, + { + "epoch": 1.937, + "grad_norm": 4.7436957359313965, + "learning_rate": 6.1286e-06, + "loss": 0.5982, + "step": 19370 + }, + { + "epoch": 1.938, + "grad_norm": 16.462995529174805, + "learning_rate": 6.1266e-06, + "loss": 0.7216, + "step": 19380 + }, + { + "epoch": 1.939, + "grad_norm": 12.548359870910645, + "learning_rate": 6.124600000000001e-06, + "loss": 0.7207, + "step": 19390 + }, + { + "epoch": 1.94, + "grad_norm": 11.830649375915527, + "learning_rate": 6.1226000000000005e-06, + "loss": 0.4089, + "step": 19400 + }, + { + "epoch": 1.9409999999999998, + "grad_norm": 12.117684364318848, + "learning_rate": 6.120600000000001e-06, + "loss": 0.5172, + "step": 19410 + }, + { + "epoch": 1.942, + "grad_norm": 25.5642147064209, + "learning_rate": 6.1186e-06, + "loss": 0.4171, + "step": 19420 + }, + { + "epoch": 1.943, + "grad_norm": 42.071712493896484, + "learning_rate": 6.1166e-06, + "loss": 0.7356, + "step": 19430 + }, + { + "epoch": 1.944, + "grad_norm": 2.329131603240967, + "learning_rate": 6.114600000000001e-06, + "loss": 0.4786, + "step": 19440 + }, + { + "epoch": 1.9449999999999998, + "grad_norm": 41.95789337158203, + "learning_rate": 6.1126000000000006e-06, + "loss": 0.5854, + "step": 19450 + }, + { + "epoch": 1.946, + "grad_norm": 14.594151496887207, + "learning_rate": 6.110600000000001e-06, + "loss": 0.687, + "step": 19460 + }, + { + "epoch": 1.947, + "grad_norm": 74.92787170410156, + "learning_rate": 6.1086e-06, + "loss": 0.8889, + "step": 19470 + }, + { + "epoch": 1.948, + "grad_norm": 42.688167572021484, + "learning_rate": 6.1066e-06, + "loss": 0.507, + "step": 19480 + }, + { + "epoch": 1.9489999999999998, + "grad_norm": 37.9570198059082, + "learning_rate": 6.104600000000001e-06, + "loss": 0.7797, + "step": 19490 + }, + { + "epoch": 1.95, + "grad_norm": 29.89501953125, + "learning_rate": 6.102600000000001e-06, + "loss": 0.7377, + "step": 19500 + }, + { + "epoch": 1.951, + "grad_norm": 8.201680183410645, + "learning_rate": 6.100600000000001e-06, + "loss": 0.9134, + "step": 19510 + }, + { + "epoch": 1.952, + "grad_norm": 28.5087890625, + "learning_rate": 6.0986e-06, + "loss": 0.6407, + "step": 19520 + }, + { + "epoch": 1.9529999999999998, + "grad_norm": 13.32079029083252, + "learning_rate": 6.0966e-06, + "loss": 0.6662, + "step": 19530 + }, + { + "epoch": 1.954, + "grad_norm": 16.217845916748047, + "learning_rate": 6.094600000000001e-06, + "loss": 0.5368, + "step": 19540 + }, + { + "epoch": 1.955, + "grad_norm": 25.147905349731445, + "learning_rate": 6.092600000000001e-06, + "loss": 0.7631, + "step": 19550 + }, + { + "epoch": 1.956, + "grad_norm": 36.97098922729492, + "learning_rate": 6.090600000000001e-06, + "loss": 0.7657, + "step": 19560 + }, + { + "epoch": 1.9569999999999999, + "grad_norm": 32.233978271484375, + "learning_rate": 6.0886000000000005e-06, + "loss": 0.7187, + "step": 19570 + }, + { + "epoch": 1.958, + "grad_norm": 22.951942443847656, + "learning_rate": 6.0866e-06, + "loss": 0.8062, + "step": 19580 + }, + { + "epoch": 1.959, + "grad_norm": 23.635204315185547, + "learning_rate": 6.0846e-06, + "loss": 0.6772, + "step": 19590 + }, + { + "epoch": 1.96, + "grad_norm": 33.38811492919922, + "learning_rate": 6.082600000000001e-06, + "loss": 0.6484, + "step": 19600 + }, + { + "epoch": 1.9609999999999999, + "grad_norm": 22.453102111816406, + "learning_rate": 6.080600000000001e-06, + "loss": 0.7624, + "step": 19610 + }, + { + "epoch": 1.962, + "grad_norm": 50.9228401184082, + "learning_rate": 6.0786e-06, + "loss": 0.7692, + "step": 19620 + }, + { + "epoch": 1.963, + "grad_norm": 12.734217643737793, + "learning_rate": 6.0766e-06, + "loss": 0.8659, + "step": 19630 + }, + { + "epoch": 1.964, + "grad_norm": 22.461109161376953, + "learning_rate": 6.0746e-06, + "loss": 0.8955, + "step": 19640 + }, + { + "epoch": 1.9649999999999999, + "grad_norm": 31.198320388793945, + "learning_rate": 6.072600000000001e-06, + "loss": 0.6954, + "step": 19650 + }, + { + "epoch": 1.966, + "grad_norm": 17.338960647583008, + "learning_rate": 6.070600000000001e-06, + "loss": 0.6051, + "step": 19660 + }, + { + "epoch": 1.967, + "grad_norm": 19.843788146972656, + "learning_rate": 6.0686e-06, + "loss": 0.6954, + "step": 19670 + }, + { + "epoch": 1.968, + "grad_norm": 28.969507217407227, + "learning_rate": 6.0666000000000005e-06, + "loss": 0.5193, + "step": 19680 + }, + { + "epoch": 1.9689999999999999, + "grad_norm": 13.265837669372559, + "learning_rate": 6.0646e-06, + "loss": 0.6375, + "step": 19690 + }, + { + "epoch": 1.97, + "grad_norm": 8.410111427307129, + "learning_rate": 6.062600000000001e-06, + "loss": 0.7512, + "step": 19700 + }, + { + "epoch": 1.971, + "grad_norm": 34.03276062011719, + "learning_rate": 6.060600000000001e-06, + "loss": 0.6919, + "step": 19710 + }, + { + "epoch": 1.972, + "grad_norm": 9.901426315307617, + "learning_rate": 6.0586e-06, + "loss": 0.4584, + "step": 19720 + }, + { + "epoch": 1.9729999999999999, + "grad_norm": 26.24072265625, + "learning_rate": 6.056600000000001e-06, + "loss": 0.5403, + "step": 19730 + }, + { + "epoch": 1.974, + "grad_norm": 38.819496154785156, + "learning_rate": 6.0546000000000004e-06, + "loss": 0.7342, + "step": 19740 + }, + { + "epoch": 1.975, + "grad_norm": 30.7410888671875, + "learning_rate": 6.0526e-06, + "loss": 0.5699, + "step": 19750 + }, + { + "epoch": 1.976, + "grad_norm": 19.44249153137207, + "learning_rate": 6.050600000000001e-06, + "loss": 0.5331, + "step": 19760 + }, + { + "epoch": 1.9769999999999999, + "grad_norm": 5.10331392288208, + "learning_rate": 6.0486e-06, + "loss": 0.5346, + "step": 19770 + }, + { + "epoch": 1.978, + "grad_norm": 21.655014038085938, + "learning_rate": 6.0466e-06, + "loss": 0.7944, + "step": 19780 + }, + { + "epoch": 1.979, + "grad_norm": 29.06139373779297, + "learning_rate": 6.0446000000000005e-06, + "loss": 0.8133, + "step": 19790 + }, + { + "epoch": 1.98, + "grad_norm": 41.67628479003906, + "learning_rate": 6.0426e-06, + "loss": 0.6099, + "step": 19800 + }, + { + "epoch": 1.9809999999999999, + "grad_norm": 58.43052673339844, + "learning_rate": 6.040600000000001e-06, + "loss": 0.7782, + "step": 19810 + }, + { + "epoch": 1.982, + "grad_norm": 24.17974853515625, + "learning_rate": 6.0386e-06, + "loss": 0.5837, + "step": 19820 + }, + { + "epoch": 1.983, + "grad_norm": 20.998645782470703, + "learning_rate": 6.0366e-06, + "loss": 0.8658, + "step": 19830 + }, + { + "epoch": 1.984, + "grad_norm": 9.17137336730957, + "learning_rate": 6.034600000000001e-06, + "loss": 0.5392, + "step": 19840 + }, + { + "epoch": 1.9849999999999999, + "grad_norm": 21.920196533203125, + "learning_rate": 6.0326000000000005e-06, + "loss": 0.6472, + "step": 19850 + }, + { + "epoch": 1.986, + "grad_norm": 11.771697044372559, + "learning_rate": 6.030600000000001e-06, + "loss": 0.6313, + "step": 19860 + }, + { + "epoch": 1.987, + "grad_norm": 24.2471981048584, + "learning_rate": 6.0286e-06, + "loss": 0.6562, + "step": 19870 + }, + { + "epoch": 1.988, + "grad_norm": 22.034807205200195, + "learning_rate": 6.0266e-06, + "loss": 0.6229, + "step": 19880 + }, + { + "epoch": 1.9889999999999999, + "grad_norm": 36.45222854614258, + "learning_rate": 6.024600000000001e-06, + "loss": 0.7467, + "step": 19890 + }, + { + "epoch": 1.99, + "grad_norm": 33.47389221191406, + "learning_rate": 6.022600000000001e-06, + "loss": 0.6745, + "step": 19900 + }, + { + "epoch": 1.991, + "grad_norm": 68.49720764160156, + "learning_rate": 6.020600000000001e-06, + "loss": 0.7931, + "step": 19910 + }, + { + "epoch": 1.992, + "grad_norm": 44.77395248413086, + "learning_rate": 6.0186e-06, + "loss": 0.7077, + "step": 19920 + }, + { + "epoch": 1.9929999999999999, + "grad_norm": 26.6000919342041, + "learning_rate": 6.0166e-06, + "loss": 0.7528, + "step": 19930 + }, + { + "epoch": 1.994, + "grad_norm": 26.399738311767578, + "learning_rate": 6.014600000000001e-06, + "loss": 0.3916, + "step": 19940 + }, + { + "epoch": 1.995, + "grad_norm": 12.024267196655273, + "learning_rate": 6.012600000000001e-06, + "loss": 0.838, + "step": 19950 + }, + { + "epoch": 1.996, + "grad_norm": 0.7568420171737671, + "learning_rate": 6.0106000000000005e-06, + "loss": 0.5821, + "step": 19960 + }, + { + "epoch": 1.9969999999999999, + "grad_norm": 19.045326232910156, + "learning_rate": 6.0086e-06, + "loss": 0.4877, + "step": 19970 + }, + { + "epoch": 1.998, + "grad_norm": 30.190324783325195, + "learning_rate": 6.0066e-06, + "loss": 0.8412, + "step": 19980 + }, + { + "epoch": 1.999, + "grad_norm": 20.587814331054688, + "learning_rate": 6.0046e-06, + "loss": 0.7776, + "step": 19990 + }, + { + "epoch": 2.0, + "grad_norm": 47.415870666503906, + "learning_rate": 6.002600000000001e-06, + "loss": 0.8361, + "step": 20000 + }, + { + "epoch": 2.001, + "grad_norm": 34.35898971557617, + "learning_rate": 6.000600000000001e-06, + "loss": 0.6689, + "step": 20010 + }, + { + "epoch": 2.002, + "grad_norm": 22.506715774536133, + "learning_rate": 5.9986e-06, + "loss": 0.4416, + "step": 20020 + }, + { + "epoch": 2.003, + "grad_norm": 33.02839279174805, + "learning_rate": 5.9966e-06, + "loss": 0.6362, + "step": 20030 + }, + { + "epoch": 2.004, + "grad_norm": 52.246097564697266, + "learning_rate": 5.9946e-06, + "loss": 0.6402, + "step": 20040 + }, + { + "epoch": 2.005, + "grad_norm": 17.559724807739258, + "learning_rate": 5.992600000000001e-06, + "loss": 0.6283, + "step": 20050 + }, + { + "epoch": 2.006, + "grad_norm": 41.119911193847656, + "learning_rate": 5.990600000000001e-06, + "loss": 0.856, + "step": 20060 + }, + { + "epoch": 2.007, + "grad_norm": 39.30539321899414, + "learning_rate": 5.9886e-06, + "loss": 0.6278, + "step": 20070 + }, + { + "epoch": 2.008, + "grad_norm": 30.116628646850586, + "learning_rate": 5.9866e-06, + "loss": 0.5567, + "step": 20080 + }, + { + "epoch": 2.009, + "grad_norm": 42.305233001708984, + "learning_rate": 5.9846e-06, + "loss": 0.5619, + "step": 20090 + }, + { + "epoch": 2.01, + "grad_norm": 23.330278396606445, + "learning_rate": 5.982600000000001e-06, + "loss": 0.5062, + "step": 20100 + }, + { + "epoch": 2.011, + "grad_norm": 0.366443008184433, + "learning_rate": 5.980600000000001e-06, + "loss": 0.5095, + "step": 20110 + }, + { + "epoch": 2.012, + "grad_norm": 22.624025344848633, + "learning_rate": 5.9786e-06, + "loss": 0.5983, + "step": 20120 + }, + { + "epoch": 2.013, + "grad_norm": 55.463741302490234, + "learning_rate": 5.9766000000000005e-06, + "loss": 0.7777, + "step": 20130 + }, + { + "epoch": 2.014, + "grad_norm": 26.516307830810547, + "learning_rate": 5.9746e-06, + "loss": 0.5525, + "step": 20140 + }, + { + "epoch": 2.015, + "grad_norm": 24.06287384033203, + "learning_rate": 5.9726e-06, + "loss": 0.7574, + "step": 20150 + }, + { + "epoch": 2.016, + "grad_norm": 50.38890075683594, + "learning_rate": 5.970600000000001e-06, + "loss": 0.5241, + "step": 20160 + }, + { + "epoch": 2.017, + "grad_norm": 33.88356018066406, + "learning_rate": 5.9686e-06, + "loss": 0.4269, + "step": 20170 + }, + { + "epoch": 2.018, + "grad_norm": 26.697917938232422, + "learning_rate": 5.9666e-06, + "loss": 0.6051, + "step": 20180 + }, + { + "epoch": 2.019, + "grad_norm": 15.631553649902344, + "learning_rate": 5.9646000000000005e-06, + "loss": 0.6533, + "step": 20190 + }, + { + "epoch": 2.02, + "grad_norm": 12.76919937133789, + "learning_rate": 5.9626e-06, + "loss": 0.7292, + "step": 20200 + }, + { + "epoch": 2.021, + "grad_norm": 29.44046974182129, + "learning_rate": 5.960600000000001e-06, + "loss": 0.7371, + "step": 20210 + }, + { + "epoch": 2.022, + "grad_norm": 26.83331298828125, + "learning_rate": 5.9586e-06, + "loss": 0.8509, + "step": 20220 + }, + { + "epoch": 2.023, + "grad_norm": 18.271591186523438, + "learning_rate": 5.9566e-06, + "loss": 0.5269, + "step": 20230 + }, + { + "epoch": 2.024, + "grad_norm": 81.81536102294922, + "learning_rate": 5.9546000000000006e-06, + "loss": 0.7498, + "step": 20240 + }, + { + "epoch": 2.025, + "grad_norm": 42.30961608886719, + "learning_rate": 5.9526e-06, + "loss": 0.5365, + "step": 20250 + }, + { + "epoch": 2.026, + "grad_norm": 22.208972930908203, + "learning_rate": 5.950600000000001e-06, + "loss": 0.5537, + "step": 20260 + }, + { + "epoch": 2.027, + "grad_norm": 1.5519413948059082, + "learning_rate": 5.948600000000001e-06, + "loss": 0.7741, + "step": 20270 + }, + { + "epoch": 2.028, + "grad_norm": 24.482988357543945, + "learning_rate": 5.9466e-06, + "loss": 0.4574, + "step": 20280 + }, + { + "epoch": 2.029, + "grad_norm": 7.925387382507324, + "learning_rate": 5.944600000000001e-06, + "loss": 0.6198, + "step": 20290 + }, + { + "epoch": 2.03, + "grad_norm": 35.66553497314453, + "learning_rate": 5.9426000000000005e-06, + "loss": 0.8091, + "step": 20300 + }, + { + "epoch": 2.031, + "grad_norm": 36.801700592041016, + "learning_rate": 5.940600000000001e-06, + "loss": 0.5905, + "step": 20310 + }, + { + "epoch": 2.032, + "grad_norm": 16.00217628479004, + "learning_rate": 5.938600000000001e-06, + "loss": 0.6229, + "step": 20320 + }, + { + "epoch": 2.033, + "grad_norm": 41.883907318115234, + "learning_rate": 5.9366e-06, + "loss": 0.617, + "step": 20330 + }, + { + "epoch": 2.034, + "grad_norm": 42.81610870361328, + "learning_rate": 5.934600000000001e-06, + "loss": 0.5014, + "step": 20340 + }, + { + "epoch": 2.035, + "grad_norm": 40.49055862426758, + "learning_rate": 5.932600000000001e-06, + "loss": 0.4363, + "step": 20350 + }, + { + "epoch": 2.036, + "grad_norm": 21.71941566467285, + "learning_rate": 5.9306000000000004e-06, + "loss": 0.853, + "step": 20360 + }, + { + "epoch": 2.037, + "grad_norm": 63.479583740234375, + "learning_rate": 5.928600000000001e-06, + "loss": 0.8359, + "step": 20370 + }, + { + "epoch": 2.038, + "grad_norm": 11.940189361572266, + "learning_rate": 5.9266e-06, + "loss": 0.4842, + "step": 20380 + }, + { + "epoch": 2.039, + "grad_norm": 13.797748565673828, + "learning_rate": 5.9246e-06, + "loss": 0.3111, + "step": 20390 + }, + { + "epoch": 2.04, + "grad_norm": 26.523082733154297, + "learning_rate": 5.922600000000001e-06, + "loss": 0.8263, + "step": 20400 + }, + { + "epoch": 2.041, + "grad_norm": 15.565614700317383, + "learning_rate": 5.9206000000000005e-06, + "loss": 0.2726, + "step": 20410 + }, + { + "epoch": 2.042, + "grad_norm": 23.820993423461914, + "learning_rate": 5.918600000000001e-06, + "loss": 0.8273, + "step": 20420 + }, + { + "epoch": 2.043, + "grad_norm": 22.535480499267578, + "learning_rate": 5.9166e-06, + "loss": 0.614, + "step": 20430 + }, + { + "epoch": 2.044, + "grad_norm": 60.1475830078125, + "learning_rate": 5.9146e-06, + "loss": 0.7601, + "step": 20440 + }, + { + "epoch": 2.045, + "grad_norm": 27.413978576660156, + "learning_rate": 5.912600000000001e-06, + "loss": 0.7644, + "step": 20450 + }, + { + "epoch": 2.046, + "grad_norm": 7.223193645477295, + "learning_rate": 5.910600000000001e-06, + "loss": 0.5678, + "step": 20460 + }, + { + "epoch": 2.047, + "grad_norm": 15.275238990783691, + "learning_rate": 5.908600000000001e-06, + "loss": 0.7498, + "step": 20470 + }, + { + "epoch": 2.048, + "grad_norm": 68.40691375732422, + "learning_rate": 5.9066e-06, + "loss": 0.9087, + "step": 20480 + }, + { + "epoch": 2.049, + "grad_norm": 18.788619995117188, + "learning_rate": 5.9046e-06, + "loss": 0.8295, + "step": 20490 + }, + { + "epoch": 2.05, + "grad_norm": 57.324222564697266, + "learning_rate": 5.902600000000001e-06, + "loss": 0.7998, + "step": 20500 + }, + { + "epoch": 2.051, + "grad_norm": 6.82062292098999, + "learning_rate": 5.900600000000001e-06, + "loss": 0.4767, + "step": 20510 + }, + { + "epoch": 2.052, + "grad_norm": 35.272586822509766, + "learning_rate": 5.898600000000001e-06, + "loss": 0.5846, + "step": 20520 + }, + { + "epoch": 2.053, + "grad_norm": 29.768474578857422, + "learning_rate": 5.8966000000000004e-06, + "loss": 0.6642, + "step": 20530 + }, + { + "epoch": 2.054, + "grad_norm": 29.56476593017578, + "learning_rate": 5.8946e-06, + "loss": 0.7453, + "step": 20540 + }, + { + "epoch": 2.055, + "grad_norm": 55.21421813964844, + "learning_rate": 5.8926e-06, + "loss": 0.6613, + "step": 20550 + }, + { + "epoch": 2.056, + "grad_norm": 38.4114875793457, + "learning_rate": 5.890600000000001e-06, + "loss": 0.6084, + "step": 20560 + }, + { + "epoch": 2.057, + "grad_norm": 6.092259883880615, + "learning_rate": 5.888600000000001e-06, + "loss": 0.3543, + "step": 20570 + }, + { + "epoch": 2.058, + "grad_norm": 30.972808837890625, + "learning_rate": 5.8866e-06, + "loss": 0.6939, + "step": 20580 + }, + { + "epoch": 2.059, + "grad_norm": 48.587074279785156, + "learning_rate": 5.8846e-06, + "loss": 0.7431, + "step": 20590 + }, + { + "epoch": 2.06, + "grad_norm": 8.721481323242188, + "learning_rate": 5.8826e-06, + "loss": 0.6013, + "step": 20600 + }, + { + "epoch": 2.061, + "grad_norm": 55.05281066894531, + "learning_rate": 5.880600000000001e-06, + "loss": 0.8299, + "step": 20610 + }, + { + "epoch": 2.062, + "grad_norm": 13.547633171081543, + "learning_rate": 5.878600000000001e-06, + "loss": 0.6287, + "step": 20620 + }, + { + "epoch": 2.063, + "grad_norm": 43.30170440673828, + "learning_rate": 5.8766e-06, + "loss": 0.9062, + "step": 20630 + }, + { + "epoch": 2.064, + "grad_norm": 69.8653335571289, + "learning_rate": 5.8746000000000005e-06, + "loss": 0.6167, + "step": 20640 + }, + { + "epoch": 2.065, + "grad_norm": 13.36156940460205, + "learning_rate": 5.8726e-06, + "loss": 0.5346, + "step": 20650 + }, + { + "epoch": 2.066, + "grad_norm": 55.982967376708984, + "learning_rate": 5.870600000000001e-06, + "loss": 0.6719, + "step": 20660 + }, + { + "epoch": 2.067, + "grad_norm": 19.473783493041992, + "learning_rate": 5.868600000000001e-06, + "loss": 0.6472, + "step": 20670 + }, + { + "epoch": 2.068, + "grad_norm": 47.84219741821289, + "learning_rate": 5.8666e-06, + "loss": 0.6803, + "step": 20680 + }, + { + "epoch": 2.069, + "grad_norm": 22.7913761138916, + "learning_rate": 5.8646000000000006e-06, + "loss": 0.9931, + "step": 20690 + }, + { + "epoch": 2.07, + "grad_norm": 30.682554244995117, + "learning_rate": 5.8626e-06, + "loss": 0.5268, + "step": 20700 + }, + { + "epoch": 2.071, + "grad_norm": 30.542743682861328, + "learning_rate": 5.860600000000001e-06, + "loss": 0.5187, + "step": 20710 + }, + { + "epoch": 2.072, + "grad_norm": 33.08385467529297, + "learning_rate": 5.858600000000001e-06, + "loss": 0.7148, + "step": 20720 + }, + { + "epoch": 2.073, + "grad_norm": 43.1998291015625, + "learning_rate": 5.8566e-06, + "loss": 0.5574, + "step": 20730 + }, + { + "epoch": 2.074, + "grad_norm": 35.44502258300781, + "learning_rate": 5.854600000000001e-06, + "loss": 0.5502, + "step": 20740 + }, + { + "epoch": 2.075, + "grad_norm": 23.778690338134766, + "learning_rate": 5.8526000000000005e-06, + "loss": 0.7837, + "step": 20750 + }, + { + "epoch": 2.076, + "grad_norm": 15.768932342529297, + "learning_rate": 5.8506e-06, + "loss": 0.5188, + "step": 20760 + }, + { + "epoch": 2.077, + "grad_norm": 57.871437072753906, + "learning_rate": 5.848600000000001e-06, + "loss": 0.523, + "step": 20770 + }, + { + "epoch": 2.078, + "grad_norm": 45.32200622558594, + "learning_rate": 5.8466e-06, + "loss": 0.8176, + "step": 20780 + }, + { + "epoch": 2.079, + "grad_norm": 32.9245491027832, + "learning_rate": 5.8446e-06, + "loss": 0.5147, + "step": 20790 + }, + { + "epoch": 2.08, + "grad_norm": 44.795677185058594, + "learning_rate": 5.842600000000001e-06, + "loss": 0.745, + "step": 20800 + }, + { + "epoch": 2.081, + "grad_norm": 49.823299407958984, + "learning_rate": 5.8406000000000005e-06, + "loss": 0.7354, + "step": 20810 + }, + { + "epoch": 2.082, + "grad_norm": 6.510171413421631, + "learning_rate": 5.838600000000001e-06, + "loss": 0.4389, + "step": 20820 + }, + { + "epoch": 2.083, + "grad_norm": 24.822555541992188, + "learning_rate": 5.8366e-06, + "loss": 0.7396, + "step": 20830 + }, + { + "epoch": 2.084, + "grad_norm": 35.97170639038086, + "learning_rate": 5.8346e-06, + "loss": 0.5066, + "step": 20840 + }, + { + "epoch": 2.085, + "grad_norm": 62.91329574584961, + "learning_rate": 5.832600000000001e-06, + "loss": 1.0351, + "step": 20850 + }, + { + "epoch": 2.086, + "grad_norm": 40.740325927734375, + "learning_rate": 5.8306000000000006e-06, + "loss": 0.4666, + "step": 20860 + }, + { + "epoch": 2.087, + "grad_norm": 35.693050384521484, + "learning_rate": 5.828600000000001e-06, + "loss": 0.6951, + "step": 20870 + }, + { + "epoch": 2.088, + "grad_norm": 7.824371814727783, + "learning_rate": 5.8266e-06, + "loss": 0.6165, + "step": 20880 + }, + { + "epoch": 2.089, + "grad_norm": 18.26198959350586, + "learning_rate": 5.8246e-06, + "loss": 0.6349, + "step": 20890 + }, + { + "epoch": 2.09, + "grad_norm": 33.34101486206055, + "learning_rate": 5.822600000000001e-06, + "loss": 0.5883, + "step": 20900 + }, + { + "epoch": 2.091, + "grad_norm": 9.032116889953613, + "learning_rate": 5.820600000000001e-06, + "loss": 0.8677, + "step": 20910 + }, + { + "epoch": 2.092, + "grad_norm": 61.170379638671875, + "learning_rate": 5.8186000000000005e-06, + "loss": 0.6158, + "step": 20920 + }, + { + "epoch": 2.093, + "grad_norm": 21.601675033569336, + "learning_rate": 5.8166e-06, + "loss": 0.7612, + "step": 20930 + }, + { + "epoch": 2.094, + "grad_norm": 22.31391716003418, + "learning_rate": 5.8146e-06, + "loss": 0.6095, + "step": 20940 + }, + { + "epoch": 2.095, + "grad_norm": 33.5605583190918, + "learning_rate": 5.8126e-06, + "loss": 0.503, + "step": 20950 + }, + { + "epoch": 2.096, + "grad_norm": 15.684220314025879, + "learning_rate": 5.810600000000001e-06, + "loss": 0.5778, + "step": 20960 + }, + { + "epoch": 2.097, + "grad_norm": 45.926124572753906, + "learning_rate": 5.808600000000001e-06, + "loss": 0.7229, + "step": 20970 + }, + { + "epoch": 2.098, + "grad_norm": 27.19575309753418, + "learning_rate": 5.8066e-06, + "loss": 0.5593, + "step": 20980 + }, + { + "epoch": 2.099, + "grad_norm": 28.116910934448242, + "learning_rate": 5.8046e-06, + "loss": 0.8757, + "step": 20990 + }, + { + "epoch": 2.1, + "grad_norm": 16.367252349853516, + "learning_rate": 5.8026e-06, + "loss": 0.4212, + "step": 21000 + }, + { + "epoch": 2.101, + "grad_norm": 39.175296783447266, + "learning_rate": 5.800600000000001e-06, + "loss": 0.4437, + "step": 21010 + }, + { + "epoch": 2.102, + "grad_norm": 49.59600830078125, + "learning_rate": 5.798600000000001e-06, + "loss": 0.7073, + "step": 21020 + }, + { + "epoch": 2.103, + "grad_norm": 10.100749969482422, + "learning_rate": 5.7966e-06, + "loss": 0.8202, + "step": 21030 + }, + { + "epoch": 2.104, + "grad_norm": 29.555816650390625, + "learning_rate": 5.7946e-06, + "loss": 0.5118, + "step": 21040 + }, + { + "epoch": 2.105, + "grad_norm": 22.045875549316406, + "learning_rate": 5.7926e-06, + "loss": 0.4967, + "step": 21050 + }, + { + "epoch": 2.106, + "grad_norm": 43.3510856628418, + "learning_rate": 5.790600000000001e-06, + "loss": 0.9821, + "step": 21060 + }, + { + "epoch": 2.107, + "grad_norm": 43.989376068115234, + "learning_rate": 5.788600000000001e-06, + "loss": 0.5552, + "step": 21070 + }, + { + "epoch": 2.108, + "grad_norm": 30.01483917236328, + "learning_rate": 5.7866e-06, + "loss": 0.5964, + "step": 21080 + }, + { + "epoch": 2.109, + "grad_norm": 33.281497955322266, + "learning_rate": 5.7846000000000005e-06, + "loss": 0.7582, + "step": 21090 + }, + { + "epoch": 2.11, + "grad_norm": 69.60244750976562, + "learning_rate": 5.7826e-06, + "loss": 0.5143, + "step": 21100 + }, + { + "epoch": 2.111, + "grad_norm": 2.7783589363098145, + "learning_rate": 5.780600000000001e-06, + "loss": 0.6714, + "step": 21110 + }, + { + "epoch": 2.112, + "grad_norm": 60.577178955078125, + "learning_rate": 5.778600000000001e-06, + "loss": 0.5207, + "step": 21120 + }, + { + "epoch": 2.113, + "grad_norm": 29.13699722290039, + "learning_rate": 5.7766e-06, + "loss": 0.5172, + "step": 21130 + }, + { + "epoch": 2.114, + "grad_norm": 48.14480209350586, + "learning_rate": 5.774600000000001e-06, + "loss": 0.6832, + "step": 21140 + }, + { + "epoch": 2.115, + "grad_norm": 14.728019714355469, + "learning_rate": 5.7726000000000004e-06, + "loss": 0.8305, + "step": 21150 + }, + { + "epoch": 2.116, + "grad_norm": 5.194901466369629, + "learning_rate": 5.7706e-06, + "loss": 0.6312, + "step": 21160 + }, + { + "epoch": 2.117, + "grad_norm": 48.74283218383789, + "learning_rate": 5.768600000000001e-06, + "loss": 0.6754, + "step": 21170 + }, + { + "epoch": 2.118, + "grad_norm": 22.42253303527832, + "learning_rate": 5.7666e-06, + "loss": 0.8935, + "step": 21180 + }, + { + "epoch": 2.1189999999999998, + "grad_norm": 24.238758087158203, + "learning_rate": 5.7646e-06, + "loss": 0.6695, + "step": 21190 + }, + { + "epoch": 2.12, + "grad_norm": 39.905906677246094, + "learning_rate": 5.7626000000000005e-06, + "loss": 0.6241, + "step": 21200 + }, + { + "epoch": 2.121, + "grad_norm": 35.234169006347656, + "learning_rate": 5.7606e-06, + "loss": 0.6531, + "step": 21210 + }, + { + "epoch": 2.122, + "grad_norm": 12.21348762512207, + "learning_rate": 5.758600000000001e-06, + "loss": 0.7264, + "step": 21220 + }, + { + "epoch": 2.123, + "grad_norm": 32.99053192138672, + "learning_rate": 5.7566e-06, + "loss": 0.4499, + "step": 21230 + }, + { + "epoch": 2.124, + "grad_norm": 53.96305465698242, + "learning_rate": 5.7546e-06, + "loss": 0.7433, + "step": 21240 + }, + { + "epoch": 2.125, + "grad_norm": 9.550210952758789, + "learning_rate": 5.752600000000001e-06, + "loss": 0.7014, + "step": 21250 + }, + { + "epoch": 2.126, + "grad_norm": 1.4421324729919434, + "learning_rate": 5.7506000000000005e-06, + "loss": 0.4575, + "step": 21260 + }, + { + "epoch": 2.127, + "grad_norm": 13.814619064331055, + "learning_rate": 5.748600000000001e-06, + "loss": 0.5578, + "step": 21270 + }, + { + "epoch": 2.128, + "grad_norm": 1.734191656112671, + "learning_rate": 5.7466e-06, + "loss": 0.8801, + "step": 21280 + }, + { + "epoch": 2.129, + "grad_norm": 11.361309051513672, + "learning_rate": 5.7446e-06, + "loss": 0.4905, + "step": 21290 + }, + { + "epoch": 2.13, + "grad_norm": 50.14918899536133, + "learning_rate": 5.742600000000001e-06, + "loss": 0.7295, + "step": 21300 + }, + { + "epoch": 2.1310000000000002, + "grad_norm": 34.93165588378906, + "learning_rate": 5.7406000000000006e-06, + "loss": 0.7161, + "step": 21310 + }, + { + "epoch": 2.132, + "grad_norm": 22.720502853393555, + "learning_rate": 5.7386e-06, + "loss": 0.5065, + "step": 21320 + }, + { + "epoch": 2.133, + "grad_norm": 39.20326614379883, + "learning_rate": 5.7366e-06, + "loss": 0.7608, + "step": 21330 + }, + { + "epoch": 2.134, + "grad_norm": 39.12361145019531, + "learning_rate": 5.7346e-06, + "loss": 0.6045, + "step": 21340 + }, + { + "epoch": 2.135, + "grad_norm": 19.994277954101562, + "learning_rate": 5.7326e-06, + "loss": 0.3712, + "step": 21350 + }, + { + "epoch": 2.136, + "grad_norm": 66.90753173828125, + "learning_rate": 5.730600000000001e-06, + "loss": 0.6658, + "step": 21360 + }, + { + "epoch": 2.137, + "grad_norm": 36.23524856567383, + "learning_rate": 5.7286000000000005e-06, + "loss": 0.6202, + "step": 21370 + }, + { + "epoch": 2.138, + "grad_norm": 4.688386917114258, + "learning_rate": 5.7265999999999995e-06, + "loss": 0.6627, + "step": 21380 + }, + { + "epoch": 2.1390000000000002, + "grad_norm": 24.567197799682617, + "learning_rate": 5.7246e-06, + "loss": 0.6649, + "step": 21390 + }, + { + "epoch": 2.14, + "grad_norm": 6.481902599334717, + "learning_rate": 5.7226e-06, + "loss": 0.8944, + "step": 21400 + }, + { + "epoch": 2.141, + "grad_norm": 18.482616424560547, + "learning_rate": 5.720600000000001e-06, + "loss": 0.4641, + "step": 21410 + }, + { + "epoch": 2.142, + "grad_norm": 15.100506782531738, + "learning_rate": 5.718600000000001e-06, + "loss": 0.502, + "step": 21420 + }, + { + "epoch": 2.143, + "grad_norm": 16.71766471862793, + "learning_rate": 5.7166e-06, + "loss": 0.7693, + "step": 21430 + }, + { + "epoch": 2.144, + "grad_norm": 37.1085205078125, + "learning_rate": 5.7146e-06, + "loss": 0.8897, + "step": 21440 + }, + { + "epoch": 2.145, + "grad_norm": 23.989057540893555, + "learning_rate": 5.7126e-06, + "loss": 0.614, + "step": 21450 + }, + { + "epoch": 2.146, + "grad_norm": 9.404507637023926, + "learning_rate": 5.710600000000001e-06, + "loss": 0.6387, + "step": 21460 + }, + { + "epoch": 2.147, + "grad_norm": 21.427249908447266, + "learning_rate": 5.708600000000001e-06, + "loss": 0.695, + "step": 21470 + }, + { + "epoch": 2.148, + "grad_norm": 28.601951599121094, + "learning_rate": 5.7066e-06, + "loss": 0.6689, + "step": 21480 + }, + { + "epoch": 2.149, + "grad_norm": 37.28999328613281, + "learning_rate": 5.7046e-06, + "loss": 0.6496, + "step": 21490 + }, + { + "epoch": 2.15, + "grad_norm": 12.694632530212402, + "learning_rate": 5.7026e-06, + "loss": 0.5615, + "step": 21500 + }, + { + "epoch": 2.151, + "grad_norm": 19.564849853515625, + "learning_rate": 5.700600000000001e-06, + "loss": 0.5441, + "step": 21510 + }, + { + "epoch": 2.152, + "grad_norm": 31.632789611816406, + "learning_rate": 5.698600000000001e-06, + "loss": 0.5756, + "step": 21520 + }, + { + "epoch": 2.153, + "grad_norm": 14.230611801147461, + "learning_rate": 5.696600000000001e-06, + "loss": 0.5026, + "step": 21530 + }, + { + "epoch": 2.154, + "grad_norm": 44.073448181152344, + "learning_rate": 5.6946000000000005e-06, + "loss": 0.6345, + "step": 21540 + }, + { + "epoch": 2.155, + "grad_norm": 58.922603607177734, + "learning_rate": 5.6926e-06, + "loss": 0.6142, + "step": 21550 + }, + { + "epoch": 2.156, + "grad_norm": 14.919763565063477, + "learning_rate": 5.6906e-06, + "loss": 0.4576, + "step": 21560 + }, + { + "epoch": 2.157, + "grad_norm": 25.46698760986328, + "learning_rate": 5.688600000000001e-06, + "loss": 0.8048, + "step": 21570 + }, + { + "epoch": 2.158, + "grad_norm": 54.56586837768555, + "learning_rate": 5.686600000000001e-06, + "loss": 0.5692, + "step": 21580 + }, + { + "epoch": 2.159, + "grad_norm": 35.88685607910156, + "learning_rate": 5.6846e-06, + "loss": 0.4866, + "step": 21590 + }, + { + "epoch": 2.16, + "grad_norm": 23.260847091674805, + "learning_rate": 5.6826000000000004e-06, + "loss": 0.488, + "step": 21600 + }, + { + "epoch": 2.161, + "grad_norm": 51.36183547973633, + "learning_rate": 5.6806e-06, + "loss": 0.7119, + "step": 21610 + }, + { + "epoch": 2.162, + "grad_norm": 26.24199867248535, + "learning_rate": 5.678600000000001e-06, + "loss": 0.784, + "step": 21620 + }, + { + "epoch": 2.163, + "grad_norm": 28.365392684936523, + "learning_rate": 5.676600000000001e-06, + "loss": 0.7343, + "step": 21630 + }, + { + "epoch": 2.164, + "grad_norm": 50.15872573852539, + "learning_rate": 5.6746e-06, + "loss": 0.5722, + "step": 21640 + }, + { + "epoch": 2.165, + "grad_norm": 45.75239562988281, + "learning_rate": 5.6726000000000005e-06, + "loss": 0.5819, + "step": 21650 + }, + { + "epoch": 2.166, + "grad_norm": 26.17607307434082, + "learning_rate": 5.6706e-06, + "loss": 0.4819, + "step": 21660 + }, + { + "epoch": 2.167, + "grad_norm": 6.468771457672119, + "learning_rate": 5.668600000000001e-06, + "loss": 0.6419, + "step": 21670 + }, + { + "epoch": 2.168, + "grad_norm": 0.5758807063102722, + "learning_rate": 5.666600000000001e-06, + "loss": 0.6369, + "step": 21680 + }, + { + "epoch": 2.169, + "grad_norm": 41.119407653808594, + "learning_rate": 5.6646e-06, + "loss": 0.5396, + "step": 21690 + }, + { + "epoch": 2.17, + "grad_norm": 20.98927116394043, + "learning_rate": 5.662600000000001e-06, + "loss": 0.5572, + "step": 21700 + }, + { + "epoch": 2.171, + "grad_norm": 2.9193451404571533, + "learning_rate": 5.6606000000000005e-06, + "loss": 0.6213, + "step": 21710 + }, + { + "epoch": 2.172, + "grad_norm": 41.307823181152344, + "learning_rate": 5.6586e-06, + "loss": 0.7824, + "step": 21720 + }, + { + "epoch": 2.173, + "grad_norm": 49.53284454345703, + "learning_rate": 5.656600000000001e-06, + "loss": 0.7281, + "step": 21730 + }, + { + "epoch": 2.174, + "grad_norm": 80.82706451416016, + "learning_rate": 5.6546e-06, + "loss": 0.525, + "step": 21740 + }, + { + "epoch": 2.175, + "grad_norm": 10.112666130065918, + "learning_rate": 5.6526e-06, + "loss": 0.8426, + "step": 21750 + }, + { + "epoch": 2.176, + "grad_norm": 30.3128719329834, + "learning_rate": 5.650600000000001e-06, + "loss": 0.5265, + "step": 21760 + }, + { + "epoch": 2.177, + "grad_norm": 11.140154838562012, + "learning_rate": 5.6486000000000004e-06, + "loss": 0.7528, + "step": 21770 + }, + { + "epoch": 2.178, + "grad_norm": 32.421485900878906, + "learning_rate": 5.646600000000001e-06, + "loss": 0.5861, + "step": 21780 + }, + { + "epoch": 2.179, + "grad_norm": 21.485401153564453, + "learning_rate": 5.6446e-06, + "loss": 0.4652, + "step": 21790 + }, + { + "epoch": 2.18, + "grad_norm": 18.479448318481445, + "learning_rate": 5.6426e-06, + "loss": 0.8183, + "step": 21800 + }, + { + "epoch": 2.181, + "grad_norm": 26.815753936767578, + "learning_rate": 5.640600000000001e-06, + "loss": 0.5524, + "step": 21810 + }, + { + "epoch": 2.182, + "grad_norm": 6.771590232849121, + "learning_rate": 5.6386000000000005e-06, + "loss": 0.6103, + "step": 21820 + }, + { + "epoch": 2.183, + "grad_norm": 7.171220302581787, + "learning_rate": 5.636600000000001e-06, + "loss": 0.5156, + "step": 21830 + }, + { + "epoch": 2.184, + "grad_norm": 49.12594985961914, + "learning_rate": 5.6346e-06, + "loss": 0.7338, + "step": 21840 + }, + { + "epoch": 2.185, + "grad_norm": 26.85982322692871, + "learning_rate": 5.6326e-06, + "loss": 0.9972, + "step": 21850 + }, + { + "epoch": 2.186, + "grad_norm": 22.835695266723633, + "learning_rate": 5.630600000000001e-06, + "loss": 0.7974, + "step": 21860 + }, + { + "epoch": 2.187, + "grad_norm": 8.738889694213867, + "learning_rate": 5.628600000000001e-06, + "loss": 0.6405, + "step": 21870 + }, + { + "epoch": 2.188, + "grad_norm": 38.73388671875, + "learning_rate": 5.626600000000001e-06, + "loss": 0.9766, + "step": 21880 + }, + { + "epoch": 2.189, + "grad_norm": 42.84649658203125, + "learning_rate": 5.6246e-06, + "loss": 0.7369, + "step": 21890 + }, + { + "epoch": 2.19, + "grad_norm": 22.246782302856445, + "learning_rate": 5.6226e-06, + "loss": 0.6748, + "step": 21900 + }, + { + "epoch": 2.191, + "grad_norm": 41.804264068603516, + "learning_rate": 5.620600000000001e-06, + "loss": 0.5704, + "step": 21910 + }, + { + "epoch": 2.192, + "grad_norm": 55.34584045410156, + "learning_rate": 5.618600000000001e-06, + "loss": 0.7983, + "step": 21920 + }, + { + "epoch": 2.193, + "grad_norm": 24.581790924072266, + "learning_rate": 5.6166000000000006e-06, + "loss": 0.6806, + "step": 21930 + }, + { + "epoch": 2.194, + "grad_norm": 55.127342224121094, + "learning_rate": 5.6146e-06, + "loss": 0.6151, + "step": 21940 + }, + { + "epoch": 2.195, + "grad_norm": 29.649892807006836, + "learning_rate": 5.6126e-06, + "loss": 0.5617, + "step": 21950 + }, + { + "epoch": 2.196, + "grad_norm": 27.13589096069336, + "learning_rate": 5.6108000000000006e-06, + "loss": 0.8283, + "step": 21960 + }, + { + "epoch": 2.197, + "grad_norm": 12.190668106079102, + "learning_rate": 5.608800000000001e-06, + "loss": 0.6444, + "step": 21970 + }, + { + "epoch": 2.198, + "grad_norm": 30.363269805908203, + "learning_rate": 5.6068e-06, + "loss": 0.4189, + "step": 21980 + }, + { + "epoch": 2.199, + "grad_norm": 40.20530700683594, + "learning_rate": 5.6048e-06, + "loss": 0.5882, + "step": 21990 + }, + { + "epoch": 2.2, + "grad_norm": 13.80125617980957, + "learning_rate": 5.602800000000001e-06, + "loss": 0.3865, + "step": 22000 + }, + { + "epoch": 2.201, + "grad_norm": 37.33003616333008, + "learning_rate": 5.600800000000001e-06, + "loss": 0.6091, + "step": 22010 + }, + { + "epoch": 2.202, + "grad_norm": 98.78890228271484, + "learning_rate": 5.5988000000000005e-06, + "loss": 0.62, + "step": 22020 + }, + { + "epoch": 2.203, + "grad_norm": 41.13166427612305, + "learning_rate": 5.5968e-06, + "loss": 0.5836, + "step": 22030 + }, + { + "epoch": 2.204, + "grad_norm": 42.02532958984375, + "learning_rate": 5.5948e-06, + "loss": 0.5915, + "step": 22040 + }, + { + "epoch": 2.205, + "grad_norm": 42.591224670410156, + "learning_rate": 5.5928e-06, + "loss": 0.4783, + "step": 22050 + }, + { + "epoch": 2.206, + "grad_norm": 30.582590103149414, + "learning_rate": 5.590800000000001e-06, + "loss": 0.7235, + "step": 22060 + }, + { + "epoch": 2.207, + "grad_norm": 32.897727966308594, + "learning_rate": 5.588800000000001e-06, + "loss": 0.7734, + "step": 22070 + }, + { + "epoch": 2.208, + "grad_norm": 18.399417877197266, + "learning_rate": 5.5868e-06, + "loss": 0.7114, + "step": 22080 + }, + { + "epoch": 2.209, + "grad_norm": 8.900433540344238, + "learning_rate": 5.5848e-06, + "loss": 0.5118, + "step": 22090 + }, + { + "epoch": 2.21, + "grad_norm": 42.656253814697266, + "learning_rate": 5.5828e-06, + "loss": 0.6707, + "step": 22100 + }, + { + "epoch": 2.211, + "grad_norm": 15.351293563842773, + "learning_rate": 5.580800000000001e-06, + "loss": 0.6547, + "step": 22110 + }, + { + "epoch": 2.212, + "grad_norm": 35.78485107421875, + "learning_rate": 5.578800000000001e-06, + "loss": 0.8694, + "step": 22120 + }, + { + "epoch": 2.213, + "grad_norm": 90.62995147705078, + "learning_rate": 5.5768e-06, + "loss": 0.8345, + "step": 22130 + }, + { + "epoch": 2.214, + "grad_norm": 24.1240177154541, + "learning_rate": 5.5748e-06, + "loss": 0.6986, + "step": 22140 + }, + { + "epoch": 2.215, + "grad_norm": 37.8781623840332, + "learning_rate": 5.5728e-06, + "loss": 0.8083, + "step": 22150 + }, + { + "epoch": 2.216, + "grad_norm": 4.367500305175781, + "learning_rate": 5.570800000000001e-06, + "loss": 0.2719, + "step": 22160 + }, + { + "epoch": 2.217, + "grad_norm": 40.212860107421875, + "learning_rate": 5.568800000000001e-06, + "loss": 0.5333, + "step": 22170 + }, + { + "epoch": 2.218, + "grad_norm": 41.37190628051758, + "learning_rate": 5.5668e-06, + "loss": 0.483, + "step": 22180 + }, + { + "epoch": 2.219, + "grad_norm": 55.49161148071289, + "learning_rate": 5.5648000000000005e-06, + "loss": 0.7027, + "step": 22190 + }, + { + "epoch": 2.22, + "grad_norm": 47.93917465209961, + "learning_rate": 5.5628e-06, + "loss": 0.6786, + "step": 22200 + }, + { + "epoch": 2.221, + "grad_norm": 45.09898376464844, + "learning_rate": 5.560800000000001e-06, + "loss": 0.8251, + "step": 22210 + }, + { + "epoch": 2.222, + "grad_norm": 43.21870422363281, + "learning_rate": 5.558800000000001e-06, + "loss": 0.9235, + "step": 22220 + }, + { + "epoch": 2.223, + "grad_norm": 42.25673294067383, + "learning_rate": 5.5568e-06, + "loss": 0.85, + "step": 22230 + }, + { + "epoch": 2.224, + "grad_norm": 22.41086196899414, + "learning_rate": 5.554800000000001e-06, + "loss": 0.5968, + "step": 22240 + }, + { + "epoch": 2.225, + "grad_norm": 21.879209518432617, + "learning_rate": 5.5528000000000004e-06, + "loss": 0.6044, + "step": 22250 + }, + { + "epoch": 2.226, + "grad_norm": 33.574073791503906, + "learning_rate": 5.5508e-06, + "loss": 0.52, + "step": 22260 + }, + { + "epoch": 2.227, + "grad_norm": 0.5494199395179749, + "learning_rate": 5.548800000000001e-06, + "loss": 0.7395, + "step": 22270 + }, + { + "epoch": 2.228, + "grad_norm": 26.909727096557617, + "learning_rate": 5.5468e-06, + "loss": 0.4841, + "step": 22280 + }, + { + "epoch": 2.229, + "grad_norm": 35.930931091308594, + "learning_rate": 5.5448e-06, + "loss": 0.8255, + "step": 22290 + }, + { + "epoch": 2.23, + "grad_norm": 30.434680938720703, + "learning_rate": 5.5428000000000005e-06, + "loss": 0.7121, + "step": 22300 + }, + { + "epoch": 2.231, + "grad_norm": 24.429750442504883, + "learning_rate": 5.5408e-06, + "loss": 0.8339, + "step": 22310 + }, + { + "epoch": 2.232, + "grad_norm": 31.06974983215332, + "learning_rate": 5.538800000000001e-06, + "loss": 0.8602, + "step": 22320 + }, + { + "epoch": 2.233, + "grad_norm": 54.97211837768555, + "learning_rate": 5.5368e-06, + "loss": 0.694, + "step": 22330 + }, + { + "epoch": 2.234, + "grad_norm": 25.800743103027344, + "learning_rate": 5.5348e-06, + "loss": 0.5709, + "step": 22340 + }, + { + "epoch": 2.235, + "grad_norm": 15.872865676879883, + "learning_rate": 5.532800000000001e-06, + "loss": 0.5848, + "step": 22350 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 8.24520206451416, + "learning_rate": 5.5308000000000005e-06, + "loss": 0.5325, + "step": 22360 + }, + { + "epoch": 2.237, + "grad_norm": 6.170074462890625, + "learning_rate": 5.528800000000001e-06, + "loss": 0.6424, + "step": 22370 + }, + { + "epoch": 2.238, + "grad_norm": 21.991802215576172, + "learning_rate": 5.5268e-06, + "loss": 0.5698, + "step": 22380 + }, + { + "epoch": 2.239, + "grad_norm": 19.199386596679688, + "learning_rate": 5.5248e-06, + "loss": 0.4919, + "step": 22390 + }, + { + "epoch": 2.24, + "grad_norm": 49.97930908203125, + "learning_rate": 5.522800000000001e-06, + "loss": 0.7104, + "step": 22400 + }, + { + "epoch": 2.241, + "grad_norm": 36.936805725097656, + "learning_rate": 5.5208000000000006e-06, + "loss": 0.986, + "step": 22410 + }, + { + "epoch": 2.242, + "grad_norm": 26.627878189086914, + "learning_rate": 5.5188e-06, + "loss": 0.8035, + "step": 22420 + }, + { + "epoch": 2.243, + "grad_norm": 11.643138885498047, + "learning_rate": 5.5168e-06, + "loss": 0.5016, + "step": 22430 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 37.188358306884766, + "learning_rate": 5.5148e-06, + "loss": 0.8121, + "step": 22440 + }, + { + "epoch": 2.245, + "grad_norm": 10.066282272338867, + "learning_rate": 5.5128e-06, + "loss": 0.6035, + "step": 22450 + }, + { + "epoch": 2.246, + "grad_norm": 13.461889266967773, + "learning_rate": 5.510800000000001e-06, + "loss": 0.5461, + "step": 22460 + }, + { + "epoch": 2.247, + "grad_norm": 49.27286911010742, + "learning_rate": 5.5088000000000005e-06, + "loss": 0.7651, + "step": 22470 + }, + { + "epoch": 2.248, + "grad_norm": 30.79641342163086, + "learning_rate": 5.5067999999999995e-06, + "loss": 0.5639, + "step": 22480 + }, + { + "epoch": 2.249, + "grad_norm": 22.9559268951416, + "learning_rate": 5.5048e-06, + "loss": 0.7904, + "step": 22490 + }, + { + "epoch": 2.25, + "grad_norm": 4.752798557281494, + "learning_rate": 5.5028e-06, + "loss": 0.7704, + "step": 22500 + }, + { + "epoch": 2.251, + "grad_norm": 9.406001091003418, + "learning_rate": 5.500800000000001e-06, + "loss": 0.867, + "step": 22510 + }, + { + "epoch": 2.252, + "grad_norm": 13.422652244567871, + "learning_rate": 5.498800000000001e-06, + "loss": 0.4994, + "step": 22520 + }, + { + "epoch": 2.253, + "grad_norm": 24.475555419921875, + "learning_rate": 5.4968e-06, + "loss": 0.74, + "step": 22530 + }, + { + "epoch": 2.254, + "grad_norm": 3.736154317855835, + "learning_rate": 5.4948e-06, + "loss": 0.5785, + "step": 22540 + }, + { + "epoch": 2.255, + "grad_norm": 1.4272233247756958, + "learning_rate": 5.4928e-06, + "loss": 1.0276, + "step": 22550 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 25.722667694091797, + "learning_rate": 5.490800000000001e-06, + "loss": 1.0477, + "step": 22560 + }, + { + "epoch": 2.257, + "grad_norm": 25.320680618286133, + "learning_rate": 5.488800000000001e-06, + "loss": 0.5366, + "step": 22570 + }, + { + "epoch": 2.258, + "grad_norm": 13.106619834899902, + "learning_rate": 5.4868e-06, + "loss": 0.4721, + "step": 22580 + }, + { + "epoch": 2.259, + "grad_norm": 1.7035945653915405, + "learning_rate": 5.4848e-06, + "loss": 0.6387, + "step": 22590 + }, + { + "epoch": 2.26, + "grad_norm": 18.009597778320312, + "learning_rate": 5.4828e-06, + "loss": 0.6352, + "step": 22600 + }, + { + "epoch": 2.261, + "grad_norm": 12.074342727661133, + "learning_rate": 5.480800000000001e-06, + "loss": 0.7541, + "step": 22610 + }, + { + "epoch": 2.262, + "grad_norm": 38.93974304199219, + "learning_rate": 5.478800000000001e-06, + "loss": 0.8047, + "step": 22620 + }, + { + "epoch": 2.263, + "grad_norm": 15.713391304016113, + "learning_rate": 5.4768e-06, + "loss": 0.5585, + "step": 22630 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 20.313886642456055, + "learning_rate": 5.4748000000000005e-06, + "loss": 0.3573, + "step": 22640 + }, + { + "epoch": 2.265, + "grad_norm": 9.712434768676758, + "learning_rate": 5.4728e-06, + "loss": 0.5595, + "step": 22650 + }, + { + "epoch": 2.266, + "grad_norm": 32.309200286865234, + "learning_rate": 5.4708e-06, + "loss": 0.407, + "step": 22660 + }, + { + "epoch": 2.267, + "grad_norm": 46.730037689208984, + "learning_rate": 5.468800000000001e-06, + "loss": 0.8093, + "step": 22670 + }, + { + "epoch": 2.268, + "grad_norm": 11.148622512817383, + "learning_rate": 5.466800000000001e-06, + "loss": 0.6787, + "step": 22680 + }, + { + "epoch": 2.269, + "grad_norm": 47.33662033081055, + "learning_rate": 5.4648e-06, + "loss": 0.6553, + "step": 22690 + }, + { + "epoch": 2.27, + "grad_norm": 5.559915542602539, + "learning_rate": 5.4628000000000005e-06, + "loss": 0.7154, + "step": 22700 + }, + { + "epoch": 2.271, + "grad_norm": 30.307464599609375, + "learning_rate": 5.4608e-06, + "loss": 0.5123, + "step": 22710 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 20.74936294555664, + "learning_rate": 5.458800000000001e-06, + "loss": 0.6701, + "step": 22720 + }, + { + "epoch": 2.273, + "grad_norm": 8.080650329589844, + "learning_rate": 5.456800000000001e-06, + "loss": 0.6648, + "step": 22730 + }, + { + "epoch": 2.274, + "grad_norm": 13.168947219848633, + "learning_rate": 5.4548e-06, + "loss": 0.4532, + "step": 22740 + }, + { + "epoch": 2.275, + "grad_norm": 42.89329147338867, + "learning_rate": 5.4528000000000005e-06, + "loss": 0.5806, + "step": 22750 + }, + { + "epoch": 2.276, + "grad_norm": 24.9879093170166, + "learning_rate": 5.4508e-06, + "loss": 0.6691, + "step": 22760 + }, + { + "epoch": 2.277, + "grad_norm": 28.52879524230957, + "learning_rate": 5.448800000000001e-06, + "loss": 0.4086, + "step": 22770 + }, + { + "epoch": 2.278, + "grad_norm": 20.092578887939453, + "learning_rate": 5.446800000000001e-06, + "loss": 0.7222, + "step": 22780 + }, + { + "epoch": 2.279, + "grad_norm": 34.03709411621094, + "learning_rate": 5.4448e-06, + "loss": 0.8185, + "step": 22790 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 25.46604347229004, + "learning_rate": 5.442800000000001e-06, + "loss": 0.643, + "step": 22800 + }, + { + "epoch": 2.281, + "grad_norm": 25.507854461669922, + "learning_rate": 5.4408000000000005e-06, + "loss": 0.5392, + "step": 22810 + }, + { + "epoch": 2.282, + "grad_norm": 20.695667266845703, + "learning_rate": 5.4388e-06, + "loss": 0.8377, + "step": 22820 + }, + { + "epoch": 2.283, + "grad_norm": 34.970245361328125, + "learning_rate": 5.436800000000001e-06, + "loss": 0.6591, + "step": 22830 + }, + { + "epoch": 2.284, + "grad_norm": 59.59330749511719, + "learning_rate": 5.4348e-06, + "loss": 0.9524, + "step": 22840 + }, + { + "epoch": 2.285, + "grad_norm": 46.58605194091797, + "learning_rate": 5.4328e-06, + "loss": 0.6674, + "step": 22850 + }, + { + "epoch": 2.286, + "grad_norm": 13.24176025390625, + "learning_rate": 5.430800000000001e-06, + "loss": 0.6838, + "step": 22860 + }, + { + "epoch": 2.287, + "grad_norm": 48.0534782409668, + "learning_rate": 5.4288000000000004e-06, + "loss": 0.6875, + "step": 22870 + }, + { + "epoch": 2.288, + "grad_norm": 47.62519073486328, + "learning_rate": 5.426800000000001e-06, + "loss": 0.7306, + "step": 22880 + }, + { + "epoch": 2.289, + "grad_norm": 27.308670043945312, + "learning_rate": 5.4248e-06, + "loss": 0.5339, + "step": 22890 + }, + { + "epoch": 2.29, + "grad_norm": 26.38911247253418, + "learning_rate": 5.4228e-06, + "loss": 0.7351, + "step": 22900 + }, + { + "epoch": 2.291, + "grad_norm": 41.1791877746582, + "learning_rate": 5.420800000000001e-06, + "loss": 0.5436, + "step": 22910 + }, + { + "epoch": 2.292, + "grad_norm": 26.171401977539062, + "learning_rate": 5.4188000000000005e-06, + "loss": 0.4981, + "step": 22920 + }, + { + "epoch": 2.293, + "grad_norm": 19.563926696777344, + "learning_rate": 5.416800000000001e-06, + "loss": 0.748, + "step": 22930 + }, + { + "epoch": 2.294, + "grad_norm": 47.672515869140625, + "learning_rate": 5.4148e-06, + "loss": 0.7037, + "step": 22940 + }, + { + "epoch": 2.295, + "grad_norm": 62.735050201416016, + "learning_rate": 5.4128e-06, + "loss": 0.6723, + "step": 22950 + }, + { + "epoch": 2.296, + "grad_norm": 34.751407623291016, + "learning_rate": 5.410800000000001e-06, + "loss": 0.5159, + "step": 22960 + }, + { + "epoch": 2.297, + "grad_norm": 7.955129623413086, + "learning_rate": 5.408800000000001e-06, + "loss": 0.5092, + "step": 22970 + }, + { + "epoch": 2.298, + "grad_norm": 26.632680892944336, + "learning_rate": 5.406800000000001e-06, + "loss": 0.5358, + "step": 22980 + }, + { + "epoch": 2.299, + "grad_norm": 8.946920394897461, + "learning_rate": 5.4048e-06, + "loss": 0.6629, + "step": 22990 + }, + { + "epoch": 2.3, + "grad_norm": 24.032062530517578, + "learning_rate": 5.4028e-06, + "loss": 0.7799, + "step": 23000 + }, + { + "epoch": 2.301, + "grad_norm": 31.619062423706055, + "learning_rate": 5.400800000000001e-06, + "loss": 0.693, + "step": 23010 + }, + { + "epoch": 2.302, + "grad_norm": 57.403263092041016, + "learning_rate": 5.398800000000001e-06, + "loss": 0.5805, + "step": 23020 + }, + { + "epoch": 2.303, + "grad_norm": 40.945125579833984, + "learning_rate": 5.3968000000000006e-06, + "loss": 0.6838, + "step": 23030 + }, + { + "epoch": 2.304, + "grad_norm": 26.93997573852539, + "learning_rate": 5.3948000000000004e-06, + "loss": 0.4315, + "step": 23040 + }, + { + "epoch": 2.305, + "grad_norm": 24.222333908081055, + "learning_rate": 5.3928e-06, + "loss": 0.5856, + "step": 23050 + }, + { + "epoch": 2.306, + "grad_norm": 29.424251556396484, + "learning_rate": 5.3908e-06, + "loss": 0.4816, + "step": 23060 + }, + { + "epoch": 2.307, + "grad_norm": 51.99330139160156, + "learning_rate": 5.388800000000001e-06, + "loss": 0.7163, + "step": 23070 + }, + { + "epoch": 2.308, + "grad_norm": 12.303630828857422, + "learning_rate": 5.386800000000001e-06, + "loss": 0.3429, + "step": 23080 + }, + { + "epoch": 2.309, + "grad_norm": 57.72380828857422, + "learning_rate": 5.3848e-06, + "loss": 0.4764, + "step": 23090 + }, + { + "epoch": 2.31, + "grad_norm": 29.551307678222656, + "learning_rate": 5.3828e-06, + "loss": 0.8045, + "step": 23100 + }, + { + "epoch": 2.311, + "grad_norm": 39.42962646484375, + "learning_rate": 5.3808e-06, + "loss": 0.682, + "step": 23110 + }, + { + "epoch": 2.312, + "grad_norm": 8.102534294128418, + "learning_rate": 5.378800000000001e-06, + "loss": 0.6236, + "step": 23120 + }, + { + "epoch": 2.313, + "grad_norm": 34.7684211730957, + "learning_rate": 5.376800000000001e-06, + "loss": 0.4118, + "step": 23130 + }, + { + "epoch": 2.314, + "grad_norm": 18.475805282592773, + "learning_rate": 5.3748e-06, + "loss": 0.5975, + "step": 23140 + }, + { + "epoch": 2.315, + "grad_norm": 34.82725524902344, + "learning_rate": 5.3728000000000005e-06, + "loss": 0.6382, + "step": 23150 + }, + { + "epoch": 2.316, + "grad_norm": 33.135921478271484, + "learning_rate": 5.3708e-06, + "loss": 0.6132, + "step": 23160 + }, + { + "epoch": 2.317, + "grad_norm": 31.031635284423828, + "learning_rate": 5.368800000000001e-06, + "loss": 0.3783, + "step": 23170 + }, + { + "epoch": 2.318, + "grad_norm": 44.96309280395508, + "learning_rate": 5.366800000000001e-06, + "loss": 0.8038, + "step": 23180 + }, + { + "epoch": 2.319, + "grad_norm": 50.742454528808594, + "learning_rate": 5.3648e-06, + "loss": 0.6723, + "step": 23190 + }, + { + "epoch": 2.32, + "grad_norm": 19.635595321655273, + "learning_rate": 5.3628000000000006e-06, + "loss": 0.4662, + "step": 23200 + }, + { + "epoch": 2.321, + "grad_norm": 36.878971099853516, + "learning_rate": 5.3608e-06, + "loss": 0.6226, + "step": 23210 + }, + { + "epoch": 2.322, + "grad_norm": 6.729045867919922, + "learning_rate": 5.3588e-06, + "loss": 0.4275, + "step": 23220 + }, + { + "epoch": 2.323, + "grad_norm": 52.66419982910156, + "learning_rate": 5.356800000000001e-06, + "loss": 0.5837, + "step": 23230 + }, + { + "epoch": 2.324, + "grad_norm": 44.77753448486328, + "learning_rate": 5.3548e-06, + "loss": 0.7493, + "step": 23240 + }, + { + "epoch": 2.325, + "grad_norm": 14.312146186828613, + "learning_rate": 5.3528e-06, + "loss": 0.5975, + "step": 23250 + }, + { + "epoch": 2.326, + "grad_norm": 47.50712203979492, + "learning_rate": 5.3508000000000005e-06, + "loss": 0.6969, + "step": 23260 + }, + { + "epoch": 2.327, + "grad_norm": 55.93048095703125, + "learning_rate": 5.3488e-06, + "loss": 0.9151, + "step": 23270 + }, + { + "epoch": 2.328, + "grad_norm": 33.65269470214844, + "learning_rate": 5.346800000000001e-06, + "loss": 0.611, + "step": 23280 + }, + { + "epoch": 2.329, + "grad_norm": 17.26365852355957, + "learning_rate": 5.3448e-06, + "loss": 0.675, + "step": 23290 + }, + { + "epoch": 2.33, + "grad_norm": 25.64186668395996, + "learning_rate": 5.3428e-06, + "loss": 0.6604, + "step": 23300 + }, + { + "epoch": 2.331, + "grad_norm": 48.147682189941406, + "learning_rate": 5.340800000000001e-06, + "loss": 0.8392, + "step": 23310 + }, + { + "epoch": 2.332, + "grad_norm": 25.545421600341797, + "learning_rate": 5.3388000000000005e-06, + "loss": 0.7228, + "step": 23320 + }, + { + "epoch": 2.333, + "grad_norm": 28.70657730102539, + "learning_rate": 5.336800000000001e-06, + "loss": 0.6491, + "step": 23330 + }, + { + "epoch": 2.334, + "grad_norm": 34.852413177490234, + "learning_rate": 5.3348e-06, + "loss": 0.5739, + "step": 23340 + }, + { + "epoch": 2.335, + "grad_norm": 9.7340087890625, + "learning_rate": 5.3328e-06, + "loss": 0.8116, + "step": 23350 + }, + { + "epoch": 2.336, + "grad_norm": 26.43950843811035, + "learning_rate": 5.330800000000001e-06, + "loss": 0.5883, + "step": 23360 + }, + { + "epoch": 2.337, + "grad_norm": 23.425874710083008, + "learning_rate": 5.3288000000000005e-06, + "loss": 0.7699, + "step": 23370 + }, + { + "epoch": 2.338, + "grad_norm": 44.046539306640625, + "learning_rate": 5.326800000000001e-06, + "loss": 0.926, + "step": 23380 + }, + { + "epoch": 2.339, + "grad_norm": 39.17654800415039, + "learning_rate": 5.3248e-06, + "loss": 0.5595, + "step": 23390 + }, + { + "epoch": 2.34, + "grad_norm": 36.046775817871094, + "learning_rate": 5.3228e-06, + "loss": 0.57, + "step": 23400 + }, + { + "epoch": 2.341, + "grad_norm": 17.362285614013672, + "learning_rate": 5.320800000000001e-06, + "loss": 0.6189, + "step": 23410 + }, + { + "epoch": 2.342, + "grad_norm": 17.513729095458984, + "learning_rate": 5.318800000000001e-06, + "loss": 0.54, + "step": 23420 + }, + { + "epoch": 2.343, + "grad_norm": 45.768333435058594, + "learning_rate": 5.3168000000000005e-06, + "loss": 0.752, + "step": 23430 + }, + { + "epoch": 2.344, + "grad_norm": 36.07986831665039, + "learning_rate": 5.3148e-06, + "loss": 0.649, + "step": 23440 + }, + { + "epoch": 2.3449999999999998, + "grad_norm": 50.08828353881836, + "learning_rate": 5.3128e-06, + "loss": 0.6256, + "step": 23450 + }, + { + "epoch": 2.346, + "grad_norm": 21.56818389892578, + "learning_rate": 5.3108e-06, + "loss": 0.5439, + "step": 23460 + }, + { + "epoch": 2.347, + "grad_norm": 12.028852462768555, + "learning_rate": 5.308800000000001e-06, + "loss": 0.2928, + "step": 23470 + }, + { + "epoch": 2.348, + "grad_norm": 25.907634735107422, + "learning_rate": 5.306800000000001e-06, + "loss": 0.4197, + "step": 23480 + }, + { + "epoch": 2.349, + "grad_norm": 25.22545623779297, + "learning_rate": 5.3048e-06, + "loss": 0.4871, + "step": 23490 + }, + { + "epoch": 2.35, + "grad_norm": 27.278261184692383, + "learning_rate": 5.3028e-06, + "loss": 0.5702, + "step": 23500 + }, + { + "epoch": 2.351, + "grad_norm": 6.1130194664001465, + "learning_rate": 5.3008e-06, + "loss": 0.5626, + "step": 23510 + }, + { + "epoch": 2.352, + "grad_norm": 28.173141479492188, + "learning_rate": 5.298800000000001e-06, + "loss": 0.6642, + "step": 23520 + }, + { + "epoch": 2.3529999999999998, + "grad_norm": 45.549293518066406, + "learning_rate": 5.296800000000001e-06, + "loss": 0.7794, + "step": 23530 + }, + { + "epoch": 2.354, + "grad_norm": 59.046600341796875, + "learning_rate": 5.2948e-06, + "loss": 0.8257, + "step": 23540 + }, + { + "epoch": 2.355, + "grad_norm": 30.231481552124023, + "learning_rate": 5.2928e-06, + "loss": 0.5337, + "step": 23550 + }, + { + "epoch": 2.356, + "grad_norm": 28.655134201049805, + "learning_rate": 5.2908e-06, + "loss": 0.6839, + "step": 23560 + }, + { + "epoch": 2.357, + "grad_norm": 57.767459869384766, + "learning_rate": 5.288800000000001e-06, + "loss": 0.9569, + "step": 23570 + }, + { + "epoch": 2.358, + "grad_norm": 55.075984954833984, + "learning_rate": 5.286800000000001e-06, + "loss": 0.6479, + "step": 23580 + }, + { + "epoch": 2.359, + "grad_norm": 14.50174617767334, + "learning_rate": 5.2848e-06, + "loss": 0.7389, + "step": 23590 + }, + { + "epoch": 2.36, + "grad_norm": 36.16502380371094, + "learning_rate": 5.2828000000000005e-06, + "loss": 0.7146, + "step": 23600 + }, + { + "epoch": 2.3609999999999998, + "grad_norm": 20.909074783325195, + "learning_rate": 5.2808e-06, + "loss": 0.4444, + "step": 23610 + }, + { + "epoch": 2.362, + "grad_norm": 34.6703987121582, + "learning_rate": 5.2788e-06, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 2.363, + "grad_norm": 10.016300201416016, + "learning_rate": 5.276800000000001e-06, + "loss": 0.5363, + "step": 23630 + }, + { + "epoch": 2.364, + "grad_norm": 15.961995124816895, + "learning_rate": 5.2748e-06, + "loss": 0.434, + "step": 23640 + }, + { + "epoch": 2.365, + "grad_norm": 29.54912757873535, + "learning_rate": 5.2728e-06, + "loss": 0.5253, + "step": 23650 + }, + { + "epoch": 2.366, + "grad_norm": 46.137672424316406, + "learning_rate": 5.2708000000000004e-06, + "loss": 0.6896, + "step": 23660 + }, + { + "epoch": 2.367, + "grad_norm": 26.812349319458008, + "learning_rate": 5.2688e-06, + "loss": 0.5173, + "step": 23670 + }, + { + "epoch": 2.368, + "grad_norm": 25.67329216003418, + "learning_rate": 5.266800000000001e-06, + "loss": 0.3728, + "step": 23680 + }, + { + "epoch": 2.3689999999999998, + "grad_norm": 56.716339111328125, + "learning_rate": 5.2648e-06, + "loss": 0.6709, + "step": 23690 + }, + { + "epoch": 2.37, + "grad_norm": 81.84050750732422, + "learning_rate": 5.2628e-06, + "loss": 0.8044, + "step": 23700 + }, + { + "epoch": 2.371, + "grad_norm": 45.409515380859375, + "learning_rate": 5.2608000000000005e-06, + "loss": 0.491, + "step": 23710 + }, + { + "epoch": 2.372, + "grad_norm": 48.80219650268555, + "learning_rate": 5.2588e-06, + "loss": 0.4556, + "step": 23720 + }, + { + "epoch": 2.373, + "grad_norm": 23.4346981048584, + "learning_rate": 5.256800000000001e-06, + "loss": 0.8579, + "step": 23730 + }, + { + "epoch": 2.374, + "grad_norm": 33.4806022644043, + "learning_rate": 5.2548e-06, + "loss": 0.8229, + "step": 23740 + }, + { + "epoch": 2.375, + "grad_norm": 49.55470657348633, + "learning_rate": 5.2528e-06, + "loss": 0.6262, + "step": 23750 + }, + { + "epoch": 2.376, + "grad_norm": 27.586681365966797, + "learning_rate": 5.250800000000001e-06, + "loss": 0.6945, + "step": 23760 + }, + { + "epoch": 2.377, + "grad_norm": 56.0458869934082, + "learning_rate": 5.2488000000000005e-06, + "loss": 0.9446, + "step": 23770 + }, + { + "epoch": 2.378, + "grad_norm": 33.53122329711914, + "learning_rate": 5.246800000000001e-06, + "loss": 0.7489, + "step": 23780 + }, + { + "epoch": 2.379, + "grad_norm": 24.26375961303711, + "learning_rate": 5.2448e-06, + "loss": 0.5203, + "step": 23790 + }, + { + "epoch": 2.38, + "grad_norm": 16.765478134155273, + "learning_rate": 5.2428e-06, + "loss": 0.8502, + "step": 23800 + }, + { + "epoch": 2.3810000000000002, + "grad_norm": 52.03241729736328, + "learning_rate": 5.240800000000001e-06, + "loss": 0.4692, + "step": 23810 + }, + { + "epoch": 2.382, + "grad_norm": 26.394987106323242, + "learning_rate": 5.2388000000000006e-06, + "loss": 0.5537, + "step": 23820 + }, + { + "epoch": 2.383, + "grad_norm": 47.837158203125, + "learning_rate": 5.2368e-06, + "loss": 0.6629, + "step": 23830 + }, + { + "epoch": 2.384, + "grad_norm": 4.176435947418213, + "learning_rate": 5.2348e-06, + "loss": 0.56, + "step": 23840 + }, + { + "epoch": 2.385, + "grad_norm": 29.736774444580078, + "learning_rate": 5.2328e-06, + "loss": 0.6556, + "step": 23850 + }, + { + "epoch": 2.386, + "grad_norm": 28.325538635253906, + "learning_rate": 5.2308e-06, + "loss": 0.7128, + "step": 23860 + }, + { + "epoch": 2.387, + "grad_norm": 33.87845230102539, + "learning_rate": 5.228800000000001e-06, + "loss": 0.7559, + "step": 23870 + }, + { + "epoch": 2.388, + "grad_norm": 27.20892906188965, + "learning_rate": 5.2268000000000005e-06, + "loss": 0.5474, + "step": 23880 + }, + { + "epoch": 2.3890000000000002, + "grad_norm": 4.944998741149902, + "learning_rate": 5.224800000000001e-06, + "loss": 0.6466, + "step": 23890 + }, + { + "epoch": 2.39, + "grad_norm": 62.19276428222656, + "learning_rate": 5.2228e-06, + "loss": 0.6979, + "step": 23900 + }, + { + "epoch": 2.391, + "grad_norm": 47.98637390136719, + "learning_rate": 5.2208e-06, + "loss": 0.6794, + "step": 23910 + }, + { + "epoch": 2.392, + "grad_norm": 40.98355484008789, + "learning_rate": 5.218800000000001e-06, + "loss": 0.7471, + "step": 23920 + }, + { + "epoch": 2.393, + "grad_norm": 44.224998474121094, + "learning_rate": 5.216800000000001e-06, + "loss": 0.5519, + "step": 23930 + }, + { + "epoch": 2.394, + "grad_norm": 33.322776794433594, + "learning_rate": 5.214800000000001e-06, + "loss": 0.4908, + "step": 23940 + }, + { + "epoch": 2.395, + "grad_norm": 33.787879943847656, + "learning_rate": 5.2128e-06, + "loss": 0.7247, + "step": 23950 + }, + { + "epoch": 2.396, + "grad_norm": 28.912538528442383, + "learning_rate": 5.2108e-06, + "loss": 0.5081, + "step": 23960 + }, + { + "epoch": 2.3970000000000002, + "grad_norm": 28.065685272216797, + "learning_rate": 5.208800000000001e-06, + "loss": 0.8082, + "step": 23970 + }, + { + "epoch": 2.398, + "grad_norm": 2.713496446609497, + "learning_rate": 5.206800000000001e-06, + "loss": 0.5147, + "step": 23980 + }, + { + "epoch": 2.399, + "grad_norm": 18.171648025512695, + "learning_rate": 5.2048000000000005e-06, + "loss": 0.817, + "step": 23990 + }, + { + "epoch": 2.4, + "grad_norm": 23.39317512512207, + "learning_rate": 5.2028e-06, + "loss": 0.6705, + "step": 24000 + }, + { + "epoch": 2.401, + "grad_norm": 9.348867416381836, + "learning_rate": 5.2008e-06, + "loss": 0.5186, + "step": 24010 + }, + { + "epoch": 2.402, + "grad_norm": 52.66146469116211, + "learning_rate": 5.1988e-06, + "loss": 0.6322, + "step": 24020 + }, + { + "epoch": 2.403, + "grad_norm": 25.316165924072266, + "learning_rate": 5.196800000000001e-06, + "loss": 0.5371, + "step": 24030 + }, + { + "epoch": 2.404, + "grad_norm": 14.026839256286621, + "learning_rate": 5.194800000000001e-06, + "loss": 0.4817, + "step": 24040 + }, + { + "epoch": 2.4050000000000002, + "grad_norm": 37.170867919921875, + "learning_rate": 5.1928e-06, + "loss": 1.0024, + "step": 24050 + }, + { + "epoch": 2.406, + "grad_norm": 12.929289817810059, + "learning_rate": 5.1908e-06, + "loss": 0.5727, + "step": 24060 + }, + { + "epoch": 2.407, + "grad_norm": 16.34523582458496, + "learning_rate": 5.1888e-06, + "loss": 0.5111, + "step": 24070 + }, + { + "epoch": 2.408, + "grad_norm": 68.75968933105469, + "learning_rate": 5.186800000000001e-06, + "loss": 0.7928, + "step": 24080 + }, + { + "epoch": 2.409, + "grad_norm": 23.91437339782715, + "learning_rate": 5.184800000000001e-06, + "loss": 0.7094, + "step": 24090 + }, + { + "epoch": 2.41, + "grad_norm": 52.49360656738281, + "learning_rate": 5.1828e-06, + "loss": 0.5987, + "step": 24100 + }, + { + "epoch": 2.411, + "grad_norm": 35.196495056152344, + "learning_rate": 5.1808000000000004e-06, + "loss": 0.6937, + "step": 24110 + }, + { + "epoch": 2.412, + "grad_norm": 53.06798553466797, + "learning_rate": 5.1788e-06, + "loss": 0.4998, + "step": 24120 + }, + { + "epoch": 2.413, + "grad_norm": 33.701969146728516, + "learning_rate": 5.176800000000001e-06, + "loss": 0.814, + "step": 24130 + }, + { + "epoch": 2.414, + "grad_norm": 18.027755737304688, + "learning_rate": 5.174800000000001e-06, + "loss": 0.7363, + "step": 24140 + }, + { + "epoch": 2.415, + "grad_norm": 47.720741271972656, + "learning_rate": 5.1728e-06, + "loss": 0.6642, + "step": 24150 + }, + { + "epoch": 2.416, + "grad_norm": 62.25669860839844, + "learning_rate": 5.1708000000000005e-06, + "loss": 0.8371, + "step": 24160 + }, + { + "epoch": 2.417, + "grad_norm": 33.065185546875, + "learning_rate": 5.1688e-06, + "loss": 0.7936, + "step": 24170 + }, + { + "epoch": 2.418, + "grad_norm": 13.660017013549805, + "learning_rate": 5.166800000000001e-06, + "loss": 0.7092, + "step": 24180 + }, + { + "epoch": 2.419, + "grad_norm": 28.82988739013672, + "learning_rate": 5.164800000000001e-06, + "loss": 0.6911, + "step": 24190 + }, + { + "epoch": 2.42, + "grad_norm": 20.214262008666992, + "learning_rate": 5.1628e-06, + "loss": 0.7027, + "step": 24200 + }, + { + "epoch": 2.421, + "grad_norm": 60.720298767089844, + "learning_rate": 5.160800000000001e-06, + "loss": 0.7199, + "step": 24210 + }, + { + "epoch": 2.422, + "grad_norm": 31.063989639282227, + "learning_rate": 5.1588000000000005e-06, + "loss": 0.5332, + "step": 24220 + }, + { + "epoch": 2.423, + "grad_norm": 383.9568176269531, + "learning_rate": 5.1568e-06, + "loss": 0.6992, + "step": 24230 + }, + { + "epoch": 2.424, + "grad_norm": 46.2714729309082, + "learning_rate": 5.154800000000001e-06, + "loss": 0.6366, + "step": 24240 + }, + { + "epoch": 2.425, + "grad_norm": 35.338539123535156, + "learning_rate": 5.1528e-06, + "loss": 0.8147, + "step": 24250 + }, + { + "epoch": 2.426, + "grad_norm": 34.018646240234375, + "learning_rate": 5.1508e-06, + "loss": 0.7693, + "step": 24260 + }, + { + "epoch": 2.427, + "grad_norm": 46.39628982543945, + "learning_rate": 5.148800000000001e-06, + "loss": 0.6542, + "step": 24270 + }, + { + "epoch": 2.428, + "grad_norm": 8.005655288696289, + "learning_rate": 5.1468000000000004e-06, + "loss": 0.4196, + "step": 24280 + }, + { + "epoch": 2.429, + "grad_norm": 31.6107234954834, + "learning_rate": 5.144800000000001e-06, + "loss": 0.6921, + "step": 24290 + }, + { + "epoch": 2.43, + "grad_norm": 7.957141876220703, + "learning_rate": 5.1428e-06, + "loss": 0.5301, + "step": 24300 + }, + { + "epoch": 2.431, + "grad_norm": 39.83584213256836, + "learning_rate": 5.1408e-06, + "loss": 0.6387, + "step": 24310 + }, + { + "epoch": 2.432, + "grad_norm": 24.404521942138672, + "learning_rate": 5.138800000000001e-06, + "loss": 0.4989, + "step": 24320 + }, + { + "epoch": 2.433, + "grad_norm": 20.31212043762207, + "learning_rate": 5.1368000000000005e-06, + "loss": 0.5232, + "step": 24330 + }, + { + "epoch": 2.434, + "grad_norm": 44.953060150146484, + "learning_rate": 5.134800000000001e-06, + "loss": 0.6488, + "step": 24340 + }, + { + "epoch": 2.435, + "grad_norm": 42.72674560546875, + "learning_rate": 5.1328e-06, + "loss": 0.6257, + "step": 24350 + }, + { + "epoch": 2.436, + "grad_norm": 42.03239440917969, + "learning_rate": 5.1308e-06, + "loss": 0.7274, + "step": 24360 + }, + { + "epoch": 2.437, + "grad_norm": 40.29648971557617, + "learning_rate": 5.128800000000001e-06, + "loss": 0.4868, + "step": 24370 + }, + { + "epoch": 2.438, + "grad_norm": 9.804975509643555, + "learning_rate": 5.126800000000001e-06, + "loss": 0.6022, + "step": 24380 + }, + { + "epoch": 2.439, + "grad_norm": 18.629207611083984, + "learning_rate": 5.1248000000000005e-06, + "loss": 0.6451, + "step": 24390 + }, + { + "epoch": 2.44, + "grad_norm": 52.884403228759766, + "learning_rate": 5.1228e-06, + "loss": 0.5821, + "step": 24400 + }, + { + "epoch": 2.441, + "grad_norm": 41.7515754699707, + "learning_rate": 5.1208e-06, + "loss": 0.7286, + "step": 24410 + }, + { + "epoch": 2.442, + "grad_norm": 44.79233169555664, + "learning_rate": 5.1188e-06, + "loss": 0.5648, + "step": 24420 + }, + { + "epoch": 2.443, + "grad_norm": 17.482986450195312, + "learning_rate": 5.116800000000001e-06, + "loss": 0.6169, + "step": 24430 + }, + { + "epoch": 2.444, + "grad_norm": 49.76847457885742, + "learning_rate": 5.1148000000000006e-06, + "loss": 0.5268, + "step": 24440 + }, + { + "epoch": 2.445, + "grad_norm": 88.84921264648438, + "learning_rate": 5.1127999999999996e-06, + "loss": 0.9343, + "step": 24450 + }, + { + "epoch": 2.446, + "grad_norm": 52.439720153808594, + "learning_rate": 5.1108e-06, + "loss": 0.6185, + "step": 24460 + }, + { + "epoch": 2.447, + "grad_norm": 3.1405117511749268, + "learning_rate": 5.1090000000000006e-06, + "loss": 0.3354, + "step": 24470 + }, + { + "epoch": 2.448, + "grad_norm": 37.97247314453125, + "learning_rate": 5.107000000000001e-06, + "loss": 0.5834, + "step": 24480 + }, + { + "epoch": 2.449, + "grad_norm": 3.9048521518707275, + "learning_rate": 5.105e-06, + "loss": 0.5524, + "step": 24490 + }, + { + "epoch": 2.45, + "grad_norm": 45.24668502807617, + "learning_rate": 5.103e-06, + "loss": 0.4898, + "step": 24500 + }, + { + "epoch": 2.451, + "grad_norm": 46.744415283203125, + "learning_rate": 5.101000000000001e-06, + "loss": 0.7093, + "step": 24510 + }, + { + "epoch": 2.452, + "grad_norm": 36.82590866088867, + "learning_rate": 5.099000000000001e-06, + "loss": 0.5571, + "step": 24520 + }, + { + "epoch": 2.453, + "grad_norm": 14.219284057617188, + "learning_rate": 5.0970000000000005e-06, + "loss": 0.3158, + "step": 24530 + }, + { + "epoch": 2.454, + "grad_norm": 4.428883075714111, + "learning_rate": 5.095e-06, + "loss": 0.6263, + "step": 24540 + }, + { + "epoch": 2.455, + "grad_norm": 40.982872009277344, + "learning_rate": 5.093e-06, + "loss": 0.7157, + "step": 24550 + }, + { + "epoch": 2.456, + "grad_norm": 36.43859100341797, + "learning_rate": 5.091e-06, + "loss": 0.7041, + "step": 24560 + }, + { + "epoch": 2.457, + "grad_norm": 21.157794952392578, + "learning_rate": 5.089000000000001e-06, + "loss": 0.4459, + "step": 24570 + }, + { + "epoch": 2.458, + "grad_norm": 15.586324691772461, + "learning_rate": 5.087000000000001e-06, + "loss": 0.7045, + "step": 24580 + }, + { + "epoch": 2.459, + "grad_norm": 23.27480697631836, + "learning_rate": 5.085e-06, + "loss": 0.4485, + "step": 24590 + }, + { + "epoch": 2.46, + "grad_norm": 30.3853702545166, + "learning_rate": 5.083e-06, + "loss": 0.7559, + "step": 24600 + }, + { + "epoch": 2.461, + "grad_norm": 62.69881820678711, + "learning_rate": 5.081e-06, + "loss": 0.7755, + "step": 24610 + }, + { + "epoch": 2.462, + "grad_norm": 35.382137298583984, + "learning_rate": 5.079000000000001e-06, + "loss": 0.7302, + "step": 24620 + }, + { + "epoch": 2.463, + "grad_norm": 5.012226104736328, + "learning_rate": 5.077000000000001e-06, + "loss": 0.4293, + "step": 24630 + }, + { + "epoch": 2.464, + "grad_norm": 30.396087646484375, + "learning_rate": 5.075e-06, + "loss": 0.5469, + "step": 24640 + }, + { + "epoch": 2.465, + "grad_norm": 23.97662925720215, + "learning_rate": 5.073e-06, + "loss": 0.4722, + "step": 24650 + }, + { + "epoch": 2.466, + "grad_norm": 61.57948303222656, + "learning_rate": 5.071e-06, + "loss": 0.5139, + "step": 24660 + }, + { + "epoch": 2.467, + "grad_norm": 64.28836822509766, + "learning_rate": 5.069000000000001e-06, + "loss": 0.5281, + "step": 24670 + }, + { + "epoch": 2.468, + "grad_norm": 17.3576602935791, + "learning_rate": 5.067000000000001e-06, + "loss": 0.8598, + "step": 24680 + }, + { + "epoch": 2.469, + "grad_norm": 48.70146942138672, + "learning_rate": 5.065e-06, + "loss": 0.6562, + "step": 24690 + }, + { + "epoch": 2.4699999999999998, + "grad_norm": 55.60419845581055, + "learning_rate": 5.0630000000000005e-06, + "loss": 0.6233, + "step": 24700 + }, + { + "epoch": 2.471, + "grad_norm": 12.617278099060059, + "learning_rate": 5.061e-06, + "loss": 0.5946, + "step": 24710 + }, + { + "epoch": 2.472, + "grad_norm": 10.002517700195312, + "learning_rate": 5.059e-06, + "loss": 0.6976, + "step": 24720 + }, + { + "epoch": 2.473, + "grad_norm": 44.83880615234375, + "learning_rate": 5.057000000000001e-06, + "loss": 0.6083, + "step": 24730 + }, + { + "epoch": 2.474, + "grad_norm": 40.62992858886719, + "learning_rate": 5.055e-06, + "loss": 0.5168, + "step": 24740 + }, + { + "epoch": 2.475, + "grad_norm": 39.006439208984375, + "learning_rate": 5.053e-06, + "loss": 0.6952, + "step": 24750 + }, + { + "epoch": 2.476, + "grad_norm": 22.533584594726562, + "learning_rate": 5.0510000000000004e-06, + "loss": 0.5019, + "step": 24760 + }, + { + "epoch": 2.477, + "grad_norm": 37.3040771484375, + "learning_rate": 5.049e-06, + "loss": 0.5401, + "step": 24770 + }, + { + "epoch": 2.4779999999999998, + "grad_norm": 53.25998306274414, + "learning_rate": 5.047000000000001e-06, + "loss": 0.6991, + "step": 24780 + }, + { + "epoch": 2.479, + "grad_norm": 11.337082862854004, + "learning_rate": 5.045e-06, + "loss": 0.4653, + "step": 24790 + }, + { + "epoch": 2.48, + "grad_norm": 5.860915660858154, + "learning_rate": 5.043e-06, + "loss": 0.7585, + "step": 24800 + }, + { + "epoch": 2.481, + "grad_norm": 41.77631759643555, + "learning_rate": 5.0410000000000005e-06, + "loss": 0.8433, + "step": 24810 + }, + { + "epoch": 2.482, + "grad_norm": 13.934993743896484, + "learning_rate": 5.039e-06, + "loss": 0.5707, + "step": 24820 + }, + { + "epoch": 2.483, + "grad_norm": 69.27035522460938, + "learning_rate": 5.037000000000001e-06, + "loss": 0.6715, + "step": 24830 + }, + { + "epoch": 2.484, + "grad_norm": 19.001020431518555, + "learning_rate": 5.035e-06, + "loss": 0.3987, + "step": 24840 + }, + { + "epoch": 2.485, + "grad_norm": 20.378995895385742, + "learning_rate": 5.033e-06, + "loss": 0.8881, + "step": 24850 + }, + { + "epoch": 2.4859999999999998, + "grad_norm": 36.096031188964844, + "learning_rate": 5.031000000000001e-06, + "loss": 0.6329, + "step": 24860 + }, + { + "epoch": 2.487, + "grad_norm": 53.7542610168457, + "learning_rate": 5.0290000000000005e-06, + "loss": 0.6449, + "step": 24870 + }, + { + "epoch": 2.488, + "grad_norm": 24.860681533813477, + "learning_rate": 5.027000000000001e-06, + "loss": 0.3337, + "step": 24880 + }, + { + "epoch": 2.489, + "grad_norm": 51.24372482299805, + "learning_rate": 5.025e-06, + "loss": 1.04, + "step": 24890 + }, + { + "epoch": 2.49, + "grad_norm": 30.71172523498535, + "learning_rate": 5.023e-06, + "loss": 0.6922, + "step": 24900 + }, + { + "epoch": 2.491, + "grad_norm": 29.371429443359375, + "learning_rate": 5.021000000000001e-06, + "loss": 0.6405, + "step": 24910 + }, + { + "epoch": 2.492, + "grad_norm": 26.474454879760742, + "learning_rate": 5.0190000000000006e-06, + "loss": 0.2826, + "step": 24920 + }, + { + "epoch": 2.493, + "grad_norm": 43.77047348022461, + "learning_rate": 5.017e-06, + "loss": 0.9044, + "step": 24930 + }, + { + "epoch": 2.4939999999999998, + "grad_norm": 36.24064254760742, + "learning_rate": 5.015e-06, + "loss": 0.5805, + "step": 24940 + }, + { + "epoch": 2.495, + "grad_norm": 3.8243775367736816, + "learning_rate": 5.013e-06, + "loss": 0.7752, + "step": 24950 + }, + { + "epoch": 2.496, + "grad_norm": 29.713932037353516, + "learning_rate": 5.011e-06, + "loss": 0.5483, + "step": 24960 + }, + { + "epoch": 2.497, + "grad_norm": 31.567214965820312, + "learning_rate": 5.009000000000001e-06, + "loss": 0.5147, + "step": 24970 + }, + { + "epoch": 2.498, + "grad_norm": 28.092866897583008, + "learning_rate": 5.0070000000000005e-06, + "loss": 0.8079, + "step": 24980 + }, + { + "epoch": 2.499, + "grad_norm": 12.45438003540039, + "learning_rate": 5.0049999999999995e-06, + "loss": 0.5923, + "step": 24990 + }, + { + "epoch": 2.5, + "grad_norm": 41.411476135253906, + "learning_rate": 5.003e-06, + "loss": 0.5853, + "step": 25000 + }, + { + "epoch": 2.501, + "grad_norm": 26.281492233276367, + "learning_rate": 5.001e-06, + "loss": 0.7854, + "step": 25010 + }, + { + "epoch": 2.502, + "grad_norm": 31.148704528808594, + "learning_rate": 4.999000000000001e-06, + "loss": 0.5544, + "step": 25020 + }, + { + "epoch": 2.503, + "grad_norm": 19.910507202148438, + "learning_rate": 4.997000000000001e-06, + "loss": 0.6938, + "step": 25030 + }, + { + "epoch": 2.504, + "grad_norm": 13.199621200561523, + "learning_rate": 4.9950000000000005e-06, + "loss": 0.6362, + "step": 25040 + }, + { + "epoch": 2.505, + "grad_norm": 43.331302642822266, + "learning_rate": 4.993e-06, + "loss": 0.6495, + "step": 25050 + }, + { + "epoch": 2.5060000000000002, + "grad_norm": 4.908505439758301, + "learning_rate": 4.991e-06, + "loss": 0.7268, + "step": 25060 + }, + { + "epoch": 2.507, + "grad_norm": 11.888862609863281, + "learning_rate": 4.989000000000001e-06, + "loss": 0.6008, + "step": 25070 + }, + { + "epoch": 2.508, + "grad_norm": 8.154284477233887, + "learning_rate": 4.987e-06, + "loss": 0.5632, + "step": 25080 + }, + { + "epoch": 2.509, + "grad_norm": 38.40000534057617, + "learning_rate": 4.9850000000000006e-06, + "loss": 0.9932, + "step": 25090 + }, + { + "epoch": 2.51, + "grad_norm": 4.843054294586182, + "learning_rate": 4.983e-06, + "loss": 0.5249, + "step": 25100 + }, + { + "epoch": 2.511, + "grad_norm": 24.743989944458008, + "learning_rate": 4.981e-06, + "loss": 0.6258, + "step": 25110 + }, + { + "epoch": 2.512, + "grad_norm": 41.41858673095703, + "learning_rate": 4.979e-06, + "loss": 0.7094, + "step": 25120 + }, + { + "epoch": 2.513, + "grad_norm": 37.239830017089844, + "learning_rate": 4.977e-06, + "loss": 0.7222, + "step": 25130 + }, + { + "epoch": 2.5140000000000002, + "grad_norm": 16.851455688476562, + "learning_rate": 4.975000000000001e-06, + "loss": 0.6925, + "step": 25140 + }, + { + "epoch": 2.515, + "grad_norm": 21.424264907836914, + "learning_rate": 4.9730000000000005e-06, + "loss": 0.6786, + "step": 25150 + }, + { + "epoch": 2.516, + "grad_norm": 37.35005569458008, + "learning_rate": 4.971e-06, + "loss": 0.5343, + "step": 25160 + }, + { + "epoch": 2.517, + "grad_norm": 12.154038429260254, + "learning_rate": 4.969e-06, + "loss": 0.649, + "step": 25170 + }, + { + "epoch": 2.518, + "grad_norm": 21.416759490966797, + "learning_rate": 4.967e-06, + "loss": 0.5783, + "step": 25180 + }, + { + "epoch": 2.519, + "grad_norm": 15.972697257995605, + "learning_rate": 4.965000000000001e-06, + "loss": 0.8398, + "step": 25190 + }, + { + "epoch": 2.52, + "grad_norm": 35.27529525756836, + "learning_rate": 4.963000000000001e-06, + "loss": 0.6247, + "step": 25200 + }, + { + "epoch": 2.521, + "grad_norm": 59.43956756591797, + "learning_rate": 4.9610000000000004e-06, + "loss": 0.797, + "step": 25210 + }, + { + "epoch": 2.5220000000000002, + "grad_norm": 8.644207954406738, + "learning_rate": 4.959e-06, + "loss": 0.4781, + "step": 25220 + }, + { + "epoch": 2.523, + "grad_norm": 32.61981964111328, + "learning_rate": 4.957e-06, + "loss": 0.9126, + "step": 25230 + }, + { + "epoch": 2.524, + "grad_norm": 52.15250778198242, + "learning_rate": 4.955e-06, + "loss": 0.532, + "step": 25240 + }, + { + "epoch": 2.525, + "grad_norm": 23.468826293945312, + "learning_rate": 4.953000000000001e-06, + "loss": 0.7001, + "step": 25250 + }, + { + "epoch": 2.526, + "grad_norm": 12.018453598022461, + "learning_rate": 4.9510000000000005e-06, + "loss": 0.5967, + "step": 25260 + }, + { + "epoch": 2.527, + "grad_norm": 35.626129150390625, + "learning_rate": 4.949e-06, + "loss": 0.5483, + "step": 25270 + }, + { + "epoch": 2.528, + "grad_norm": 21.014564514160156, + "learning_rate": 4.947e-06, + "loss": 0.7496, + "step": 25280 + }, + { + "epoch": 2.529, + "grad_norm": 35.65088653564453, + "learning_rate": 4.945e-06, + "loss": 0.6784, + "step": 25290 + }, + { + "epoch": 2.5300000000000002, + "grad_norm": 21.193424224853516, + "learning_rate": 4.943000000000001e-06, + "loss": 0.9482, + "step": 25300 + }, + { + "epoch": 2.531, + "grad_norm": 94.68802642822266, + "learning_rate": 4.941000000000001e-06, + "loss": 0.9929, + "step": 25310 + }, + { + "epoch": 2.532, + "grad_norm": 15.581934928894043, + "learning_rate": 4.9390000000000005e-06, + "loss": 0.7156, + "step": 25320 + }, + { + "epoch": 2.533, + "grad_norm": 29.984956741333008, + "learning_rate": 4.937e-06, + "loss": 0.5357, + "step": 25330 + }, + { + "epoch": 2.534, + "grad_norm": 54.73834991455078, + "learning_rate": 4.935e-06, + "loss": 0.8771, + "step": 25340 + }, + { + "epoch": 2.535, + "grad_norm": 14.674304962158203, + "learning_rate": 4.933000000000001e-06, + "loss": 0.6677, + "step": 25350 + }, + { + "epoch": 2.536, + "grad_norm": 17.07393455505371, + "learning_rate": 4.931e-06, + "loss": 0.6853, + "step": 25360 + }, + { + "epoch": 2.537, + "grad_norm": 7.97967529296875, + "learning_rate": 4.929000000000001e-06, + "loss": 0.4164, + "step": 25370 + }, + { + "epoch": 2.5380000000000003, + "grad_norm": 31.6351261138916, + "learning_rate": 4.9270000000000004e-06, + "loss": 0.6275, + "step": 25380 + }, + { + "epoch": 2.539, + "grad_norm": 49.94380187988281, + "learning_rate": 4.925e-06, + "loss": 0.7284, + "step": 25390 + }, + { + "epoch": 2.54, + "grad_norm": 32.98598861694336, + "learning_rate": 4.923000000000001e-06, + "loss": 0.7223, + "step": 25400 + }, + { + "epoch": 2.541, + "grad_norm": 81.89440155029297, + "learning_rate": 4.921e-06, + "loss": 0.7486, + "step": 25410 + }, + { + "epoch": 2.542, + "grad_norm": 34.58148193359375, + "learning_rate": 4.919000000000001e-06, + "loss": 0.6063, + "step": 25420 + }, + { + "epoch": 2.543, + "grad_norm": 23.96933364868164, + "learning_rate": 4.9170000000000005e-06, + "loss": 0.6273, + "step": 25430 + }, + { + "epoch": 2.544, + "grad_norm": 2.428050994873047, + "learning_rate": 4.915e-06, + "loss": 0.4913, + "step": 25440 + }, + { + "epoch": 2.545, + "grad_norm": 26.93573570251465, + "learning_rate": 4.913e-06, + "loss": 0.889, + "step": 25450 + }, + { + "epoch": 2.5460000000000003, + "grad_norm": 7.789152145385742, + "learning_rate": 4.911e-06, + "loss": 0.3632, + "step": 25460 + }, + { + "epoch": 2.547, + "grad_norm": 18.7591495513916, + "learning_rate": 4.909000000000001e-06, + "loss": 0.6128, + "step": 25470 + }, + { + "epoch": 2.548, + "grad_norm": 21.817312240600586, + "learning_rate": 4.907000000000001e-06, + "loss": 0.7535, + "step": 25480 + }, + { + "epoch": 2.549, + "grad_norm": 22.82774543762207, + "learning_rate": 4.9050000000000005e-06, + "loss": 0.9183, + "step": 25490 + }, + { + "epoch": 2.55, + "grad_norm": 27.486326217651367, + "learning_rate": 4.903e-06, + "loss": 0.6347, + "step": 25500 + }, + { + "epoch": 2.551, + "grad_norm": 18.89615821838379, + "learning_rate": 4.901e-06, + "loss": 0.6479, + "step": 25510 + }, + { + "epoch": 2.552, + "grad_norm": 39.137046813964844, + "learning_rate": 4.899e-06, + "loss": 0.5722, + "step": 25520 + }, + { + "epoch": 2.553, + "grad_norm": 17.233186721801758, + "learning_rate": 4.897000000000001e-06, + "loss": 0.7472, + "step": 25530 + }, + { + "epoch": 2.5540000000000003, + "grad_norm": 15.910088539123535, + "learning_rate": 4.8950000000000006e-06, + "loss": 0.5386, + "step": 25540 + }, + { + "epoch": 2.555, + "grad_norm": 36.60475158691406, + "learning_rate": 4.893e-06, + "loss": 0.7298, + "step": 25550 + }, + { + "epoch": 2.556, + "grad_norm": 37.67909240722656, + "learning_rate": 4.891e-06, + "loss": 0.6772, + "step": 25560 + }, + { + "epoch": 2.557, + "grad_norm": 6.002618789672852, + "learning_rate": 4.889e-06, + "loss": 0.4854, + "step": 25570 + }, + { + "epoch": 2.558, + "grad_norm": 22.503517150878906, + "learning_rate": 4.887000000000001e-06, + "loss": 0.7424, + "step": 25580 + }, + { + "epoch": 2.559, + "grad_norm": 38.0008430480957, + "learning_rate": 4.885000000000001e-06, + "loss": 0.686, + "step": 25590 + }, + { + "epoch": 2.56, + "grad_norm": 31.778282165527344, + "learning_rate": 4.8830000000000005e-06, + "loss": 0.6194, + "step": 25600 + }, + { + "epoch": 2.561, + "grad_norm": 39.77953338623047, + "learning_rate": 4.881e-06, + "loss": 0.6395, + "step": 25610 + }, + { + "epoch": 2.5620000000000003, + "grad_norm": 5.725687503814697, + "learning_rate": 4.879e-06, + "loss": 0.41, + "step": 25620 + }, + { + "epoch": 2.5629999999999997, + "grad_norm": 28.99380874633789, + "learning_rate": 4.877000000000001e-06, + "loss": 0.5985, + "step": 25630 + }, + { + "epoch": 2.564, + "grad_norm": 18.07556915283203, + "learning_rate": 4.875e-06, + "loss": 0.7484, + "step": 25640 + }, + { + "epoch": 2.565, + "grad_norm": 17.21390724182129, + "learning_rate": 4.873000000000001e-06, + "loss": 0.5817, + "step": 25650 + }, + { + "epoch": 2.566, + "grad_norm": 11.937161445617676, + "learning_rate": 4.8710000000000005e-06, + "loss": 0.4783, + "step": 25660 + }, + { + "epoch": 2.567, + "grad_norm": 18.838775634765625, + "learning_rate": 4.869e-06, + "loss": 0.7863, + "step": 25670 + }, + { + "epoch": 2.568, + "grad_norm": 35.052398681640625, + "learning_rate": 4.867000000000001e-06, + "loss": 0.5906, + "step": 25680 + }, + { + "epoch": 2.569, + "grad_norm": 18.534929275512695, + "learning_rate": 4.865e-06, + "loss": 0.6528, + "step": 25690 + }, + { + "epoch": 2.57, + "grad_norm": 15.753058433532715, + "learning_rate": 4.863000000000001e-06, + "loss": 0.7879, + "step": 25700 + }, + { + "epoch": 2.5709999999999997, + "grad_norm": 25.699085235595703, + "learning_rate": 4.8610000000000006e-06, + "loss": 0.5737, + "step": 25710 + }, + { + "epoch": 2.572, + "grad_norm": 28.720382690429688, + "learning_rate": 4.859e-06, + "loss": 0.6099, + "step": 25720 + }, + { + "epoch": 2.573, + "grad_norm": 7.138073921203613, + "learning_rate": 4.857e-06, + "loss": 0.6004, + "step": 25730 + }, + { + "epoch": 2.574, + "grad_norm": 42.17637252807617, + "learning_rate": 4.855e-06, + "loss": 0.7334, + "step": 25740 + }, + { + "epoch": 2.575, + "grad_norm": 19.05852508544922, + "learning_rate": 4.853000000000001e-06, + "loss": 0.6474, + "step": 25750 + }, + { + "epoch": 2.576, + "grad_norm": 24.66946029663086, + "learning_rate": 4.851e-06, + "loss": 0.7538, + "step": 25760 + }, + { + "epoch": 2.577, + "grad_norm": 25.733699798583984, + "learning_rate": 4.8490000000000005e-06, + "loss": 0.6507, + "step": 25770 + }, + { + "epoch": 2.578, + "grad_norm": 6.0958709716796875, + "learning_rate": 4.847e-06, + "loss": 0.3898, + "step": 25780 + }, + { + "epoch": 2.5789999999999997, + "grad_norm": 9.826627731323242, + "learning_rate": 4.845e-06, + "loss": 0.8563, + "step": 25790 + }, + { + "epoch": 2.58, + "grad_norm": 20.660486221313477, + "learning_rate": 4.843000000000001e-06, + "loss": 0.6671, + "step": 25800 + }, + { + "epoch": 2.581, + "grad_norm": 35.543087005615234, + "learning_rate": 4.841e-06, + "loss": 0.6269, + "step": 25810 + }, + { + "epoch": 2.582, + "grad_norm": 9.25235366821289, + "learning_rate": 4.839000000000001e-06, + "loss": 0.6317, + "step": 25820 + }, + { + "epoch": 2.583, + "grad_norm": 11.168290138244629, + "learning_rate": 4.8370000000000004e-06, + "loss": 0.6362, + "step": 25830 + }, + { + "epoch": 2.584, + "grad_norm": 36.224430084228516, + "learning_rate": 4.835e-06, + "loss": 0.5701, + "step": 25840 + }, + { + "epoch": 2.585, + "grad_norm": 35.46279525756836, + "learning_rate": 4.833e-06, + "loss": 0.5058, + "step": 25850 + }, + { + "epoch": 2.586, + "grad_norm": 11.398242950439453, + "learning_rate": 4.831e-06, + "loss": 0.5375, + "step": 25860 + }, + { + "epoch": 2.5869999999999997, + "grad_norm": 35.309444427490234, + "learning_rate": 4.829000000000001e-06, + "loss": 0.8348, + "step": 25870 + }, + { + "epoch": 2.588, + "grad_norm": 9.43244743347168, + "learning_rate": 4.8270000000000005e-06, + "loss": 0.5513, + "step": 25880 + }, + { + "epoch": 2.589, + "grad_norm": 37.240135192871094, + "learning_rate": 4.825e-06, + "loss": 0.6013, + "step": 25890 + }, + { + "epoch": 2.59, + "grad_norm": 47.575035095214844, + "learning_rate": 4.823e-06, + "loss": 0.7614, + "step": 25900 + }, + { + "epoch": 2.591, + "grad_norm": 8.541173934936523, + "learning_rate": 4.821e-06, + "loss": 0.4809, + "step": 25910 + }, + { + "epoch": 2.592, + "grad_norm": 22.348037719726562, + "learning_rate": 4.819e-06, + "loss": 0.582, + "step": 25920 + }, + { + "epoch": 2.593, + "grad_norm": 2.8846850395202637, + "learning_rate": 4.817000000000001e-06, + "loss": 0.4284, + "step": 25930 + }, + { + "epoch": 2.594, + "grad_norm": 15.262609481811523, + "learning_rate": 4.8150000000000005e-06, + "loss": 0.6563, + "step": 25940 + }, + { + "epoch": 2.5949999999999998, + "grad_norm": 42.77225112915039, + "learning_rate": 4.813e-06, + "loss": 0.7155, + "step": 25950 + }, + { + "epoch": 2.596, + "grad_norm": 38.20702362060547, + "learning_rate": 4.811000000000001e-06, + "loss": 0.6124, + "step": 25960 + }, + { + "epoch": 2.597, + "grad_norm": 16.295507431030273, + "learning_rate": 4.809e-06, + "loss": 0.5062, + "step": 25970 + }, + { + "epoch": 2.598, + "grad_norm": 45.26594543457031, + "learning_rate": 4.807000000000001e-06, + "loss": 0.7953, + "step": 25980 + }, + { + "epoch": 2.599, + "grad_norm": 47.709781646728516, + "learning_rate": 4.805000000000001e-06, + "loss": 0.566, + "step": 25990 + }, + { + "epoch": 2.6, + "grad_norm": 18.982378005981445, + "learning_rate": 4.8030000000000004e-06, + "loss": 0.5725, + "step": 26000 + }, + { + "epoch": 2.601, + "grad_norm": 44.133052825927734, + "learning_rate": 4.801e-06, + "loss": 0.5826, + "step": 26010 + }, + { + "epoch": 2.602, + "grad_norm": 15.568506240844727, + "learning_rate": 4.799e-06, + "loss": 0.5797, + "step": 26020 + }, + { + "epoch": 2.6029999999999998, + "grad_norm": 33.09275436401367, + "learning_rate": 4.797000000000001e-06, + "loss": 0.6618, + "step": 26030 + }, + { + "epoch": 2.604, + "grad_norm": 41.33121871948242, + "learning_rate": 4.795e-06, + "loss": 1.0158, + "step": 26040 + }, + { + "epoch": 2.605, + "grad_norm": 51.52129364013672, + "learning_rate": 4.7930000000000005e-06, + "loss": 0.6314, + "step": 26050 + }, + { + "epoch": 2.606, + "grad_norm": 6.395187854766846, + "learning_rate": 4.791e-06, + "loss": 0.5432, + "step": 26060 + }, + { + "epoch": 2.607, + "grad_norm": 39.273780822753906, + "learning_rate": 4.789e-06, + "loss": 0.5501, + "step": 26070 + }, + { + "epoch": 2.608, + "grad_norm": 35.81938934326172, + "learning_rate": 4.787000000000001e-06, + "loss": 0.3602, + "step": 26080 + }, + { + "epoch": 2.609, + "grad_norm": 35.3363151550293, + "learning_rate": 4.785e-06, + "loss": 0.6854, + "step": 26090 + }, + { + "epoch": 2.61, + "grad_norm": 45.12687301635742, + "learning_rate": 4.783000000000001e-06, + "loss": 0.6236, + "step": 26100 + }, + { + "epoch": 2.6109999999999998, + "grad_norm": 37.093116760253906, + "learning_rate": 4.7810000000000005e-06, + "loss": 0.9301, + "step": 26110 + }, + { + "epoch": 2.612, + "grad_norm": 13.176533699035645, + "learning_rate": 4.779e-06, + "loss": 0.492, + "step": 26120 + }, + { + "epoch": 2.613, + "grad_norm": 48.988155364990234, + "learning_rate": 4.777e-06, + "loss": 0.8739, + "step": 26130 + }, + { + "epoch": 2.614, + "grad_norm": 17.3516788482666, + "learning_rate": 4.775e-06, + "loss": 0.5989, + "step": 26140 + }, + { + "epoch": 2.615, + "grad_norm": 3.241021156311035, + "learning_rate": 4.773000000000001e-06, + "loss": 0.6567, + "step": 26150 + }, + { + "epoch": 2.616, + "grad_norm": 27.495573043823242, + "learning_rate": 4.7710000000000006e-06, + "loss": 0.4817, + "step": 26160 + }, + { + "epoch": 2.617, + "grad_norm": 45.31413650512695, + "learning_rate": 4.769e-06, + "loss": 0.5875, + "step": 26170 + }, + { + "epoch": 2.618, + "grad_norm": 43.09683609008789, + "learning_rate": 4.767e-06, + "loss": 0.752, + "step": 26180 + }, + { + "epoch": 2.6189999999999998, + "grad_norm": 17.2843074798584, + "learning_rate": 4.765e-06, + "loss": 0.6858, + "step": 26190 + }, + { + "epoch": 2.62, + "grad_norm": 47.570281982421875, + "learning_rate": 4.763000000000001e-06, + "loss": 0.6619, + "step": 26200 + }, + { + "epoch": 2.621, + "grad_norm": 2.8026413917541504, + "learning_rate": 4.761000000000001e-06, + "loss": 0.6908, + "step": 26210 + }, + { + "epoch": 2.622, + "grad_norm": 51.79669952392578, + "learning_rate": 4.7590000000000005e-06, + "loss": 0.5145, + "step": 26220 + }, + { + "epoch": 2.623, + "grad_norm": 19.143081665039062, + "learning_rate": 4.757e-06, + "loss": 0.7878, + "step": 26230 + }, + { + "epoch": 2.624, + "grad_norm": 39.0579948425293, + "learning_rate": 4.755e-06, + "loss": 0.5953, + "step": 26240 + }, + { + "epoch": 2.625, + "grad_norm": 52.51986312866211, + "learning_rate": 4.753e-06, + "loss": 0.5562, + "step": 26250 + }, + { + "epoch": 2.626, + "grad_norm": 53.69147872924805, + "learning_rate": 4.751000000000001e-06, + "loss": 0.6106, + "step": 26260 + }, + { + "epoch": 2.627, + "grad_norm": 38.23792266845703, + "learning_rate": 4.749000000000001e-06, + "loss": 0.5109, + "step": 26270 + }, + { + "epoch": 2.628, + "grad_norm": 46.235511779785156, + "learning_rate": 4.7470000000000005e-06, + "loss": 0.5813, + "step": 26280 + }, + { + "epoch": 2.629, + "grad_norm": 19.168806076049805, + "learning_rate": 4.745e-06, + "loss": 0.3629, + "step": 26290 + }, + { + "epoch": 2.63, + "grad_norm": 5.286655426025391, + "learning_rate": 4.743e-06, + "loss": 0.3528, + "step": 26300 + }, + { + "epoch": 2.6310000000000002, + "grad_norm": 52.93207550048828, + "learning_rate": 4.741000000000001e-06, + "loss": 0.6646, + "step": 26310 + }, + { + "epoch": 2.632, + "grad_norm": 6.3632588386535645, + "learning_rate": 4.739e-06, + "loss": 0.2838, + "step": 26320 + }, + { + "epoch": 2.633, + "grad_norm": 42.63784408569336, + "learning_rate": 4.7370000000000006e-06, + "loss": 0.4457, + "step": 26330 + }, + { + "epoch": 2.634, + "grad_norm": 37.047508239746094, + "learning_rate": 4.735e-06, + "loss": 0.8789, + "step": 26340 + }, + { + "epoch": 2.635, + "grad_norm": 4.293346881866455, + "learning_rate": 4.733e-06, + "loss": 0.41, + "step": 26350 + }, + { + "epoch": 2.636, + "grad_norm": 36.892662048339844, + "learning_rate": 4.731000000000001e-06, + "loss": 0.7638, + "step": 26360 + }, + { + "epoch": 2.637, + "grad_norm": 36.081783294677734, + "learning_rate": 4.729e-06, + "loss": 0.9168, + "step": 26370 + }, + { + "epoch": 2.638, + "grad_norm": 3.6100282669067383, + "learning_rate": 4.727000000000001e-06, + "loss": 0.6065, + "step": 26380 + }, + { + "epoch": 2.6390000000000002, + "grad_norm": 51.46051025390625, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.6926, + "step": 26390 + }, + { + "epoch": 2.64, + "grad_norm": 15.446451187133789, + "learning_rate": 4.723e-06, + "loss": 0.5111, + "step": 26400 + }, + { + "epoch": 2.641, + "grad_norm": 59.62565612792969, + "learning_rate": 4.721e-06, + "loss": 0.7106, + "step": 26410 + }, + { + "epoch": 2.642, + "grad_norm": 19.761140823364258, + "learning_rate": 4.719e-06, + "loss": 0.6368, + "step": 26420 + }, + { + "epoch": 2.643, + "grad_norm": 47.500484466552734, + "learning_rate": 4.717000000000001e-06, + "loss": 0.5708, + "step": 26430 + }, + { + "epoch": 2.644, + "grad_norm": 27.15022850036621, + "learning_rate": 4.715e-06, + "loss": 0.6257, + "step": 26440 + }, + { + "epoch": 2.645, + "grad_norm": 45.335968017578125, + "learning_rate": 4.7130000000000004e-06, + "loss": 0.5052, + "step": 26450 + }, + { + "epoch": 2.646, + "grad_norm": 38.73992919921875, + "learning_rate": 4.711e-06, + "loss": 0.6991, + "step": 26460 + }, + { + "epoch": 2.6470000000000002, + "grad_norm": 37.85356140136719, + "learning_rate": 4.709e-06, + "loss": 0.6061, + "step": 26470 + }, + { + "epoch": 2.648, + "grad_norm": 24.0386962890625, + "learning_rate": 4.707000000000001e-06, + "loss": 0.7666, + "step": 26480 + }, + { + "epoch": 2.649, + "grad_norm": 28.294811248779297, + "learning_rate": 4.705e-06, + "loss": 0.5678, + "step": 26490 + }, + { + "epoch": 2.65, + "grad_norm": 41.239707946777344, + "learning_rate": 4.7030000000000005e-06, + "loss": 0.8175, + "step": 26500 + }, + { + "epoch": 2.651, + "grad_norm": 62.72034454345703, + "learning_rate": 4.701e-06, + "loss": 0.5212, + "step": 26510 + }, + { + "epoch": 2.652, + "grad_norm": 38.574623107910156, + "learning_rate": 4.699e-06, + "loss": 0.7919, + "step": 26520 + }, + { + "epoch": 2.653, + "grad_norm": 11.876582145690918, + "learning_rate": 4.697e-06, + "loss": 0.7385, + "step": 26530 + }, + { + "epoch": 2.654, + "grad_norm": 14.082344055175781, + "learning_rate": 4.695e-06, + "loss": 0.3954, + "step": 26540 + }, + { + "epoch": 2.6550000000000002, + "grad_norm": 6.629438877105713, + "learning_rate": 4.693000000000001e-06, + "loss": 0.5655, + "step": 26550 + }, + { + "epoch": 2.656, + "grad_norm": 31.33914566040039, + "learning_rate": 4.6910000000000005e-06, + "loss": 0.7101, + "step": 26560 + }, + { + "epoch": 2.657, + "grad_norm": 32.01548767089844, + "learning_rate": 4.689e-06, + "loss": 0.7707, + "step": 26570 + }, + { + "epoch": 2.658, + "grad_norm": 20.46829605102539, + "learning_rate": 4.687e-06, + "loss": 0.585, + "step": 26580 + }, + { + "epoch": 2.659, + "grad_norm": 21.68409538269043, + "learning_rate": 4.685000000000001e-06, + "loss": 0.6834, + "step": 26590 + }, + { + "epoch": 2.66, + "grad_norm": 53.09114456176758, + "learning_rate": 4.683000000000001e-06, + "loss": 0.6068, + "step": 26600 + }, + { + "epoch": 2.661, + "grad_norm": 7.67567253112793, + "learning_rate": 4.681000000000001e-06, + "loss": 0.4884, + "step": 26610 + }, + { + "epoch": 2.662, + "grad_norm": 19.89399528503418, + "learning_rate": 4.6790000000000004e-06, + "loss": 0.7026, + "step": 26620 + }, + { + "epoch": 2.6630000000000003, + "grad_norm": 22.56330108642578, + "learning_rate": 4.677e-06, + "loss": 0.6527, + "step": 26630 + }, + { + "epoch": 2.664, + "grad_norm": 7.4913105964660645, + "learning_rate": 4.675000000000001e-06, + "loss": 0.5722, + "step": 26640 + }, + { + "epoch": 2.665, + "grad_norm": 21.277095794677734, + "learning_rate": 4.673e-06, + "loss": 0.6265, + "step": 26650 + }, + { + "epoch": 2.666, + "grad_norm": 10.212080955505371, + "learning_rate": 4.671000000000001e-06, + "loss": 0.6366, + "step": 26660 + }, + { + "epoch": 2.667, + "grad_norm": 73.72460174560547, + "learning_rate": 4.6690000000000005e-06, + "loss": 0.5917, + "step": 26670 + }, + { + "epoch": 2.668, + "grad_norm": 36.61571502685547, + "learning_rate": 4.667e-06, + "loss": 0.6464, + "step": 26680 + }, + { + "epoch": 2.669, + "grad_norm": 42.48518371582031, + "learning_rate": 4.665e-06, + "loss": 0.4937, + "step": 26690 + }, + { + "epoch": 2.67, + "grad_norm": 61.28166580200195, + "learning_rate": 4.663e-06, + "loss": 0.7738, + "step": 26700 + }, + { + "epoch": 2.6710000000000003, + "grad_norm": 6.222008228302002, + "learning_rate": 4.661000000000001e-06, + "loss": 0.2444, + "step": 26710 + }, + { + "epoch": 2.672, + "grad_norm": 38.14292907714844, + "learning_rate": 4.659e-06, + "loss": 0.5493, + "step": 26720 + }, + { + "epoch": 2.673, + "grad_norm": 34.81486129760742, + "learning_rate": 4.6570000000000005e-06, + "loss": 0.6373, + "step": 26730 + }, + { + "epoch": 2.674, + "grad_norm": 22.477020263671875, + "learning_rate": 4.655e-06, + "loss": 0.7369, + "step": 26740 + }, + { + "epoch": 2.675, + "grad_norm": 41.28529357910156, + "learning_rate": 4.653e-06, + "loss": 0.4623, + "step": 26750 + }, + { + "epoch": 2.676, + "grad_norm": 34.939598083496094, + "learning_rate": 4.651000000000001e-06, + "loss": 0.5825, + "step": 26760 + }, + { + "epoch": 2.677, + "grad_norm": 17.477685928344727, + "learning_rate": 4.649e-06, + "loss": 0.4918, + "step": 26770 + }, + { + "epoch": 2.678, + "grad_norm": 19.78916358947754, + "learning_rate": 4.6470000000000006e-06, + "loss": 0.7126, + "step": 26780 + }, + { + "epoch": 2.6790000000000003, + "grad_norm": 33.3621826171875, + "learning_rate": 4.645e-06, + "loss": 0.7059, + "step": 26790 + }, + { + "epoch": 2.68, + "grad_norm": 32.21095657348633, + "learning_rate": 4.643e-06, + "loss": 0.5107, + "step": 26800 + }, + { + "epoch": 2.681, + "grad_norm": 21.85660171508789, + "learning_rate": 4.641e-06, + "loss": 0.595, + "step": 26810 + }, + { + "epoch": 2.682, + "grad_norm": 29.33839988708496, + "learning_rate": 4.639e-06, + "loss": 0.5435, + "step": 26820 + }, + { + "epoch": 2.683, + "grad_norm": 19.039310455322266, + "learning_rate": 4.637000000000001e-06, + "loss": 0.7826, + "step": 26830 + }, + { + "epoch": 2.684, + "grad_norm": 31.831010818481445, + "learning_rate": 4.6350000000000005e-06, + "loss": 0.9578, + "step": 26840 + }, + { + "epoch": 2.685, + "grad_norm": 12.974231719970703, + "learning_rate": 4.633e-06, + "loss": 0.6917, + "step": 26850 + }, + { + "epoch": 2.686, + "grad_norm": 42.49304962158203, + "learning_rate": 4.631e-06, + "loss": 0.6856, + "step": 26860 + }, + { + "epoch": 2.6870000000000003, + "grad_norm": 5.649697780609131, + "learning_rate": 4.629e-06, + "loss": 0.4211, + "step": 26870 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 28.178577423095703, + "learning_rate": 4.627000000000001e-06, + "loss": 0.514, + "step": 26880 + }, + { + "epoch": 2.689, + "grad_norm": 14.685444831848145, + "learning_rate": 4.625000000000001e-06, + "loss": 0.6962, + "step": 26890 + }, + { + "epoch": 2.69, + "grad_norm": 32.58005142211914, + "learning_rate": 4.6230000000000005e-06, + "loss": 0.6088, + "step": 26900 + }, + { + "epoch": 2.691, + "grad_norm": 58.40959548950195, + "learning_rate": 4.621e-06, + "loss": 0.5932, + "step": 26910 + }, + { + "epoch": 2.692, + "grad_norm": 2.781147003173828, + "learning_rate": 4.619e-06, + "loss": 0.5929, + "step": 26920 + }, + { + "epoch": 2.693, + "grad_norm": 43.64352798461914, + "learning_rate": 4.617e-06, + "loss": 0.678, + "step": 26930 + }, + { + "epoch": 2.694, + "grad_norm": 39.059879302978516, + "learning_rate": 4.615000000000001e-06, + "loss": 0.7061, + "step": 26940 + }, + { + "epoch": 2.695, + "grad_norm": 36.203399658203125, + "learning_rate": 4.6130000000000006e-06, + "loss": 0.4879, + "step": 26950 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 53.63970184326172, + "learning_rate": 4.611e-06, + "loss": 0.6703, + "step": 26960 + }, + { + "epoch": 2.697, + "grad_norm": 4.569289207458496, + "learning_rate": 4.609e-06, + "loss": 0.5606, + "step": 26970 + }, + { + "epoch": 2.698, + "grad_norm": 25.914350509643555, + "learning_rate": 4.607e-06, + "loss": 0.5693, + "step": 26980 + }, + { + "epoch": 2.699, + "grad_norm": 15.915177345275879, + "learning_rate": 4.605000000000001e-06, + "loss": 0.4112, + "step": 26990 + }, + { + "epoch": 2.7, + "grad_norm": 57.12289810180664, + "learning_rate": 4.603000000000001e-06, + "loss": 0.6674, + "step": 27000 + }, + { + "epoch": 2.701, + "grad_norm": 5.237196445465088, + "learning_rate": 4.6010000000000005e-06, + "loss": 0.6634, + "step": 27010 + }, + { + "epoch": 2.702, + "grad_norm": 29.42719078063965, + "learning_rate": 4.599e-06, + "loss": 0.5788, + "step": 27020 + }, + { + "epoch": 2.703, + "grad_norm": 43.75193405151367, + "learning_rate": 4.597e-06, + "loss": 0.7219, + "step": 27030 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 21.26077651977539, + "learning_rate": 4.595000000000001e-06, + "loss": 0.6192, + "step": 27040 + }, + { + "epoch": 2.705, + "grad_norm": 39.62782287597656, + "learning_rate": 4.593e-06, + "loss": 0.7536, + "step": 27050 + }, + { + "epoch": 2.706, + "grad_norm": 19.186992645263672, + "learning_rate": 4.591000000000001e-06, + "loss": 0.6155, + "step": 27060 + }, + { + "epoch": 2.707, + "grad_norm": 17.996379852294922, + "learning_rate": 4.5890000000000004e-06, + "loss": 0.5156, + "step": 27070 + }, + { + "epoch": 2.708, + "grad_norm": 59.57747268676758, + "learning_rate": 4.587e-06, + "loss": 0.5751, + "step": 27080 + }, + { + "epoch": 2.709, + "grad_norm": 28.020484924316406, + "learning_rate": 4.585e-06, + "loss": 0.6283, + "step": 27090 + }, + { + "epoch": 2.71, + "grad_norm": 16.961652755737305, + "learning_rate": 4.583e-06, + "loss": 0.5277, + "step": 27100 + }, + { + "epoch": 2.711, + "grad_norm": 63.39067077636719, + "learning_rate": 4.581000000000001e-06, + "loss": 0.5592, + "step": 27110 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 8.494537353515625, + "learning_rate": 4.579e-06, + "loss": 0.3698, + "step": 27120 + }, + { + "epoch": 2.713, + "grad_norm": 41.40669250488281, + "learning_rate": 4.577e-06, + "loss": 0.6026, + "step": 27130 + }, + { + "epoch": 2.714, + "grad_norm": 20.117542266845703, + "learning_rate": 4.575e-06, + "loss": 0.4716, + "step": 27140 + }, + { + "epoch": 2.715, + "grad_norm": 42.48599624633789, + "learning_rate": 4.573e-06, + "loss": 0.7185, + "step": 27150 + }, + { + "epoch": 2.716, + "grad_norm": 75.84381866455078, + "learning_rate": 4.571000000000001e-06, + "loss": 0.8573, + "step": 27160 + }, + { + "epoch": 2.717, + "grad_norm": 40.775611877441406, + "learning_rate": 4.569e-06, + "loss": 0.6571, + "step": 27170 + }, + { + "epoch": 2.718, + "grad_norm": 16.275890350341797, + "learning_rate": 4.5670000000000005e-06, + "loss": 0.3679, + "step": 27180 + }, + { + "epoch": 2.719, + "grad_norm": 23.729806900024414, + "learning_rate": 4.565e-06, + "loss": 0.5482, + "step": 27190 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 8.010773658752441, + "learning_rate": 4.563e-06, + "loss": 0.5101, + "step": 27200 + }, + { + "epoch": 2.721, + "grad_norm": 19.037765502929688, + "learning_rate": 4.561e-06, + "loss": 0.6863, + "step": 27210 + }, + { + "epoch": 2.722, + "grad_norm": 26.438081741333008, + "learning_rate": 4.559000000000001e-06, + "loss": 0.866, + "step": 27220 + }, + { + "epoch": 2.723, + "grad_norm": 13.23498821258545, + "learning_rate": 4.557000000000001e-06, + "loss": 0.6461, + "step": 27230 + }, + { + "epoch": 2.724, + "grad_norm": 13.73941421508789, + "learning_rate": 4.5550000000000004e-06, + "loss": 0.7792, + "step": 27240 + }, + { + "epoch": 2.725, + "grad_norm": 30.74558448791504, + "learning_rate": 4.553e-06, + "loss": 0.5246, + "step": 27250 + }, + { + "epoch": 2.726, + "grad_norm": 34.99910354614258, + "learning_rate": 4.551e-06, + "loss": 0.6406, + "step": 27260 + }, + { + "epoch": 2.727, + "grad_norm": 31.215740203857422, + "learning_rate": 4.549000000000001e-06, + "loss": 0.778, + "step": 27270 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 10.44588851928711, + "learning_rate": 4.547000000000001e-06, + "loss": 0.6892, + "step": 27280 + }, + { + "epoch": 2.729, + "grad_norm": 13.254731178283691, + "learning_rate": 4.5450000000000005e-06, + "loss": 0.7861, + "step": 27290 + }, + { + "epoch": 2.73, + "grad_norm": 29.335412979125977, + "learning_rate": 4.543e-06, + "loss": 0.6654, + "step": 27300 + }, + { + "epoch": 2.731, + "grad_norm": 26.643966674804688, + "learning_rate": 4.541e-06, + "loss": 0.7136, + "step": 27310 + }, + { + "epoch": 2.732, + "grad_norm": 5.667937755584717, + "learning_rate": 4.539000000000001e-06, + "loss": 0.4284, + "step": 27320 + }, + { + "epoch": 2.733, + "grad_norm": 40.35080337524414, + "learning_rate": 4.537e-06, + "loss": 0.7426, + "step": 27330 + }, + { + "epoch": 2.734, + "grad_norm": 39.766109466552734, + "learning_rate": 4.535000000000001e-06, + "loss": 0.8392, + "step": 27340 + }, + { + "epoch": 2.735, + "grad_norm": 17.853336334228516, + "learning_rate": 4.5330000000000005e-06, + "loss": 0.5092, + "step": 27350 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 34.687530517578125, + "learning_rate": 4.531e-06, + "loss": 0.4737, + "step": 27360 + }, + { + "epoch": 2.737, + "grad_norm": 33.80595016479492, + "learning_rate": 4.529000000000001e-06, + "loss": 0.7412, + "step": 27370 + }, + { + "epoch": 2.738, + "grad_norm": 45.17950439453125, + "learning_rate": 4.527e-06, + "loss": 0.5256, + "step": 27380 + }, + { + "epoch": 2.739, + "grad_norm": 38.902565002441406, + "learning_rate": 4.525000000000001e-06, + "loss": 0.7947, + "step": 27390 + }, + { + "epoch": 2.74, + "grad_norm": 40.43433380126953, + "learning_rate": 4.5230000000000006e-06, + "loss": 0.7601, + "step": 27400 + }, + { + "epoch": 2.741, + "grad_norm": 48.44124984741211, + "learning_rate": 4.521e-06, + "loss": 0.688, + "step": 27410 + }, + { + "epoch": 2.742, + "grad_norm": 37.88941955566406, + "learning_rate": 4.519e-06, + "loss": 0.7972, + "step": 27420 + }, + { + "epoch": 2.743, + "grad_norm": 34.65846252441406, + "learning_rate": 4.517e-06, + "loss": 0.7524, + "step": 27430 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 36.840789794921875, + "learning_rate": 4.515000000000001e-06, + "loss": 0.4491, + "step": 27440 + }, + { + "epoch": 2.745, + "grad_norm": 14.328316688537598, + "learning_rate": 4.513e-06, + "loss": 0.5018, + "step": 27450 + }, + { + "epoch": 2.746, + "grad_norm": 18.487655639648438, + "learning_rate": 4.5110000000000005e-06, + "loss": 0.7067, + "step": 27460 + }, + { + "epoch": 2.747, + "grad_norm": 5.489202499389648, + "learning_rate": 4.509e-06, + "loss": 0.5449, + "step": 27470 + }, + { + "epoch": 2.748, + "grad_norm": 29.96932601928711, + "learning_rate": 4.507e-06, + "loss": 0.5671, + "step": 27480 + }, + { + "epoch": 2.749, + "grad_norm": 23.164758682250977, + "learning_rate": 4.505e-06, + "loss": 0.7109, + "step": 27490 + }, + { + "epoch": 2.75, + "grad_norm": 37.39947509765625, + "learning_rate": 4.503e-06, + "loss": 0.4134, + "step": 27500 + }, + { + "epoch": 2.751, + "grad_norm": 9.628229141235352, + "learning_rate": 4.501000000000001e-06, + "loss": 0.5925, + "step": 27510 + }, + { + "epoch": 2.752, + "grad_norm": 23.049732208251953, + "learning_rate": 4.4990000000000005e-06, + "loss": 0.4716, + "step": 27520 + }, + { + "epoch": 2.753, + "grad_norm": 27.242902755737305, + "learning_rate": 4.497e-06, + "loss": 0.5761, + "step": 27530 + }, + { + "epoch": 2.754, + "grad_norm": 22.390472412109375, + "learning_rate": 4.495e-06, + "loss": 0.6094, + "step": 27540 + }, + { + "epoch": 2.755, + "grad_norm": 33.42397689819336, + "learning_rate": 4.493e-06, + "loss": 0.5837, + "step": 27550 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 33.163333892822266, + "learning_rate": 4.491000000000001e-06, + "loss": 0.5067, + "step": 27560 + }, + { + "epoch": 2.757, + "grad_norm": 14.397940635681152, + "learning_rate": 4.4890000000000006e-06, + "loss": 0.6467, + "step": 27570 + }, + { + "epoch": 2.758, + "grad_norm": 34.04242706298828, + "learning_rate": 4.487e-06, + "loss": 0.6445, + "step": 27580 + }, + { + "epoch": 2.759, + "grad_norm": 15.349383354187012, + "learning_rate": 4.485e-06, + "loss": 0.5121, + "step": 27590 + }, + { + "epoch": 2.76, + "grad_norm": 5.300539970397949, + "learning_rate": 4.483e-06, + "loss": 0.4665, + "step": 27600 + }, + { + "epoch": 2.761, + "grad_norm": 39.58720016479492, + "learning_rate": 4.481e-06, + "loss": 0.66, + "step": 27610 + }, + { + "epoch": 2.762, + "grad_norm": 26.696575164794922, + "learning_rate": 4.479000000000001e-06, + "loss": 0.8141, + "step": 27620 + }, + { + "epoch": 2.763, + "grad_norm": 43.283931732177734, + "learning_rate": 4.4770000000000005e-06, + "loss": 0.6543, + "step": 27630 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 45.54264831542969, + "learning_rate": 4.475e-06, + "loss": 0.8184, + "step": 27640 + }, + { + "epoch": 2.765, + "grad_norm": 33.091976165771484, + "learning_rate": 4.473e-06, + "loss": 0.5188, + "step": 27650 + }, + { + "epoch": 2.766, + "grad_norm": 21.730209350585938, + "learning_rate": 4.471e-06, + "loss": 0.4846, + "step": 27660 + }, + { + "epoch": 2.767, + "grad_norm": 4.529099464416504, + "learning_rate": 4.469000000000001e-06, + "loss": 1.1101, + "step": 27670 + }, + { + "epoch": 2.768, + "grad_norm": 27.221635818481445, + "learning_rate": 4.467000000000001e-06, + "loss": 0.7868, + "step": 27680 + }, + { + "epoch": 2.769, + "grad_norm": 30.304941177368164, + "learning_rate": 4.4650000000000004e-06, + "loss": 0.5997, + "step": 27690 + }, + { + "epoch": 2.77, + "grad_norm": 35.67657470703125, + "learning_rate": 4.463e-06, + "loss": 0.6457, + "step": 27700 + }, + { + "epoch": 2.771, + "grad_norm": 50.44115447998047, + "learning_rate": 4.461e-06, + "loss": 0.6635, + "step": 27710 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 19.699462890625, + "learning_rate": 4.459000000000001e-06, + "loss": 0.4893, + "step": 27720 + }, + { + "epoch": 2.773, + "grad_norm": 23.55537986755371, + "learning_rate": 4.457e-06, + "loss": 0.6317, + "step": 27730 + }, + { + "epoch": 2.774, + "grad_norm": 81.08770751953125, + "learning_rate": 4.4550000000000005e-06, + "loss": 0.7006, + "step": 27740 + }, + { + "epoch": 2.775, + "grad_norm": 20.223026275634766, + "learning_rate": 4.453e-06, + "loss": 0.5312, + "step": 27750 + }, + { + "epoch": 2.776, + "grad_norm": 46.003047943115234, + "learning_rate": 4.451e-06, + "loss": 0.7338, + "step": 27760 + }, + { + "epoch": 2.777, + "grad_norm": 7.982237339019775, + "learning_rate": 4.449000000000001e-06, + "loss": 0.4504, + "step": 27770 + }, + { + "epoch": 2.778, + "grad_norm": 44.6984977722168, + "learning_rate": 4.447e-06, + "loss": 0.8292, + "step": 27780 + }, + { + "epoch": 2.779, + "grad_norm": 17.92458724975586, + "learning_rate": 4.445000000000001e-06, + "loss": 0.5918, + "step": 27790 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 5.827061176300049, + "learning_rate": 4.4430000000000005e-06, + "loss": 0.549, + "step": 27800 + }, + { + "epoch": 2.781, + "grad_norm": 35.08589553833008, + "learning_rate": 4.441e-06, + "loss": 0.4181, + "step": 27810 + }, + { + "epoch": 2.782, + "grad_norm": 38.79449462890625, + "learning_rate": 4.439e-06, + "loss": 0.5933, + "step": 27820 + }, + { + "epoch": 2.783, + "grad_norm": 3.3199753761291504, + "learning_rate": 4.437e-06, + "loss": 0.4137, + "step": 27830 + }, + { + "epoch": 2.784, + "grad_norm": 30.860759735107422, + "learning_rate": 4.435000000000001e-06, + "loss": 0.5869, + "step": 27840 + }, + { + "epoch": 2.785, + "grad_norm": 37.540809631347656, + "learning_rate": 4.433000000000001e-06, + "loss": 0.9034, + "step": 27850 + }, + { + "epoch": 2.786, + "grad_norm": 7.0977396965026855, + "learning_rate": 4.4310000000000004e-06, + "loss": 0.7018, + "step": 27860 + }, + { + "epoch": 2.787, + "grad_norm": 25.47239875793457, + "learning_rate": 4.429e-06, + "loss": 0.7053, + "step": 27870 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 32.19316864013672, + "learning_rate": 4.427e-06, + "loss": 0.6039, + "step": 27880 + }, + { + "epoch": 2.789, + "grad_norm": 46.36806869506836, + "learning_rate": 4.425e-06, + "loss": 0.6423, + "step": 27890 + }, + { + "epoch": 2.79, + "grad_norm": 14.628954887390137, + "learning_rate": 4.423000000000001e-06, + "loss": 0.5092, + "step": 27900 + }, + { + "epoch": 2.791, + "grad_norm": 39.353546142578125, + "learning_rate": 4.4210000000000005e-06, + "loss": 0.5355, + "step": 27910 + }, + { + "epoch": 2.792, + "grad_norm": 28.54851531982422, + "learning_rate": 4.419e-06, + "loss": 0.5379, + "step": 27920 + }, + { + "epoch": 2.793, + "grad_norm": 24.745616912841797, + "learning_rate": 4.417e-06, + "loss": 0.695, + "step": 27930 + }, + { + "epoch": 2.794, + "grad_norm": 15.356302261352539, + "learning_rate": 4.415e-06, + "loss": 0.5868, + "step": 27940 + }, + { + "epoch": 2.795, + "grad_norm": 56.78498458862305, + "learning_rate": 4.413000000000001e-06, + "loss": 0.5065, + "step": 27950 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 27.250751495361328, + "learning_rate": 4.411000000000001e-06, + "loss": 0.6082, + "step": 27960 + }, + { + "epoch": 2.797, + "grad_norm": 14.399949073791504, + "learning_rate": 4.4090000000000005e-06, + "loss": 0.4962, + "step": 27970 + }, + { + "epoch": 2.798, + "grad_norm": 15.190604209899902, + "learning_rate": 4.407e-06, + "loss": 0.5516, + "step": 27980 + }, + { + "epoch": 2.799, + "grad_norm": 40.881858825683594, + "learning_rate": 4.405e-06, + "loss": 0.914, + "step": 27990 + }, + { + "epoch": 2.8, + "grad_norm": 11.181133270263672, + "learning_rate": 4.403000000000001e-06, + "loss": 1.1243, + "step": 28000 + }, + { + "epoch": 2.801, + "grad_norm": 25.786142349243164, + "learning_rate": 4.401e-06, + "loss": 0.4991, + "step": 28010 + }, + { + "epoch": 2.802, + "grad_norm": 42.99445724487305, + "learning_rate": 4.3990000000000006e-06, + "loss": 0.5364, + "step": 28020 + }, + { + "epoch": 2.803, + "grad_norm": 49.733707427978516, + "learning_rate": 4.397e-06, + "loss": 0.773, + "step": 28030 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 44.46137237548828, + "learning_rate": 4.395e-06, + "loss": 0.5945, + "step": 28040 + }, + { + "epoch": 2.805, + "grad_norm": 59.004329681396484, + "learning_rate": 4.393000000000001e-06, + "loss": 0.6318, + "step": 28050 + }, + { + "epoch": 2.806, + "grad_norm": 32.86210632324219, + "learning_rate": 4.391e-06, + "loss": 0.5343, + "step": 28060 + }, + { + "epoch": 2.807, + "grad_norm": 15.308101654052734, + "learning_rate": 4.389000000000001e-06, + "loss": 0.6305, + "step": 28070 + }, + { + "epoch": 2.808, + "grad_norm": 37.713905334472656, + "learning_rate": 4.3870000000000005e-06, + "loss": 0.4288, + "step": 28080 + }, + { + "epoch": 2.809, + "grad_norm": 22.348554611206055, + "learning_rate": 4.385e-06, + "loss": 0.4645, + "step": 28090 + }, + { + "epoch": 2.81, + "grad_norm": 36.5361328125, + "learning_rate": 4.383e-06, + "loss": 1.1116, + "step": 28100 + }, + { + "epoch": 2.811, + "grad_norm": 39.90386962890625, + "learning_rate": 4.381e-06, + "loss": 0.6124, + "step": 28110 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 49.790897369384766, + "learning_rate": 4.379000000000001e-06, + "loss": 0.7894, + "step": 28120 + }, + { + "epoch": 2.8129999999999997, + "grad_norm": 64.48717498779297, + "learning_rate": 4.377e-06, + "loss": 0.6046, + "step": 28130 + }, + { + "epoch": 2.814, + "grad_norm": 12.735812187194824, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.8096, + "step": 28140 + }, + { + "epoch": 2.815, + "grad_norm": 41.18169403076172, + "learning_rate": 4.373e-06, + "loss": 0.455, + "step": 28150 + }, + { + "epoch": 2.816, + "grad_norm": 16.28331184387207, + "learning_rate": 4.371e-06, + "loss": 0.5784, + "step": 28160 + }, + { + "epoch": 2.817, + "grad_norm": 39.03124237060547, + "learning_rate": 4.369000000000001e-06, + "loss": 0.6922, + "step": 28170 + }, + { + "epoch": 2.818, + "grad_norm": 18.438169479370117, + "learning_rate": 4.367e-06, + "loss": 0.5654, + "step": 28180 + }, + { + "epoch": 2.819, + "grad_norm": 18.553783416748047, + "learning_rate": 4.3650000000000006e-06, + "loss": 0.7399, + "step": 28190 + }, + { + "epoch": 2.82, + "grad_norm": 28.252033233642578, + "learning_rate": 4.363e-06, + "loss": 0.663, + "step": 28200 + }, + { + "epoch": 2.8209999999999997, + "grad_norm": 14.199079513549805, + "learning_rate": 4.361e-06, + "loss": 0.4705, + "step": 28210 + }, + { + "epoch": 2.822, + "grad_norm": 25.68846321105957, + "learning_rate": 4.359e-06, + "loss": 0.5264, + "step": 28220 + }, + { + "epoch": 2.823, + "grad_norm": 55.090919494628906, + "learning_rate": 4.357e-06, + "loss": 0.5085, + "step": 28230 + }, + { + "epoch": 2.824, + "grad_norm": 45.92644500732422, + "learning_rate": 4.355000000000001e-06, + "loss": 1.0086, + "step": 28240 + }, + { + "epoch": 2.825, + "grad_norm": 42.4690055847168, + "learning_rate": 4.3530000000000005e-06, + "loss": 0.9663, + "step": 28250 + }, + { + "epoch": 2.826, + "grad_norm": 43.58584976196289, + "learning_rate": 4.351e-06, + "loss": 0.8671, + "step": 28260 + }, + { + "epoch": 2.827, + "grad_norm": 38.93071365356445, + "learning_rate": 4.349e-06, + "loss": 0.5779, + "step": 28270 + }, + { + "epoch": 2.828, + "grad_norm": 31.59266471862793, + "learning_rate": 4.347e-06, + "loss": 0.6636, + "step": 28280 + }, + { + "epoch": 2.8289999999999997, + "grad_norm": 41.18907928466797, + "learning_rate": 4.345000000000001e-06, + "loss": 0.7013, + "step": 28290 + }, + { + "epoch": 2.83, + "grad_norm": 10.073530197143555, + "learning_rate": 4.343000000000001e-06, + "loss": 0.6073, + "step": 28300 + }, + { + "epoch": 2.831, + "grad_norm": 20.655839920043945, + "learning_rate": 4.3410000000000005e-06, + "loss": 0.3729, + "step": 28310 + }, + { + "epoch": 2.832, + "grad_norm": 18.784809112548828, + "learning_rate": 4.339e-06, + "loss": 0.7303, + "step": 28320 + }, + { + "epoch": 2.833, + "grad_norm": 57.55241775512695, + "learning_rate": 4.337e-06, + "loss": 0.9426, + "step": 28330 + }, + { + "epoch": 2.834, + "grad_norm": 33.25703811645508, + "learning_rate": 4.335e-06, + "loss": 0.8751, + "step": 28340 + }, + { + "epoch": 2.835, + "grad_norm": 8.003162384033203, + "learning_rate": 4.333000000000001e-06, + "loss": 0.57, + "step": 28350 + }, + { + "epoch": 2.836, + "grad_norm": 77.80805969238281, + "learning_rate": 4.3310000000000005e-06, + "loss": 0.5509, + "step": 28360 + }, + { + "epoch": 2.8369999999999997, + "grad_norm": 10.62796401977539, + "learning_rate": 4.329e-06, + "loss": 0.495, + "step": 28370 + }, + { + "epoch": 2.838, + "grad_norm": 32.467105865478516, + "learning_rate": 4.327e-06, + "loss": 0.4321, + "step": 28380 + }, + { + "epoch": 2.839, + "grad_norm": 22.827064514160156, + "learning_rate": 4.325e-06, + "loss": 0.5089, + "step": 28390 + }, + { + "epoch": 2.84, + "grad_norm": 31.53234100341797, + "learning_rate": 4.323000000000001e-06, + "loss": 0.6055, + "step": 28400 + }, + { + "epoch": 2.841, + "grad_norm": 7.039924144744873, + "learning_rate": 4.321e-06, + "loss": 0.6939, + "step": 28410 + }, + { + "epoch": 2.842, + "grad_norm": 14.15356731414795, + "learning_rate": 4.3190000000000005e-06, + "loss": 0.7508, + "step": 28420 + }, + { + "epoch": 2.843, + "grad_norm": 59.17403030395508, + "learning_rate": 4.317e-06, + "loss": 0.5693, + "step": 28430 + }, + { + "epoch": 2.844, + "grad_norm": 8.260176658630371, + "learning_rate": 4.315e-06, + "loss": 0.4824, + "step": 28440 + }, + { + "epoch": 2.8449999999999998, + "grad_norm": 24.74396324157715, + "learning_rate": 4.313000000000001e-06, + "loss": 0.5549, + "step": 28450 + }, + { + "epoch": 2.846, + "grad_norm": 30.768203735351562, + "learning_rate": 4.311e-06, + "loss": 0.6408, + "step": 28460 + }, + { + "epoch": 2.847, + "grad_norm": 37.09162139892578, + "learning_rate": 4.309000000000001e-06, + "loss": 0.6981, + "step": 28470 + }, + { + "epoch": 2.848, + "grad_norm": 21.815000534057617, + "learning_rate": 4.3070000000000004e-06, + "loss": 0.607, + "step": 28480 + }, + { + "epoch": 2.849, + "grad_norm": 43.1075553894043, + "learning_rate": 4.305e-06, + "loss": 0.6963, + "step": 28490 + }, + { + "epoch": 2.85, + "grad_norm": 61.59611129760742, + "learning_rate": 4.303e-06, + "loss": 0.661, + "step": 28500 + }, + { + "epoch": 2.851, + "grad_norm": 18.589096069335938, + "learning_rate": 4.301e-06, + "loss": 0.7131, + "step": 28510 + }, + { + "epoch": 2.852, + "grad_norm": 5.096174716949463, + "learning_rate": 4.299000000000001e-06, + "loss": 0.4882, + "step": 28520 + }, + { + "epoch": 2.8529999999999998, + "grad_norm": 20.526193618774414, + "learning_rate": 4.2970000000000005e-06, + "loss": 0.681, + "step": 28530 + }, + { + "epoch": 2.854, + "grad_norm": 26.11942481994629, + "learning_rate": 4.295e-06, + "loss": 0.7666, + "step": 28540 + }, + { + "epoch": 2.855, + "grad_norm": 26.176834106445312, + "learning_rate": 4.293e-06, + "loss": 0.5955, + "step": 28550 + }, + { + "epoch": 2.856, + "grad_norm": 27.885841369628906, + "learning_rate": 4.291e-06, + "loss": 0.6494, + "step": 28560 + }, + { + "epoch": 2.857, + "grad_norm": 33.89963150024414, + "learning_rate": 4.289000000000001e-06, + "loss": 0.2812, + "step": 28570 + }, + { + "epoch": 2.858, + "grad_norm": 41.28437805175781, + "learning_rate": 4.287000000000001e-06, + "loss": 0.7791, + "step": 28580 + }, + { + "epoch": 2.859, + "grad_norm": 31.66999053955078, + "learning_rate": 4.2850000000000005e-06, + "loss": 0.6356, + "step": 28590 + }, + { + "epoch": 2.86, + "grad_norm": 38.8473014831543, + "learning_rate": 4.283e-06, + "loss": 0.6462, + "step": 28600 + }, + { + "epoch": 2.8609999999999998, + "grad_norm": 38.425048828125, + "learning_rate": 4.281e-06, + "loss": 0.5853, + "step": 28610 + }, + { + "epoch": 2.862, + "grad_norm": 38.103275299072266, + "learning_rate": 4.279e-06, + "loss": 0.4751, + "step": 28620 + }, + { + "epoch": 2.863, + "grad_norm": 44.24375915527344, + "learning_rate": 4.277000000000001e-06, + "loss": 0.7975, + "step": 28630 + }, + { + "epoch": 2.864, + "grad_norm": 27.316816329956055, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.7389, + "step": 28640 + }, + { + "epoch": 2.865, + "grad_norm": 76.10896301269531, + "learning_rate": 4.2730000000000004e-06, + "loss": 0.3966, + "step": 28650 + }, + { + "epoch": 2.866, + "grad_norm": 47.929649353027344, + "learning_rate": 4.271e-06, + "loss": 0.62, + "step": 28660 + }, + { + "epoch": 2.867, + "grad_norm": 52.1750602722168, + "learning_rate": 4.269e-06, + "loss": 0.7978, + "step": 28670 + }, + { + "epoch": 2.868, + "grad_norm": 52.91067886352539, + "learning_rate": 4.267000000000001e-06, + "loss": 0.6759, + "step": 28680 + }, + { + "epoch": 2.8689999999999998, + "grad_norm": 30.69292449951172, + "learning_rate": 4.265000000000001e-06, + "loss": 0.5969, + "step": 28690 + }, + { + "epoch": 2.87, + "grad_norm": 45.15058517456055, + "learning_rate": 4.2630000000000005e-06, + "loss": 0.422, + "step": 28700 + }, + { + "epoch": 2.871, + "grad_norm": 9.530531883239746, + "learning_rate": 4.261e-06, + "loss": 0.5713, + "step": 28710 + }, + { + "epoch": 2.872, + "grad_norm": 15.413444519042969, + "learning_rate": 4.259e-06, + "loss": 0.3489, + "step": 28720 + }, + { + "epoch": 2.873, + "grad_norm": 33.19585418701172, + "learning_rate": 4.257000000000001e-06, + "loss": 0.6475, + "step": 28730 + }, + { + "epoch": 2.874, + "grad_norm": 31.359365463256836, + "learning_rate": 4.255e-06, + "loss": 0.5289, + "step": 28740 + }, + { + "epoch": 2.875, + "grad_norm": 37.34211349487305, + "learning_rate": 4.253000000000001e-06, + "loss": 0.6632, + "step": 28750 + }, + { + "epoch": 2.876, + "grad_norm": 46.908180236816406, + "learning_rate": 4.2510000000000005e-06, + "loss": 0.6369, + "step": 28760 + }, + { + "epoch": 2.877, + "grad_norm": 33.455013275146484, + "learning_rate": 4.249e-06, + "loss": 0.4806, + "step": 28770 + }, + { + "epoch": 2.878, + "grad_norm": 38.421104431152344, + "learning_rate": 4.247e-06, + "loss": 0.5913, + "step": 28780 + }, + { + "epoch": 2.879, + "grad_norm": 27.120136260986328, + "learning_rate": 4.245e-06, + "loss": 0.5004, + "step": 28790 + }, + { + "epoch": 2.88, + "grad_norm": 54.891231536865234, + "learning_rate": 4.243000000000001e-06, + "loss": 0.6929, + "step": 28800 + }, + { + "epoch": 2.8810000000000002, + "grad_norm": 27.580013275146484, + "learning_rate": 4.241e-06, + "loss": 0.7942, + "step": 28810 + }, + { + "epoch": 2.882, + "grad_norm": 34.77034378051758, + "learning_rate": 4.239e-06, + "loss": 0.7068, + "step": 28820 + }, + { + "epoch": 2.883, + "grad_norm": 17.92949104309082, + "learning_rate": 4.237e-06, + "loss": 0.5559, + "step": 28830 + }, + { + "epoch": 2.884, + "grad_norm": 30.56000328063965, + "learning_rate": 4.235e-06, + "loss": 0.5818, + "step": 28840 + }, + { + "epoch": 2.885, + "grad_norm": 57.818504333496094, + "learning_rate": 4.233000000000001e-06, + "loss": 0.7016, + "step": 28850 + }, + { + "epoch": 2.886, + "grad_norm": 31.524658203125, + "learning_rate": 4.231e-06, + "loss": 0.5322, + "step": 28860 + }, + { + "epoch": 2.887, + "grad_norm": 27.571067810058594, + "learning_rate": 4.2290000000000005e-06, + "loss": 0.6619, + "step": 28870 + }, + { + "epoch": 2.888, + "grad_norm": 6.408551216125488, + "learning_rate": 4.227e-06, + "loss": 0.6884, + "step": 28880 + }, + { + "epoch": 2.8890000000000002, + "grad_norm": 51.78606033325195, + "learning_rate": 4.225e-06, + "loss": 0.4897, + "step": 28890 + }, + { + "epoch": 2.89, + "grad_norm": 12.073798179626465, + "learning_rate": 4.223e-06, + "loss": 0.6119, + "step": 28900 + }, + { + "epoch": 2.891, + "grad_norm": 127.30245208740234, + "learning_rate": 4.221e-06, + "loss": 0.5858, + "step": 28910 + }, + { + "epoch": 2.892, + "grad_norm": 5.639313220977783, + "learning_rate": 4.219000000000001e-06, + "loss": 0.4539, + "step": 28920 + }, + { + "epoch": 2.893, + "grad_norm": 3.212557077407837, + "learning_rate": 4.2170000000000005e-06, + "loss": 0.5321, + "step": 28930 + }, + { + "epoch": 2.894, + "grad_norm": 34.34654235839844, + "learning_rate": 4.215e-06, + "loss": 0.5977, + "step": 28940 + }, + { + "epoch": 2.895, + "grad_norm": 25.9399471282959, + "learning_rate": 4.213e-06, + "loss": 0.75, + "step": 28950 + }, + { + "epoch": 2.896, + "grad_norm": 70.75745391845703, + "learning_rate": 4.211e-06, + "loss": 0.5391, + "step": 28960 + }, + { + "epoch": 2.8970000000000002, + "grad_norm": 35.240882873535156, + "learning_rate": 4.209000000000001e-06, + "loss": 0.619, + "step": 28970 + }, + { + "epoch": 2.898, + "grad_norm": 43.350284576416016, + "learning_rate": 4.2070000000000005e-06, + "loss": 0.6964, + "step": 28980 + }, + { + "epoch": 2.899, + "grad_norm": 27.253419876098633, + "learning_rate": 4.205e-06, + "loss": 0.6312, + "step": 28990 + }, + { + "epoch": 2.9, + "grad_norm": 22.365201950073242, + "learning_rate": 4.203e-06, + "loss": 0.7312, + "step": 29000 + }, + { + "epoch": 2.901, + "grad_norm": 33.48457336425781, + "learning_rate": 4.201e-06, + "loss": 0.7824, + "step": 29010 + }, + { + "epoch": 2.902, + "grad_norm": 20.902122497558594, + "learning_rate": 4.199e-06, + "loss": 0.568, + "step": 29020 + }, + { + "epoch": 2.903, + "grad_norm": 34.6548957824707, + "learning_rate": 4.197000000000001e-06, + "loss": 0.3214, + "step": 29030 + }, + { + "epoch": 2.904, + "grad_norm": 13.812738418579102, + "learning_rate": 4.1950000000000005e-06, + "loss": 0.5746, + "step": 29040 + }, + { + "epoch": 2.9050000000000002, + "grad_norm": 15.632746696472168, + "learning_rate": 4.193e-06, + "loss": 0.5344, + "step": 29050 + }, + { + "epoch": 2.906, + "grad_norm": 5.410891532897949, + "learning_rate": 4.191e-06, + "loss": 0.5319, + "step": 29060 + }, + { + "epoch": 2.907, + "grad_norm": 18.57172966003418, + "learning_rate": 4.189e-06, + "loss": 0.6795, + "step": 29070 + }, + { + "epoch": 2.908, + "grad_norm": 38.506141662597656, + "learning_rate": 4.187000000000001e-06, + "loss": 0.8386, + "step": 29080 + }, + { + "epoch": 2.909, + "grad_norm": 62.42726135253906, + "learning_rate": 4.185000000000001e-06, + "loss": 0.4449, + "step": 29090 + }, + { + "epoch": 2.91, + "grad_norm": Infinity, + "learning_rate": 4.183200000000001e-06, + "loss": 0.7186, + "step": 29100 + }, + { + "epoch": 2.911, + "grad_norm": 42.34231948852539, + "learning_rate": 4.1812e-06, + "loss": 0.5659, + "step": 29110 + }, + { + "epoch": 2.912, + "grad_norm": 40.03559875488281, + "learning_rate": 4.179200000000001e-06, + "loss": 0.5508, + "step": 29120 + }, + { + "epoch": 2.9130000000000003, + "grad_norm": 49.353240966796875, + "learning_rate": 4.1772000000000004e-06, + "loss": 0.68, + "step": 29130 + }, + { + "epoch": 2.914, + "grad_norm": 38.63571548461914, + "learning_rate": 4.1752e-06, + "loss": 0.7239, + "step": 29140 + }, + { + "epoch": 2.915, + "grad_norm": 7.519901275634766, + "learning_rate": 4.173200000000001e-06, + "loss": 0.4593, + "step": 29150 + }, + { + "epoch": 2.916, + "grad_norm": 32.73497772216797, + "learning_rate": 4.1712e-06, + "loss": 0.4446, + "step": 29160 + }, + { + "epoch": 2.917, + "grad_norm": 35.581703186035156, + "learning_rate": 4.169200000000001e-06, + "loss": 0.2908, + "step": 29170 + }, + { + "epoch": 2.918, + "grad_norm": 20.578540802001953, + "learning_rate": 4.1672000000000005e-06, + "loss": 0.7993, + "step": 29180 + }, + { + "epoch": 2.919, + "grad_norm": 22.452545166015625, + "learning_rate": 4.1652e-06, + "loss": 0.5683, + "step": 29190 + }, + { + "epoch": 2.92, + "grad_norm": 19.33148765563965, + "learning_rate": 4.1632e-06, + "loss": 0.4966, + "step": 29200 + }, + { + "epoch": 2.9210000000000003, + "grad_norm": 20.982643127441406, + "learning_rate": 4.1612e-06, + "loss": 0.6517, + "step": 29210 + }, + { + "epoch": 2.922, + "grad_norm": 45.04250717163086, + "learning_rate": 4.159200000000001e-06, + "loss": 0.6315, + "step": 29220 + }, + { + "epoch": 2.923, + "grad_norm": 40.241294860839844, + "learning_rate": 4.1572e-06, + "loss": 0.7002, + "step": 29230 + }, + { + "epoch": 2.924, + "grad_norm": 31.673635482788086, + "learning_rate": 4.1552000000000005e-06, + "loss": 0.7658, + "step": 29240 + }, + { + "epoch": 2.925, + "grad_norm": 29.901514053344727, + "learning_rate": 4.1532e-06, + "loss": 0.8841, + "step": 29250 + }, + { + "epoch": 2.926, + "grad_norm": 12.362701416015625, + "learning_rate": 4.1512e-06, + "loss": 0.5445, + "step": 29260 + }, + { + "epoch": 2.927, + "grad_norm": 36.973838806152344, + "learning_rate": 4.149200000000001e-06, + "loss": 0.5255, + "step": 29270 + }, + { + "epoch": 2.928, + "grad_norm": 1.602242112159729, + "learning_rate": 4.1472e-06, + "loss": 0.7783, + "step": 29280 + }, + { + "epoch": 2.9290000000000003, + "grad_norm": 33.756126403808594, + "learning_rate": 4.1452000000000006e-06, + "loss": 0.5491, + "step": 29290 + }, + { + "epoch": 2.93, + "grad_norm": 68.7015380859375, + "learning_rate": 4.1432e-06, + "loss": 0.7136, + "step": 29300 + }, + { + "epoch": 2.931, + "grad_norm": 27.283906936645508, + "learning_rate": 4.1412e-06, + "loss": 0.8519, + "step": 29310 + }, + { + "epoch": 2.932, + "grad_norm": 29.23966407775879, + "learning_rate": 4.1392e-06, + "loss": 0.736, + "step": 29320 + }, + { + "epoch": 2.933, + "grad_norm": 30.30097198486328, + "learning_rate": 4.1372e-06, + "loss": 0.4569, + "step": 29330 + }, + { + "epoch": 2.934, + "grad_norm": 60.6238899230957, + "learning_rate": 4.135200000000001e-06, + "loss": 0.6256, + "step": 29340 + }, + { + "epoch": 2.935, + "grad_norm": 48.743343353271484, + "learning_rate": 4.1332000000000005e-06, + "loss": 0.7367, + "step": 29350 + }, + { + "epoch": 2.936, + "grad_norm": 30.004465103149414, + "learning_rate": 4.1312e-06, + "loss": 1.0274, + "step": 29360 + }, + { + "epoch": 2.9370000000000003, + "grad_norm": 26.581308364868164, + "learning_rate": 4.1292e-06, + "loss": 0.5193, + "step": 29370 + }, + { + "epoch": 2.9379999999999997, + "grad_norm": 5.948087215423584, + "learning_rate": 4.1272e-06, + "loss": 0.44, + "step": 29380 + }, + { + "epoch": 2.939, + "grad_norm": 47.7027473449707, + "learning_rate": 4.1252e-06, + "loss": 0.7333, + "step": 29390 + }, + { + "epoch": 2.94, + "grad_norm": 39.193851470947266, + "learning_rate": 4.123200000000001e-06, + "loss": 0.6778, + "step": 29400 + }, + { + "epoch": 2.941, + "grad_norm": 27.26168441772461, + "learning_rate": 4.1212000000000005e-06, + "loss": 0.6049, + "step": 29410 + }, + { + "epoch": 2.942, + "grad_norm": 22.667373657226562, + "learning_rate": 4.1192e-06, + "loss": 0.5667, + "step": 29420 + }, + { + "epoch": 2.943, + "grad_norm": 33.34720993041992, + "learning_rate": 4.1172e-06, + "loss": 0.5668, + "step": 29430 + }, + { + "epoch": 2.944, + "grad_norm": 37.20878601074219, + "learning_rate": 4.1152e-06, + "loss": 0.916, + "step": 29440 + }, + { + "epoch": 2.945, + "grad_norm": 10.755010604858398, + "learning_rate": 4.113200000000001e-06, + "loss": 0.6121, + "step": 29450 + }, + { + "epoch": 2.9459999999999997, + "grad_norm": 13.625876426696777, + "learning_rate": 4.1112000000000006e-06, + "loss": 0.5307, + "step": 29460 + }, + { + "epoch": 2.947, + "grad_norm": 57.56064224243164, + "learning_rate": 4.1092e-06, + "loss": 0.4517, + "step": 29470 + }, + { + "epoch": 2.948, + "grad_norm": 27.347122192382812, + "learning_rate": 4.1072e-06, + "loss": 0.6075, + "step": 29480 + }, + { + "epoch": 2.949, + "grad_norm": 64.42729949951172, + "learning_rate": 4.1052e-06, + "loss": 0.7204, + "step": 29490 + }, + { + "epoch": 2.95, + "grad_norm": 42.93135452270508, + "learning_rate": 4.103200000000001e-06, + "loss": 0.586, + "step": 29500 + }, + { + "epoch": 2.951, + "grad_norm": 36.54902648925781, + "learning_rate": 4.1012e-06, + "loss": 0.7982, + "step": 29510 + }, + { + "epoch": 2.952, + "grad_norm": 25.70467758178711, + "learning_rate": 4.0992000000000005e-06, + "loss": 0.5134, + "step": 29520 + }, + { + "epoch": 2.953, + "grad_norm": 22.512723922729492, + "learning_rate": 4.0972e-06, + "loss": 0.8235, + "step": 29530 + }, + { + "epoch": 2.9539999999999997, + "grad_norm": 50.21841049194336, + "learning_rate": 4.0952e-06, + "loss": 0.6306, + "step": 29540 + }, + { + "epoch": 2.955, + "grad_norm": 17.58340835571289, + "learning_rate": 4.093200000000001e-06, + "loss": 0.5652, + "step": 29550 + }, + { + "epoch": 2.956, + "grad_norm": 19.082971572875977, + "learning_rate": 4.0912e-06, + "loss": 0.5947, + "step": 29560 + }, + { + "epoch": 2.957, + "grad_norm": 47.442440032958984, + "learning_rate": 4.089200000000001e-06, + "loss": 0.7297, + "step": 29570 + }, + { + "epoch": 2.958, + "grad_norm": 17.213855743408203, + "learning_rate": 4.0872000000000004e-06, + "loss": 0.4932, + "step": 29580 + }, + { + "epoch": 2.959, + "grad_norm": 40.99843978881836, + "learning_rate": 4.0852e-06, + "loss": 0.6037, + "step": 29590 + }, + { + "epoch": 2.96, + "grad_norm": 29.501218795776367, + "learning_rate": 4.0832e-06, + "loss": 0.5786, + "step": 29600 + }, + { + "epoch": 2.961, + "grad_norm": 28.239242553710938, + "learning_rate": 4.0812e-06, + "loss": 0.788, + "step": 29610 + }, + { + "epoch": 2.9619999999999997, + "grad_norm": 53.01205062866211, + "learning_rate": 4.079200000000001e-06, + "loss": 0.6026, + "step": 29620 + }, + { + "epoch": 2.963, + "grad_norm": 22.547651290893555, + "learning_rate": 4.0772000000000005e-06, + "loss": 0.6091, + "step": 29630 + }, + { + "epoch": 2.964, + "grad_norm": 45.12074279785156, + "learning_rate": 4.0752e-06, + "loss": 0.5451, + "step": 29640 + }, + { + "epoch": 2.965, + "grad_norm": 15.948826789855957, + "learning_rate": 4.0732e-06, + "loss": 0.3129, + "step": 29650 + }, + { + "epoch": 2.966, + "grad_norm": 8.485644340515137, + "learning_rate": 4.0712e-06, + "loss": 0.6199, + "step": 29660 + }, + { + "epoch": 2.967, + "grad_norm": 9.05318546295166, + "learning_rate": 4.069200000000001e-06, + "loss": 0.7764, + "step": 29670 + }, + { + "epoch": 2.968, + "grad_norm": 14.031304359436035, + "learning_rate": 4.067200000000001e-06, + "loss": 0.4388, + "step": 29680 + }, + { + "epoch": 2.969, + "grad_norm": 34.62077713012695, + "learning_rate": 4.0652000000000005e-06, + "loss": 0.6206, + "step": 29690 + }, + { + "epoch": 2.9699999999999998, + "grad_norm": 29.150148391723633, + "learning_rate": 4.0632e-06, + "loss": 0.4939, + "step": 29700 + }, + { + "epoch": 2.971, + "grad_norm": 29.395282745361328, + "learning_rate": 4.0612e-06, + "loss": 0.4836, + "step": 29710 + }, + { + "epoch": 2.972, + "grad_norm": 50.002437591552734, + "learning_rate": 4.0592e-06, + "loss": 0.5221, + "step": 29720 + }, + { + "epoch": 2.973, + "grad_norm": 21.285947799682617, + "learning_rate": 4.057200000000001e-06, + "loss": 0.6336, + "step": 29730 + }, + { + "epoch": 2.974, + "grad_norm": 18.822994232177734, + "learning_rate": 4.055200000000001e-06, + "loss": 0.6895, + "step": 29740 + }, + { + "epoch": 2.975, + "grad_norm": 59.74067306518555, + "learning_rate": 4.0532000000000004e-06, + "loss": 0.6264, + "step": 29750 + }, + { + "epoch": 2.976, + "grad_norm": 46.628448486328125, + "learning_rate": 4.0512e-06, + "loss": 0.5139, + "step": 29760 + }, + { + "epoch": 2.977, + "grad_norm": 4.769842147827148, + "learning_rate": 4.0492e-06, + "loss": 0.4523, + "step": 29770 + }, + { + "epoch": 2.9779999999999998, + "grad_norm": 42.81772994995117, + "learning_rate": 4.047200000000001e-06, + "loss": 0.5483, + "step": 29780 + }, + { + "epoch": 2.979, + "grad_norm": 37.86781692504883, + "learning_rate": 4.0452e-06, + "loss": 0.4806, + "step": 29790 + }, + { + "epoch": 2.98, + "grad_norm": 28.3906192779541, + "learning_rate": 4.0432000000000005e-06, + "loss": 0.7016, + "step": 29800 + }, + { + "epoch": 2.981, + "grad_norm": 9.775362014770508, + "learning_rate": 4.0412e-06, + "loss": 0.6332, + "step": 29810 + }, + { + "epoch": 2.982, + "grad_norm": 5.316655158996582, + "learning_rate": 4.0392e-06, + "loss": 0.6472, + "step": 29820 + }, + { + "epoch": 2.983, + "grad_norm": 48.42181396484375, + "learning_rate": 4.037200000000001e-06, + "loss": 0.6845, + "step": 29830 + }, + { + "epoch": 2.984, + "grad_norm": 33.597869873046875, + "learning_rate": 4.0352e-06, + "loss": 0.6101, + "step": 29840 + }, + { + "epoch": 2.985, + "grad_norm": 38.29161071777344, + "learning_rate": 4.033200000000001e-06, + "loss": 0.7222, + "step": 29850 + }, + { + "epoch": 2.9859999999999998, + "grad_norm": 33.69076919555664, + "learning_rate": 4.0312000000000005e-06, + "loss": 0.6477, + "step": 29860 + }, + { + "epoch": 2.987, + "grad_norm": 19.68893051147461, + "learning_rate": 4.0292e-06, + "loss": 0.4098, + "step": 29870 + }, + { + "epoch": 2.988, + "grad_norm": 46.982784271240234, + "learning_rate": 4.0272e-06, + "loss": 0.5056, + "step": 29880 + }, + { + "epoch": 2.989, + "grad_norm": 15.345257759094238, + "learning_rate": 4.0252e-06, + "loss": 0.8451, + "step": 29890 + }, + { + "epoch": 2.99, + "grad_norm": 34.063167572021484, + "learning_rate": 4.023200000000001e-06, + "loss": 1.02, + "step": 29900 + }, + { + "epoch": 2.991, + "grad_norm": 23.77176284790039, + "learning_rate": 4.0212e-06, + "loss": 0.8993, + "step": 29910 + }, + { + "epoch": 2.992, + "grad_norm": 7.723876476287842, + "learning_rate": 4.0192e-06, + "loss": 0.4682, + "step": 29920 + }, + { + "epoch": 2.993, + "grad_norm": 9.764384269714355, + "learning_rate": 4.0172e-06, + "loss": 0.4996, + "step": 29930 + }, + { + "epoch": 2.9939999999999998, + "grad_norm": 33.22534942626953, + "learning_rate": 4.0152e-06, + "loss": 0.6259, + "step": 29940 + }, + { + "epoch": 2.995, + "grad_norm": 17.53177833557129, + "learning_rate": 4.013200000000001e-06, + "loss": 0.5254, + "step": 29950 + }, + { + "epoch": 2.996, + "grad_norm": 53.74732971191406, + "learning_rate": 4.0112e-06, + "loss": 0.9515, + "step": 29960 + }, + { + "epoch": 2.997, + "grad_norm": 12.981660842895508, + "learning_rate": 4.0092000000000005e-06, + "loss": 0.5202, + "step": 29970 + }, + { + "epoch": 2.998, + "grad_norm": 46.99820327758789, + "learning_rate": 4.0072e-06, + "loss": 0.4536, + "step": 29980 + }, + { + "epoch": 2.999, + "grad_norm": 40.24980926513672, + "learning_rate": 4.0052e-06, + "loss": 0.7089, + "step": 29990 + }, + { + "epoch": 3.0, + "grad_norm": 5.793490409851074, + "learning_rate": 4.0032e-06, + "loss": 0.7395, + "step": 30000 + }, + { + "epoch": 3.001, + "grad_norm": 31.98954963684082, + "learning_rate": 4.0012e-06, + "loss": 0.4896, + "step": 30010 + }, + { + "epoch": 3.002, + "grad_norm": 52.853424072265625, + "learning_rate": 3.999200000000001e-06, + "loss": 0.3648, + "step": 30020 + }, + { + "epoch": 3.003, + "grad_norm": 49.65998077392578, + "learning_rate": 3.9972000000000005e-06, + "loss": 0.5501, + "step": 30030 + }, + { + "epoch": 3.004, + "grad_norm": 50.96385955810547, + "learning_rate": 3.9952e-06, + "loss": 0.6445, + "step": 30040 + }, + { + "epoch": 3.005, + "grad_norm": 36.44496154785156, + "learning_rate": 3.9932e-06, + "loss": 0.6699, + "step": 30050 + }, + { + "epoch": 3.006, + "grad_norm": 17.591075897216797, + "learning_rate": 3.9912e-06, + "loss": 0.7079, + "step": 30060 + }, + { + "epoch": 3.007, + "grad_norm": 78.94903564453125, + "learning_rate": 3.989200000000001e-06, + "loss": 0.6877, + "step": 30070 + }, + { + "epoch": 3.008, + "grad_norm": 51.7623291015625, + "learning_rate": 3.9872000000000006e-06, + "loss": 0.7178, + "step": 30080 + }, + { + "epoch": 3.009, + "grad_norm": 37.82719421386719, + "learning_rate": 3.9852e-06, + "loss": 0.7465, + "step": 30090 + }, + { + "epoch": 3.01, + "grad_norm": 36.30388641357422, + "learning_rate": 3.9832e-06, + "loss": 0.5996, + "step": 30100 + }, + { + "epoch": 3.011, + "grad_norm": 4.858558654785156, + "learning_rate": 3.9812e-06, + "loss": 0.5496, + "step": 30110 + }, + { + "epoch": 3.012, + "grad_norm": 34.84873962402344, + "learning_rate": 3.9792e-06, + "loss": 0.6838, + "step": 30120 + }, + { + "epoch": 3.013, + "grad_norm": 33.88477325439453, + "learning_rate": 3.977200000000001e-06, + "loss": 0.4307, + "step": 30130 + }, + { + "epoch": 3.014, + "grad_norm": 39.399967193603516, + "learning_rate": 3.9752000000000005e-06, + "loss": 0.6973, + "step": 30140 + }, + { + "epoch": 3.015, + "grad_norm": 20.011003494262695, + "learning_rate": 3.9732e-06, + "loss": 0.695, + "step": 30150 + }, + { + "epoch": 3.016, + "grad_norm": 28.336265563964844, + "learning_rate": 3.9712e-06, + "loss": 0.5254, + "step": 30160 + }, + { + "epoch": 3.017, + "grad_norm": 40.91427230834961, + "learning_rate": 3.9692e-06, + "loss": 0.4747, + "step": 30170 + }, + { + "epoch": 3.018, + "grad_norm": 17.782920837402344, + "learning_rate": 3.967200000000001e-06, + "loss": 0.5615, + "step": 30180 + }, + { + "epoch": 3.019, + "grad_norm": 58.19817352294922, + "learning_rate": 3.9652e-06, + "loss": 0.8336, + "step": 30190 + }, + { + "epoch": 3.02, + "grad_norm": 32.32502746582031, + "learning_rate": 3.9632000000000004e-06, + "loss": 0.7315, + "step": 30200 + }, + { + "epoch": 3.021, + "grad_norm": 5.095047950744629, + "learning_rate": 3.9612e-06, + "loss": 0.456, + "step": 30210 + }, + { + "epoch": 3.022, + "grad_norm": 41.41087341308594, + "learning_rate": 3.9592e-06, + "loss": 0.8262, + "step": 30220 + }, + { + "epoch": 3.023, + "grad_norm": 1.5623220205307007, + "learning_rate": 3.957200000000001e-06, + "loss": 0.5899, + "step": 30230 + }, + { + "epoch": 3.024, + "grad_norm": 56.326873779296875, + "learning_rate": 3.9552e-06, + "loss": 0.7977, + "step": 30240 + }, + { + "epoch": 3.025, + "grad_norm": 13.570528030395508, + "learning_rate": 3.9532000000000005e-06, + "loss": 0.5588, + "step": 30250 + }, + { + "epoch": 3.026, + "grad_norm": 19.437191009521484, + "learning_rate": 3.9512e-06, + "loss": 0.391, + "step": 30260 + }, + { + "epoch": 3.027, + "grad_norm": 99.22734069824219, + "learning_rate": 3.9492e-06, + "loss": 0.6078, + "step": 30270 + }, + { + "epoch": 3.028, + "grad_norm": 36.386932373046875, + "learning_rate": 3.9472e-06, + "loss": 0.5783, + "step": 30280 + }, + { + "epoch": 3.029, + "grad_norm": 6.841565132141113, + "learning_rate": 3.9452e-06, + "loss": 0.5239, + "step": 30290 + }, + { + "epoch": 3.03, + "grad_norm": 53.24142074584961, + "learning_rate": 3.943200000000001e-06, + "loss": 0.486, + "step": 30300 + }, + { + "epoch": 3.031, + "grad_norm": 42.58986282348633, + "learning_rate": 3.9412000000000005e-06, + "loss": 0.6734, + "step": 30310 + }, + { + "epoch": 3.032, + "grad_norm": 13.315511703491211, + "learning_rate": 3.9392e-06, + "loss": 0.3784, + "step": 30320 + }, + { + "epoch": 3.033, + "grad_norm": 30.482662200927734, + "learning_rate": 3.9372e-06, + "loss": 0.7804, + "step": 30330 + }, + { + "epoch": 3.034, + "grad_norm": 58.506553649902344, + "learning_rate": 3.9352e-06, + "loss": 0.6049, + "step": 30340 + }, + { + "epoch": 3.035, + "grad_norm": 55.03916549682617, + "learning_rate": 3.933200000000001e-06, + "loss": 0.6667, + "step": 30350 + }, + { + "epoch": 3.036, + "grad_norm": 20.70362091064453, + "learning_rate": 3.931200000000001e-06, + "loss": 0.4529, + "step": 30360 + }, + { + "epoch": 3.037, + "grad_norm": 4.082492351531982, + "learning_rate": 3.9292000000000004e-06, + "loss": 0.6196, + "step": 30370 + }, + { + "epoch": 3.038, + "grad_norm": 40.38673400878906, + "learning_rate": 3.9272e-06, + "loss": 0.619, + "step": 30380 + }, + { + "epoch": 3.039, + "grad_norm": 35.989933013916016, + "learning_rate": 3.9252e-06, + "loss": 0.6164, + "step": 30390 + }, + { + "epoch": 3.04, + "grad_norm": 12.340978622436523, + "learning_rate": 3.9232e-06, + "loss": 0.5541, + "step": 30400 + }, + { + "epoch": 3.041, + "grad_norm": 67.83113098144531, + "learning_rate": 3.921200000000001e-06, + "loss": 0.7538, + "step": 30410 + }, + { + "epoch": 3.042, + "grad_norm": 40.05186462402344, + "learning_rate": 3.9192000000000005e-06, + "loss": 0.4375, + "step": 30420 + }, + { + "epoch": 3.043, + "grad_norm": 27.47614097595215, + "learning_rate": 3.9172e-06, + "loss": 0.5773, + "step": 30430 + }, + { + "epoch": 3.044, + "grad_norm": 26.46270179748535, + "learning_rate": 3.9152e-06, + "loss": 0.6322, + "step": 30440 + }, + { + "epoch": 3.045, + "grad_norm": 8.434654235839844, + "learning_rate": 3.9132e-06, + "loss": 0.7884, + "step": 30450 + }, + { + "epoch": 3.046, + "grad_norm": 43.2637825012207, + "learning_rate": 3.911200000000001e-06, + "loss": 0.4878, + "step": 30460 + }, + { + "epoch": 3.047, + "grad_norm": 24.111162185668945, + "learning_rate": 3.909200000000001e-06, + "loss": 0.5232, + "step": 30470 + }, + { + "epoch": 3.048, + "grad_norm": 59.05705261230469, + "learning_rate": 3.9072000000000005e-06, + "loss": 0.7774, + "step": 30480 + }, + { + "epoch": 3.049, + "grad_norm": 54.003746032714844, + "learning_rate": 3.9052e-06, + "loss": 0.801, + "step": 30490 + }, + { + "epoch": 3.05, + "grad_norm": 25.38113021850586, + "learning_rate": 3.9032e-06, + "loss": 0.8444, + "step": 30500 + }, + { + "epoch": 3.051, + "grad_norm": 2.1326496601104736, + "learning_rate": 3.901200000000001e-06, + "loss": 0.551, + "step": 30510 + }, + { + "epoch": 3.052, + "grad_norm": 52.77695083618164, + "learning_rate": 3.8992e-06, + "loss": 0.617, + "step": 30520 + }, + { + "epoch": 3.053, + "grad_norm": 55.81608963012695, + "learning_rate": 3.8972000000000006e-06, + "loss": 0.4779, + "step": 30530 + }, + { + "epoch": 3.054, + "grad_norm": 33.85139083862305, + "learning_rate": 3.8952e-06, + "loss": 0.6629, + "step": 30540 + }, + { + "epoch": 3.055, + "grad_norm": 40.588417053222656, + "learning_rate": 3.8932e-06, + "loss": 0.6148, + "step": 30550 + }, + { + "epoch": 3.056, + "grad_norm": 32.17471694946289, + "learning_rate": 3.8912e-06, + "loss": 0.772, + "step": 30560 + }, + { + "epoch": 3.057, + "grad_norm": 26.596405029296875, + "learning_rate": 3.8892e-06, + "loss": 0.6355, + "step": 30570 + }, + { + "epoch": 3.058, + "grad_norm": 28.121776580810547, + "learning_rate": 3.887200000000001e-06, + "loss": 0.7167, + "step": 30580 + }, + { + "epoch": 3.059, + "grad_norm": 28.322776794433594, + "learning_rate": 3.8852e-06, + "loss": 0.6618, + "step": 30590 + }, + { + "epoch": 3.06, + "grad_norm": 22.134326934814453, + "learning_rate": 3.8832e-06, + "loss": 0.6055, + "step": 30600 + }, + { + "epoch": 3.061, + "grad_norm": 63.12453842163086, + "learning_rate": 3.8812e-06, + "loss": 0.6147, + "step": 30610 + }, + { + "epoch": 3.062, + "grad_norm": 31.899450302124023, + "learning_rate": 3.8792e-06, + "loss": 0.4047, + "step": 30620 + }, + { + "epoch": 3.063, + "grad_norm": 46.35337448120117, + "learning_rate": 3.877200000000001e-06, + "loss": 0.7266, + "step": 30630 + }, + { + "epoch": 3.064, + "grad_norm": 46.468223571777344, + "learning_rate": 3.8752e-06, + "loss": 0.5428, + "step": 30640 + }, + { + "epoch": 3.065, + "grad_norm": 6.490557670593262, + "learning_rate": 3.8732000000000005e-06, + "loss": 0.3811, + "step": 30650 + }, + { + "epoch": 3.066, + "grad_norm": 44.99673843383789, + "learning_rate": 3.8712e-06, + "loss": 0.5279, + "step": 30660 + }, + { + "epoch": 3.067, + "grad_norm": 34.744102478027344, + "learning_rate": 3.8692e-06, + "loss": 0.5125, + "step": 30670 + }, + { + "epoch": 3.068, + "grad_norm": 27.284143447875977, + "learning_rate": 3.8672e-06, + "loss": 0.4262, + "step": 30680 + }, + { + "epoch": 3.069, + "grad_norm": 70.38201904296875, + "learning_rate": 3.8652e-06, + "loss": 0.8814, + "step": 30690 + }, + { + "epoch": 3.07, + "grad_norm": 48.69472122192383, + "learning_rate": 3.8632000000000006e-06, + "loss": 0.6056, + "step": 30700 + }, + { + "epoch": 3.071, + "grad_norm": 42.86852264404297, + "learning_rate": 3.8612e-06, + "loss": 0.7168, + "step": 30710 + }, + { + "epoch": 3.072, + "grad_norm": 5.89202880859375, + "learning_rate": 3.8592e-06, + "loss": 0.5317, + "step": 30720 + }, + { + "epoch": 3.073, + "grad_norm": 62.24650573730469, + "learning_rate": 3.8572e-06, + "loss": 0.8052, + "step": 30730 + }, + { + "epoch": 3.074, + "grad_norm": 109.9715347290039, + "learning_rate": 3.8552e-06, + "loss": 0.7197, + "step": 30740 + }, + { + "epoch": 3.075, + "grad_norm": 38.530025482177734, + "learning_rate": 3.853200000000001e-06, + "loss": 0.7395, + "step": 30750 + }, + { + "epoch": 3.076, + "grad_norm": 42.872074127197266, + "learning_rate": 3.8512000000000005e-06, + "loss": 0.7683, + "step": 30760 + }, + { + "epoch": 3.077, + "grad_norm": 23.42620086669922, + "learning_rate": 3.8492e-06, + "loss": 0.5626, + "step": 30770 + }, + { + "epoch": 3.078, + "grad_norm": 63.241119384765625, + "learning_rate": 3.8472e-06, + "loss": 0.5732, + "step": 30780 + }, + { + "epoch": 3.079, + "grad_norm": 55.224063873291016, + "learning_rate": 3.8452e-06, + "loss": 0.6857, + "step": 30790 + }, + { + "epoch": 3.08, + "grad_norm": 33.85127639770508, + "learning_rate": 3.8432e-06, + "loss": 0.5499, + "step": 30800 + }, + { + "epoch": 3.081, + "grad_norm": 39.279022216796875, + "learning_rate": 3.841200000000001e-06, + "loss": 0.9333, + "step": 30810 + }, + { + "epoch": 3.082, + "grad_norm": 12.963658332824707, + "learning_rate": 3.8392000000000004e-06, + "loss": 0.2521, + "step": 30820 + }, + { + "epoch": 3.083, + "grad_norm": 8.555622100830078, + "learning_rate": 3.8372e-06, + "loss": 0.7496, + "step": 30830 + }, + { + "epoch": 3.084, + "grad_norm": 26.538875579833984, + "learning_rate": 3.835200000000001e-06, + "loss": 0.5641, + "step": 30840 + }, + { + "epoch": 3.085, + "grad_norm": 26.513824462890625, + "learning_rate": 3.8332e-06, + "loss": 0.7684, + "step": 30850 + }, + { + "epoch": 3.086, + "grad_norm": 50.69865798950195, + "learning_rate": 3.831200000000001e-06, + "loss": 0.3673, + "step": 30860 + }, + { + "epoch": 3.087, + "grad_norm": 31.16800880432129, + "learning_rate": 3.8292000000000005e-06, + "loss": 0.3716, + "step": 30870 + }, + { + "epoch": 3.088, + "grad_norm": 7.409222602844238, + "learning_rate": 3.8272e-06, + "loss": 0.6453, + "step": 30880 + }, + { + "epoch": 3.089, + "grad_norm": 21.64834213256836, + "learning_rate": 3.8252e-06, + "loss": 0.514, + "step": 30890 + }, + { + "epoch": 3.09, + "grad_norm": 21.66585350036621, + "learning_rate": 3.8232e-06, + "loss": 0.5815, + "step": 30900 + }, + { + "epoch": 3.091, + "grad_norm": 48.6446418762207, + "learning_rate": 3.8214e-06, + "loss": 0.646, + "step": 30910 + }, + { + "epoch": 3.092, + "grad_norm": 47.077335357666016, + "learning_rate": 3.8194e-06, + "loss": 0.4413, + "step": 30920 + }, + { + "epoch": 3.093, + "grad_norm": 43.14842224121094, + "learning_rate": 3.817400000000001e-06, + "loss": 0.52, + "step": 30930 + }, + { + "epoch": 3.094, + "grad_norm": 42.64856719970703, + "learning_rate": 3.8154e-06, + "loss": 0.4608, + "step": 30940 + }, + { + "epoch": 3.095, + "grad_norm": 26.505043029785156, + "learning_rate": 3.8134000000000006e-06, + "loss": 0.8754, + "step": 30950 + }, + { + "epoch": 3.096, + "grad_norm": 25.05315589904785, + "learning_rate": 3.8114e-06, + "loss": 0.5399, + "step": 30960 + }, + { + "epoch": 3.097, + "grad_norm": 3.38507080078125, + "learning_rate": 3.8094000000000003e-06, + "loss": 0.5687, + "step": 30970 + }, + { + "epoch": 3.098, + "grad_norm": 25.73077392578125, + "learning_rate": 3.8074000000000006e-06, + "loss": 0.667, + "step": 30980 + }, + { + "epoch": 3.099, + "grad_norm": 51.63774871826172, + "learning_rate": 3.8054e-06, + "loss": 0.5895, + "step": 30990 + }, + { + "epoch": 3.1, + "grad_norm": 36.685298919677734, + "learning_rate": 3.8034000000000003e-06, + "loss": 0.8263, + "step": 31000 + }, + { + "epoch": 3.101, + "grad_norm": 20.661434173583984, + "learning_rate": 3.8014e-06, + "loss": 0.5372, + "step": 31010 + }, + { + "epoch": 3.102, + "grad_norm": 1.888520359992981, + "learning_rate": 3.7994000000000004e-06, + "loss": 0.4881, + "step": 31020 + }, + { + "epoch": 3.103, + "grad_norm": 61.00127029418945, + "learning_rate": 3.7974000000000007e-06, + "loss": 0.4988, + "step": 31030 + }, + { + "epoch": 3.104, + "grad_norm": 52.48198699951172, + "learning_rate": 3.7954e-06, + "loss": 0.6888, + "step": 31040 + }, + { + "epoch": 3.105, + "grad_norm": 20.20423698425293, + "learning_rate": 3.7934000000000004e-06, + "loss": 0.6551, + "step": 31050 + }, + { + "epoch": 3.106, + "grad_norm": 8.83470344543457, + "learning_rate": 3.7914000000000002e-06, + "loss": 0.8729, + "step": 31060 + }, + { + "epoch": 3.107, + "grad_norm": 17.743305206298828, + "learning_rate": 3.7894e-06, + "loss": 0.3913, + "step": 31070 + }, + { + "epoch": 3.108, + "grad_norm": 33.55481719970703, + "learning_rate": 3.7874000000000004e-06, + "loss": 0.6961, + "step": 31080 + }, + { + "epoch": 3.109, + "grad_norm": 45.66580581665039, + "learning_rate": 3.7854000000000002e-06, + "loss": 0.6243, + "step": 31090 + }, + { + "epoch": 3.11, + "grad_norm": 25.366012573242188, + "learning_rate": 3.7834000000000005e-06, + "loss": 0.5794, + "step": 31100 + }, + { + "epoch": 3.111, + "grad_norm": 40.58540725708008, + "learning_rate": 3.7814e-06, + "loss": 0.6811, + "step": 31110 + }, + { + "epoch": 3.112, + "grad_norm": 19.951820373535156, + "learning_rate": 3.7794e-06, + "loss": 0.4759, + "step": 31120 + }, + { + "epoch": 3.113, + "grad_norm": 35.00603103637695, + "learning_rate": 3.7774000000000005e-06, + "loss": 0.5049, + "step": 31130 + }, + { + "epoch": 3.114, + "grad_norm": 32.95356369018555, + "learning_rate": 3.7754000000000003e-06, + "loss": 0.4614, + "step": 31140 + }, + { + "epoch": 3.115, + "grad_norm": 25.622833251953125, + "learning_rate": 3.7734000000000006e-06, + "loss": 0.7006, + "step": 31150 + }, + { + "epoch": 3.116, + "grad_norm": 55.83294677734375, + "learning_rate": 3.7714e-06, + "loss": 0.6517, + "step": 31160 + }, + { + "epoch": 3.117, + "grad_norm": 18.034406661987305, + "learning_rate": 3.7694000000000003e-06, + "loss": 0.7242, + "step": 31170 + }, + { + "epoch": 3.118, + "grad_norm": 14.074854850769043, + "learning_rate": 3.7674000000000006e-06, + "loss": 0.5512, + "step": 31180 + }, + { + "epoch": 3.1189999999999998, + "grad_norm": 30.482887268066406, + "learning_rate": 3.7654e-06, + "loss": 0.5969, + "step": 31190 + }, + { + "epoch": 3.12, + "grad_norm": 52.35322952270508, + "learning_rate": 3.7634000000000003e-06, + "loss": 0.6838, + "step": 31200 + }, + { + "epoch": 3.121, + "grad_norm": 67.32181549072266, + "learning_rate": 3.7614e-06, + "loss": 0.5823, + "step": 31210 + }, + { + "epoch": 3.122, + "grad_norm": 24.50666046142578, + "learning_rate": 3.7594000000000004e-06, + "loss": 0.4469, + "step": 31220 + }, + { + "epoch": 3.123, + "grad_norm": 35.51641845703125, + "learning_rate": 3.7574000000000007e-06, + "loss": 0.7058, + "step": 31230 + }, + { + "epoch": 3.124, + "grad_norm": 23.89848518371582, + "learning_rate": 3.7554e-06, + "loss": 0.7185, + "step": 31240 + }, + { + "epoch": 3.125, + "grad_norm": 32.228782653808594, + "learning_rate": 3.7534000000000004e-06, + "loss": 0.4426, + "step": 31250 + }, + { + "epoch": 3.126, + "grad_norm": 16.477025985717773, + "learning_rate": 3.7514e-06, + "loss": 0.5915, + "step": 31260 + }, + { + "epoch": 3.127, + "grad_norm": 4.443339824676514, + "learning_rate": 3.7494000000000005e-06, + "loss": 0.4272, + "step": 31270 + }, + { + "epoch": 3.128, + "grad_norm": 32.48780822753906, + "learning_rate": 3.7474000000000003e-06, + "loss": 0.5259, + "step": 31280 + }, + { + "epoch": 3.129, + "grad_norm": 7.203727722167969, + "learning_rate": 3.7454e-06, + "loss": 0.624, + "step": 31290 + }, + { + "epoch": 3.13, + "grad_norm": 71.5589828491211, + "learning_rate": 3.7434000000000004e-06, + "loss": 0.9032, + "step": 31300 + }, + { + "epoch": 3.1310000000000002, + "grad_norm": 40.37437057495117, + "learning_rate": 3.7414e-06, + "loss": 0.607, + "step": 31310 + }, + { + "epoch": 3.132, + "grad_norm": 8.436080932617188, + "learning_rate": 3.7394e-06, + "loss": 0.5178, + "step": 31320 + }, + { + "epoch": 3.133, + "grad_norm": 11.011701583862305, + "learning_rate": 3.7374000000000004e-06, + "loss": 0.5794, + "step": 31330 + }, + { + "epoch": 3.134, + "grad_norm": 27.312786102294922, + "learning_rate": 3.7354000000000003e-06, + "loss": 0.5639, + "step": 31340 + }, + { + "epoch": 3.135, + "grad_norm": 34.613746643066406, + "learning_rate": 3.7334000000000005e-06, + "loss": 0.5785, + "step": 31350 + }, + { + "epoch": 3.136, + "grad_norm": 29.656906127929688, + "learning_rate": 3.7314000000000004e-06, + "loss": 0.6471, + "step": 31360 + }, + { + "epoch": 3.137, + "grad_norm": 39.9609489440918, + "learning_rate": 3.7294000000000002e-06, + "loss": 0.9502, + "step": 31370 + }, + { + "epoch": 3.138, + "grad_norm": 28.447853088378906, + "learning_rate": 3.7274000000000005e-06, + "loss": 0.3478, + "step": 31380 + }, + { + "epoch": 3.1390000000000002, + "grad_norm": 8.006779670715332, + "learning_rate": 3.7254e-06, + "loss": 0.363, + "step": 31390 + }, + { + "epoch": 3.14, + "grad_norm": 53.307796478271484, + "learning_rate": 3.7234000000000002e-06, + "loss": 0.7523, + "step": 31400 + }, + { + "epoch": 3.141, + "grad_norm": 40.8818359375, + "learning_rate": 3.7214000000000005e-06, + "loss": 0.8793, + "step": 31410 + }, + { + "epoch": 3.142, + "grad_norm": 23.048547744750977, + "learning_rate": 3.7194000000000003e-06, + "loss": 0.4126, + "step": 31420 + }, + { + "epoch": 3.143, + "grad_norm": 55.69511795043945, + "learning_rate": 3.7174000000000006e-06, + "loss": 0.7602, + "step": 31430 + }, + { + "epoch": 3.144, + "grad_norm": 49.32610321044922, + "learning_rate": 3.7154e-06, + "loss": 0.6945, + "step": 31440 + }, + { + "epoch": 3.145, + "grad_norm": 78.19044494628906, + "learning_rate": 3.7134000000000003e-06, + "loss": 0.8383, + "step": 31450 + }, + { + "epoch": 3.146, + "grad_norm": 14.405440330505371, + "learning_rate": 3.7114000000000006e-06, + "loss": 0.5729, + "step": 31460 + }, + { + "epoch": 3.147, + "grad_norm": 46.739463806152344, + "learning_rate": 3.7094000000000004e-06, + "loss": 0.5877, + "step": 31470 + }, + { + "epoch": 3.148, + "grad_norm": 14.353507995605469, + "learning_rate": 3.7074000000000003e-06, + "loss": 0.4201, + "step": 31480 + }, + { + "epoch": 3.149, + "grad_norm": 55.31502151489258, + "learning_rate": 3.7054e-06, + "loss": 0.5387, + "step": 31490 + }, + { + "epoch": 3.15, + "grad_norm": 40.76209259033203, + "learning_rate": 3.7034000000000004e-06, + "loss": 0.2956, + "step": 31500 + }, + { + "epoch": 3.151, + "grad_norm": 35.83966827392578, + "learning_rate": 3.7014000000000007e-06, + "loss": 0.8846, + "step": 31510 + }, + { + "epoch": 3.152, + "grad_norm": 42.301090240478516, + "learning_rate": 3.6994e-06, + "loss": 0.6257, + "step": 31520 + }, + { + "epoch": 3.153, + "grad_norm": 31.590164184570312, + "learning_rate": 3.6974000000000004e-06, + "loss": 0.2422, + "step": 31530 + }, + { + "epoch": 3.154, + "grad_norm": 12.676555633544922, + "learning_rate": 3.6954000000000002e-06, + "loss": 0.7386, + "step": 31540 + }, + { + "epoch": 3.155, + "grad_norm": 54.29704284667969, + "learning_rate": 3.6934000000000005e-06, + "loss": 0.4906, + "step": 31550 + }, + { + "epoch": 3.156, + "grad_norm": 32.339134216308594, + "learning_rate": 3.6914000000000004e-06, + "loss": 0.6436, + "step": 31560 + }, + { + "epoch": 3.157, + "grad_norm": 53.582733154296875, + "learning_rate": 3.6894e-06, + "loss": 0.5199, + "step": 31570 + }, + { + "epoch": 3.158, + "grad_norm": 20.8098087310791, + "learning_rate": 3.6874000000000005e-06, + "loss": 0.5713, + "step": 31580 + }, + { + "epoch": 3.159, + "grad_norm": 12.076930046081543, + "learning_rate": 3.6854e-06, + "loss": 0.5682, + "step": 31590 + }, + { + "epoch": 3.16, + "grad_norm": 39.23326110839844, + "learning_rate": 3.6834e-06, + "loss": 0.7824, + "step": 31600 + }, + { + "epoch": 3.161, + "grad_norm": 1.6527082920074463, + "learning_rate": 3.6814000000000004e-06, + "loss": 0.8088, + "step": 31610 + }, + { + "epoch": 3.162, + "grad_norm": 46.88147735595703, + "learning_rate": 3.6794000000000003e-06, + "loss": 0.7926, + "step": 31620 + }, + { + "epoch": 3.163, + "grad_norm": 43.62668991088867, + "learning_rate": 3.6774000000000006e-06, + "loss": 0.603, + "step": 31630 + }, + { + "epoch": 3.164, + "grad_norm": 39.14570236206055, + "learning_rate": 3.6754e-06, + "loss": 0.4348, + "step": 31640 + }, + { + "epoch": 3.165, + "grad_norm": 33.04401779174805, + "learning_rate": 3.6734000000000003e-06, + "loss": 0.4615, + "step": 31650 + }, + { + "epoch": 3.166, + "grad_norm": 30.718061447143555, + "learning_rate": 3.6714000000000005e-06, + "loss": 0.6487, + "step": 31660 + }, + { + "epoch": 3.167, + "grad_norm": 40.128971099853516, + "learning_rate": 3.6694000000000004e-06, + "loss": 0.6791, + "step": 31670 + }, + { + "epoch": 3.168, + "grad_norm": 34.56775665283203, + "learning_rate": 3.6674000000000002e-06, + "loss": 0.4019, + "step": 31680 + }, + { + "epoch": 3.169, + "grad_norm": 47.323760986328125, + "learning_rate": 3.6654e-06, + "loss": 0.79, + "step": 31690 + }, + { + "epoch": 3.17, + "grad_norm": 42.98324203491211, + "learning_rate": 3.6634000000000004e-06, + "loss": 0.7067, + "step": 31700 + }, + { + "epoch": 3.171, + "grad_norm": 16.420366287231445, + "learning_rate": 3.6614000000000006e-06, + "loss": 0.7854, + "step": 31710 + }, + { + "epoch": 3.172, + "grad_norm": 45.52265930175781, + "learning_rate": 3.6594e-06, + "loss": 0.6981, + "step": 31720 + }, + { + "epoch": 3.173, + "grad_norm": 30.123014450073242, + "learning_rate": 3.6574000000000003e-06, + "loss": 0.6198, + "step": 31730 + }, + { + "epoch": 3.174, + "grad_norm": 24.232975006103516, + "learning_rate": 3.6554e-06, + "loss": 0.6919, + "step": 31740 + }, + { + "epoch": 3.175, + "grad_norm": 20.821025848388672, + "learning_rate": 3.6534000000000005e-06, + "loss": 0.6695, + "step": 31750 + }, + { + "epoch": 3.176, + "grad_norm": 51.86601638793945, + "learning_rate": 3.6514000000000003e-06, + "loss": 0.6286, + "step": 31760 + }, + { + "epoch": 3.177, + "grad_norm": 11.700900077819824, + "learning_rate": 3.6494e-06, + "loss": 0.5536, + "step": 31770 + }, + { + "epoch": 3.178, + "grad_norm": 35.8469352722168, + "learning_rate": 3.6474000000000004e-06, + "loss": 0.4811, + "step": 31780 + }, + { + "epoch": 3.179, + "grad_norm": 31.125680923461914, + "learning_rate": 3.6454e-06, + "loss": 0.4386, + "step": 31790 + }, + { + "epoch": 3.18, + "grad_norm": 40.396820068359375, + "learning_rate": 3.6434e-06, + "loss": 0.934, + "step": 31800 + }, + { + "epoch": 3.181, + "grad_norm": 2.2530946731567383, + "learning_rate": 3.6414000000000004e-06, + "loss": 0.5532, + "step": 31810 + }, + { + "epoch": 3.182, + "grad_norm": 29.33017921447754, + "learning_rate": 3.6394000000000003e-06, + "loss": 0.6204, + "step": 31820 + }, + { + "epoch": 3.183, + "grad_norm": 55.61162185668945, + "learning_rate": 3.6374000000000005e-06, + "loss": 0.4463, + "step": 31830 + }, + { + "epoch": 3.184, + "grad_norm": 4.762482166290283, + "learning_rate": 3.6354e-06, + "loss": 0.5826, + "step": 31840 + }, + { + "epoch": 3.185, + "grad_norm": 33.20708465576172, + "learning_rate": 3.6334000000000002e-06, + "loss": 0.6001, + "step": 31850 + }, + { + "epoch": 3.186, + "grad_norm": 16.60824966430664, + "learning_rate": 3.6314000000000005e-06, + "loss": 0.7306, + "step": 31860 + }, + { + "epoch": 3.187, + "grad_norm": 48.7596435546875, + "learning_rate": 3.6294000000000004e-06, + "loss": 0.5614, + "step": 31870 + }, + { + "epoch": 3.188, + "grad_norm": 11.902722358703613, + "learning_rate": 3.6274e-06, + "loss": 0.692, + "step": 31880 + }, + { + "epoch": 3.189, + "grad_norm": 47.085758209228516, + "learning_rate": 3.6254e-06, + "loss": 0.566, + "step": 31890 + }, + { + "epoch": 3.19, + "grad_norm": 42.062076568603516, + "learning_rate": 3.6234000000000003e-06, + "loss": 0.5515, + "step": 31900 + }, + { + "epoch": 3.191, + "grad_norm": 3.4789304733276367, + "learning_rate": 3.6214000000000006e-06, + "loss": 0.586, + "step": 31910 + }, + { + "epoch": 3.192, + "grad_norm": 63.67318344116211, + "learning_rate": 3.6194e-06, + "loss": 0.8436, + "step": 31920 + }, + { + "epoch": 3.193, + "grad_norm": 27.702524185180664, + "learning_rate": 3.6174000000000003e-06, + "loss": 0.3121, + "step": 31930 + }, + { + "epoch": 3.194, + "grad_norm": 46.160911560058594, + "learning_rate": 3.6154e-06, + "loss": 0.5511, + "step": 31940 + }, + { + "epoch": 3.195, + "grad_norm": 24.993696212768555, + "learning_rate": 3.6134000000000004e-06, + "loss": 0.7147, + "step": 31950 + }, + { + "epoch": 3.196, + "grad_norm": 28.18203353881836, + "learning_rate": 3.6114000000000003e-06, + "loss": 0.6309, + "step": 31960 + }, + { + "epoch": 3.197, + "grad_norm": 18.851085662841797, + "learning_rate": 3.6094e-06, + "loss": 0.4695, + "step": 31970 + }, + { + "epoch": 3.198, + "grad_norm": 85.22747039794922, + "learning_rate": 3.6074000000000004e-06, + "loss": 0.7747, + "step": 31980 + }, + { + "epoch": 3.199, + "grad_norm": 48.64833068847656, + "learning_rate": 3.6054000000000007e-06, + "loss": 0.5871, + "step": 31990 + }, + { + "epoch": 3.2, + "grad_norm": 51.850101470947266, + "learning_rate": 3.6034e-06, + "loss": 0.7215, + "step": 32000 + }, + { + "epoch": 3.201, + "grad_norm": 24.21227264404297, + "learning_rate": 3.6014000000000004e-06, + "loss": 0.8802, + "step": 32010 + }, + { + "epoch": 3.202, + "grad_norm": 17.991352081298828, + "learning_rate": 3.5994000000000002e-06, + "loss": 0.5883, + "step": 32020 + }, + { + "epoch": 3.203, + "grad_norm": 69.04742431640625, + "learning_rate": 3.5974000000000005e-06, + "loss": 0.577, + "step": 32030 + }, + { + "epoch": 3.204, + "grad_norm": 52.01885986328125, + "learning_rate": 3.5954000000000008e-06, + "loss": 0.6458, + "step": 32040 + }, + { + "epoch": 3.205, + "grad_norm": 28.483524322509766, + "learning_rate": 3.5934e-06, + "loss": 0.4858, + "step": 32050 + }, + { + "epoch": 3.206, + "grad_norm": 37.120445251464844, + "learning_rate": 3.5914000000000005e-06, + "loss": 0.2048, + "step": 32060 + }, + { + "epoch": 3.207, + "grad_norm": 14.233193397521973, + "learning_rate": 3.5894000000000003e-06, + "loss": 0.417, + "step": 32070 + }, + { + "epoch": 3.208, + "grad_norm": 17.76689910888672, + "learning_rate": 3.5874e-06, + "loss": 0.5522, + "step": 32080 + }, + { + "epoch": 3.209, + "grad_norm": 50.82288360595703, + "learning_rate": 3.5854000000000004e-06, + "loss": 0.3946, + "step": 32090 + }, + { + "epoch": 3.21, + "grad_norm": 3.2764945030212402, + "learning_rate": 3.5834000000000003e-06, + "loss": 0.4198, + "step": 32100 + }, + { + "epoch": 3.211, + "grad_norm": 60.76835632324219, + "learning_rate": 3.5814000000000006e-06, + "loss": 0.6575, + "step": 32110 + }, + { + "epoch": 3.212, + "grad_norm": 68.07122039794922, + "learning_rate": 3.5794e-06, + "loss": 0.7174, + "step": 32120 + }, + { + "epoch": 3.213, + "grad_norm": 6.457009315490723, + "learning_rate": 3.5774000000000003e-06, + "loss": 0.7412, + "step": 32130 + }, + { + "epoch": 3.214, + "grad_norm": 32.04698181152344, + "learning_rate": 3.5754000000000005e-06, + "loss": 1.0493, + "step": 32140 + }, + { + "epoch": 3.215, + "grad_norm": 11.740108489990234, + "learning_rate": 3.5734000000000004e-06, + "loss": 0.7469, + "step": 32150 + }, + { + "epoch": 3.216, + "grad_norm": 46.86805725097656, + "learning_rate": 3.5714000000000002e-06, + "loss": 0.7554, + "step": 32160 + }, + { + "epoch": 3.217, + "grad_norm": 34.02867126464844, + "learning_rate": 3.5694e-06, + "loss": 0.6535, + "step": 32170 + }, + { + "epoch": 3.218, + "grad_norm": 29.15270233154297, + "learning_rate": 3.5674000000000004e-06, + "loss": 0.7064, + "step": 32180 + }, + { + "epoch": 3.219, + "grad_norm": 23.48900032043457, + "learning_rate": 3.5654000000000006e-06, + "loss": 0.3669, + "step": 32190 + }, + { + "epoch": 3.22, + "grad_norm": 37.315250396728516, + "learning_rate": 3.5634e-06, + "loss": 0.856, + "step": 32200 + }, + { + "epoch": 3.221, + "grad_norm": 23.128664016723633, + "learning_rate": 3.5614000000000003e-06, + "loss": 0.528, + "step": 32210 + }, + { + "epoch": 3.222, + "grad_norm": 21.8521671295166, + "learning_rate": 3.5594e-06, + "loss": 0.431, + "step": 32220 + }, + { + "epoch": 3.223, + "grad_norm": 13.725652694702148, + "learning_rate": 3.5574000000000004e-06, + "loss": 0.7434, + "step": 32230 + }, + { + "epoch": 3.224, + "grad_norm": 32.80021286010742, + "learning_rate": 3.5554000000000007e-06, + "loss": 0.5275, + "step": 32240 + }, + { + "epoch": 3.225, + "grad_norm": 3.1503541469573975, + "learning_rate": 3.5534e-06, + "loss": 0.4221, + "step": 32250 + }, + { + "epoch": 3.226, + "grad_norm": 71.9884262084961, + "learning_rate": 3.5514000000000004e-06, + "loss": 0.4528, + "step": 32260 + }, + { + "epoch": 3.227, + "grad_norm": 3.422281503677368, + "learning_rate": 3.5494000000000003e-06, + "loss": 0.6253, + "step": 32270 + }, + { + "epoch": 3.228, + "grad_norm": 6.156475067138672, + "learning_rate": 3.5474e-06, + "loss": 0.5486, + "step": 32280 + }, + { + "epoch": 3.229, + "grad_norm": 26.414377212524414, + "learning_rate": 3.5454000000000004e-06, + "loss": 0.6165, + "step": 32290 + }, + { + "epoch": 3.23, + "grad_norm": 46.42730712890625, + "learning_rate": 3.5434000000000002e-06, + "loss": 1.0238, + "step": 32300 + }, + { + "epoch": 3.231, + "grad_norm": 11.5466890335083, + "learning_rate": 3.5414000000000005e-06, + "loss": 0.3546, + "step": 32310 + }, + { + "epoch": 3.232, + "grad_norm": 29.019729614257812, + "learning_rate": 3.5394e-06, + "loss": 0.4857, + "step": 32320 + }, + { + "epoch": 3.233, + "grad_norm": 27.857799530029297, + "learning_rate": 3.5374000000000002e-06, + "loss": 1.0558, + "step": 32330 + }, + { + "epoch": 3.234, + "grad_norm": 15.012434005737305, + "learning_rate": 3.5354000000000005e-06, + "loss": 0.7907, + "step": 32340 + }, + { + "epoch": 3.235, + "grad_norm": 21.44322395324707, + "learning_rate": 3.5334000000000003e-06, + "loss": 0.7334, + "step": 32350 + }, + { + "epoch": 3.2359999999999998, + "grad_norm": 32.07785415649414, + "learning_rate": 3.5314e-06, + "loss": 0.6319, + "step": 32360 + }, + { + "epoch": 3.237, + "grad_norm": 34.226993560791016, + "learning_rate": 3.5294e-06, + "loss": 0.5482, + "step": 32370 + }, + { + "epoch": 3.238, + "grad_norm": 30.83748435974121, + "learning_rate": 3.5274000000000003e-06, + "loss": 0.3989, + "step": 32380 + }, + { + "epoch": 3.239, + "grad_norm": 34.788299560546875, + "learning_rate": 3.5254000000000006e-06, + "loss": 0.8568, + "step": 32390 + }, + { + "epoch": 3.24, + "grad_norm": 10.510351181030273, + "learning_rate": 3.5234e-06, + "loss": 0.6174, + "step": 32400 + }, + { + "epoch": 3.241, + "grad_norm": 7.551955223083496, + "learning_rate": 3.5214000000000003e-06, + "loss": 0.4735, + "step": 32410 + }, + { + "epoch": 3.242, + "grad_norm": 53.604469299316406, + "learning_rate": 3.5194e-06, + "loss": 0.5829, + "step": 32420 + }, + { + "epoch": 3.243, + "grad_norm": 35.08150100708008, + "learning_rate": 3.5174000000000004e-06, + "loss": 0.4748, + "step": 32430 + }, + { + "epoch": 3.2439999999999998, + "grad_norm": 36.20558166503906, + "learning_rate": 3.5154000000000007e-06, + "loss": 0.7476, + "step": 32440 + }, + { + "epoch": 3.245, + "grad_norm": 29.312915802001953, + "learning_rate": 3.5134e-06, + "loss": 0.9648, + "step": 32450 + }, + { + "epoch": 3.246, + "grad_norm": 28.11040687561035, + "learning_rate": 3.5114000000000004e-06, + "loss": 0.6069, + "step": 32460 + }, + { + "epoch": 3.247, + "grad_norm": 39.97213363647461, + "learning_rate": 3.5094000000000002e-06, + "loss": 0.6488, + "step": 32470 + }, + { + "epoch": 3.248, + "grad_norm": 33.253658294677734, + "learning_rate": 3.5074e-06, + "loss": 0.5124, + "step": 32480 + }, + { + "epoch": 3.249, + "grad_norm": 35.87740707397461, + "learning_rate": 3.5054000000000004e-06, + "loss": 0.5967, + "step": 32490 + }, + { + "epoch": 3.25, + "grad_norm": 29.033767700195312, + "learning_rate": 3.5034e-06, + "loss": 0.7158, + "step": 32500 + }, + { + "epoch": 3.251, + "grad_norm": 32.10422897338867, + "learning_rate": 3.5014000000000005e-06, + "loss": 0.5006, + "step": 32510 + }, + { + "epoch": 3.252, + "grad_norm": 47.43272399902344, + "learning_rate": 3.4994e-06, + "loss": 0.6308, + "step": 32520 + }, + { + "epoch": 3.253, + "grad_norm": 37.08671188354492, + "learning_rate": 3.4974e-06, + "loss": 0.498, + "step": 32530 + }, + { + "epoch": 3.254, + "grad_norm": 20.07958984375, + "learning_rate": 3.4954000000000004e-06, + "loss": 0.5946, + "step": 32540 + }, + { + "epoch": 3.255, + "grad_norm": 51.85481643676758, + "learning_rate": 3.4934000000000003e-06, + "loss": 0.6992, + "step": 32550 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 29.262605667114258, + "learning_rate": 3.4914e-06, + "loss": 0.7368, + "step": 32560 + }, + { + "epoch": 3.257, + "grad_norm": 60.6501579284668, + "learning_rate": 3.4894e-06, + "loss": 0.7476, + "step": 32570 + }, + { + "epoch": 3.258, + "grad_norm": 9.376657485961914, + "learning_rate": 3.4874000000000003e-06, + "loss": 0.5938, + "step": 32580 + }, + { + "epoch": 3.259, + "grad_norm": 20.48821449279785, + "learning_rate": 3.4854000000000005e-06, + "loss": 0.6259, + "step": 32590 + }, + { + "epoch": 3.26, + "grad_norm": 28.9674015045166, + "learning_rate": 3.4834e-06, + "loss": 0.5057, + "step": 32600 + }, + { + "epoch": 3.261, + "grad_norm": 32.977699279785156, + "learning_rate": 3.4814000000000002e-06, + "loss": 0.5966, + "step": 32610 + }, + { + "epoch": 3.262, + "grad_norm": 9.944561004638672, + "learning_rate": 3.4794000000000005e-06, + "loss": 0.7086, + "step": 32620 + }, + { + "epoch": 3.263, + "grad_norm": 26.56491470336914, + "learning_rate": 3.4774000000000004e-06, + "loss": 0.6401, + "step": 32630 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 36.066864013671875, + "learning_rate": 3.4754000000000006e-06, + "loss": 0.6594, + "step": 32640 + }, + { + "epoch": 3.265, + "grad_norm": 37.5883903503418, + "learning_rate": 3.4734e-06, + "loss": 0.5561, + "step": 32650 + }, + { + "epoch": 3.266, + "grad_norm": 4.712746620178223, + "learning_rate": 3.4714000000000003e-06, + "loss": 0.4307, + "step": 32660 + }, + { + "epoch": 3.267, + "grad_norm": 28.391803741455078, + "learning_rate": 3.4694000000000006e-06, + "loss": 0.834, + "step": 32670 + }, + { + "epoch": 3.268, + "grad_norm": 33.248355865478516, + "learning_rate": 3.4674e-06, + "loss": 0.4551, + "step": 32680 + }, + { + "epoch": 3.269, + "grad_norm": 25.97675895690918, + "learning_rate": 3.4654000000000003e-06, + "loss": 0.5444, + "step": 32690 + }, + { + "epoch": 3.27, + "grad_norm": 19.075571060180664, + "learning_rate": 3.4634e-06, + "loss": 0.553, + "step": 32700 + }, + { + "epoch": 3.271, + "grad_norm": 22.07334327697754, + "learning_rate": 3.4614000000000004e-06, + "loss": 0.5503, + "step": 32710 + }, + { + "epoch": 3.2720000000000002, + "grad_norm": 63.43888854980469, + "learning_rate": 3.4594000000000007e-06, + "loss": 0.4606, + "step": 32720 + }, + { + "epoch": 3.273, + "grad_norm": 40.647926330566406, + "learning_rate": 3.4574e-06, + "loss": 0.6632, + "step": 32730 + }, + { + "epoch": 3.274, + "grad_norm": 29.407672882080078, + "learning_rate": 3.4554000000000004e-06, + "loss": 0.7169, + "step": 32740 + }, + { + "epoch": 3.275, + "grad_norm": 25.562196731567383, + "learning_rate": 3.4534000000000003e-06, + "loss": 0.5418, + "step": 32750 + }, + { + "epoch": 3.276, + "grad_norm": 35.23017883300781, + "learning_rate": 3.4514e-06, + "loss": 0.5916, + "step": 32760 + }, + { + "epoch": 3.277, + "grad_norm": 5.2610392570495605, + "learning_rate": 3.4494000000000004e-06, + "loss": 0.6641, + "step": 32770 + }, + { + "epoch": 3.278, + "grad_norm": 10.967763900756836, + "learning_rate": 3.4474000000000002e-06, + "loss": 0.4844, + "step": 32780 + }, + { + "epoch": 3.279, + "grad_norm": 26.412208557128906, + "learning_rate": 3.4454000000000005e-06, + "loss": 0.788, + "step": 32790 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 69.90411376953125, + "learning_rate": 3.4434e-06, + "loss": 1.0829, + "step": 32800 + }, + { + "epoch": 3.281, + "grad_norm": 39.49662780761719, + "learning_rate": 3.4414e-06, + "loss": 0.7883, + "step": 32810 + }, + { + "epoch": 3.282, + "grad_norm": 21.244108200073242, + "learning_rate": 3.4394000000000005e-06, + "loss": 0.6491, + "step": 32820 + }, + { + "epoch": 3.283, + "grad_norm": 21.366939544677734, + "learning_rate": 3.4374000000000003e-06, + "loss": 0.5096, + "step": 32830 + }, + { + "epoch": 3.284, + "grad_norm": 3.5856432914733887, + "learning_rate": 3.4354000000000006e-06, + "loss": 0.551, + "step": 32840 + }, + { + "epoch": 3.285, + "grad_norm": 22.398399353027344, + "learning_rate": 3.4334e-06, + "loss": 0.7356, + "step": 32850 + }, + { + "epoch": 3.286, + "grad_norm": 47.277099609375, + "learning_rate": 3.4314000000000003e-06, + "loss": 0.6598, + "step": 32860 + }, + { + "epoch": 3.287, + "grad_norm": 45.505672454833984, + "learning_rate": 3.4294000000000006e-06, + "loss": 0.5803, + "step": 32870 + }, + { + "epoch": 3.288, + "grad_norm": 31.135108947753906, + "learning_rate": 3.4274e-06, + "loss": 0.7393, + "step": 32880 + }, + { + "epoch": 3.289, + "grad_norm": 32.842384338378906, + "learning_rate": 3.4254000000000003e-06, + "loss": 0.5627, + "step": 32890 + }, + { + "epoch": 3.29, + "grad_norm": 42.18638610839844, + "learning_rate": 3.4234e-06, + "loss": 0.5442, + "step": 32900 + }, + { + "epoch": 3.291, + "grad_norm": 41.7022705078125, + "learning_rate": 3.4214000000000004e-06, + "loss": 0.6849, + "step": 32910 + }, + { + "epoch": 3.292, + "grad_norm": 13.518721580505371, + "learning_rate": 3.4194000000000007e-06, + "loss": 0.3897, + "step": 32920 + }, + { + "epoch": 3.293, + "grad_norm": 48.264923095703125, + "learning_rate": 3.4174e-06, + "loss": 0.9303, + "step": 32930 + }, + { + "epoch": 3.294, + "grad_norm": 7.329990386962891, + "learning_rate": 3.4154000000000004e-06, + "loss": 0.4068, + "step": 32940 + }, + { + "epoch": 3.295, + "grad_norm": 24.180370330810547, + "learning_rate": 3.4134000000000002e-06, + "loss": 0.5774, + "step": 32950 + }, + { + "epoch": 3.296, + "grad_norm": 17.27824592590332, + "learning_rate": 3.4114e-06, + "loss": 0.5578, + "step": 32960 + }, + { + "epoch": 3.297, + "grad_norm": 40.2707633972168, + "learning_rate": 3.4094000000000003e-06, + "loss": 0.5842, + "step": 32970 + }, + { + "epoch": 3.298, + "grad_norm": 28.49213409423828, + "learning_rate": 3.4074e-06, + "loss": 0.6671, + "step": 32980 + }, + { + "epoch": 3.299, + "grad_norm": 18.07645606994629, + "learning_rate": 3.4054000000000005e-06, + "loss": 0.4864, + "step": 32990 + }, + { + "epoch": 3.3, + "grad_norm": 34.3114128112793, + "learning_rate": 3.4034e-06, + "loss": 0.6038, + "step": 33000 + }, + { + "epoch": 3.301, + "grad_norm": 5.593450546264648, + "learning_rate": 3.4014e-06, + "loss": 0.6182, + "step": 33010 + }, + { + "epoch": 3.302, + "grad_norm": 40.547847747802734, + "learning_rate": 3.3994000000000004e-06, + "loss": 0.611, + "step": 33020 + }, + { + "epoch": 3.303, + "grad_norm": 45.57594299316406, + "learning_rate": 3.3974000000000003e-06, + "loss": 0.4329, + "step": 33030 + }, + { + "epoch": 3.304, + "grad_norm": 37.91822814941406, + "learning_rate": 3.3954000000000006e-06, + "loss": 0.4367, + "step": 33040 + }, + { + "epoch": 3.305, + "grad_norm": 25.43527603149414, + "learning_rate": 3.3934e-06, + "loss": 0.8501, + "step": 33050 + }, + { + "epoch": 3.306, + "grad_norm": 19.600317001342773, + "learning_rate": 3.3914000000000003e-06, + "loss": 0.554, + "step": 33060 + }, + { + "epoch": 3.307, + "grad_norm": 47.621376037597656, + "learning_rate": 3.3894000000000005e-06, + "loss": 0.5234, + "step": 33070 + }, + { + "epoch": 3.308, + "grad_norm": 2.732773542404175, + "learning_rate": 3.3874e-06, + "loss": 0.6959, + "step": 33080 + }, + { + "epoch": 3.309, + "grad_norm": 27.551868438720703, + "learning_rate": 3.3854000000000002e-06, + "loss": 0.7955, + "step": 33090 + }, + { + "epoch": 3.31, + "grad_norm": 7.095355987548828, + "learning_rate": 3.3834e-06, + "loss": 0.4998, + "step": 33100 + }, + { + "epoch": 3.311, + "grad_norm": 26.43609619140625, + "learning_rate": 3.3814000000000004e-06, + "loss": 0.4003, + "step": 33110 + }, + { + "epoch": 3.312, + "grad_norm": 26.44409942626953, + "learning_rate": 3.3794000000000006e-06, + "loss": 0.6276, + "step": 33120 + }, + { + "epoch": 3.313, + "grad_norm": 20.006427764892578, + "learning_rate": 3.3774e-06, + "loss": 0.713, + "step": 33130 + }, + { + "epoch": 3.314, + "grad_norm": 28.21528434753418, + "learning_rate": 3.3754000000000003e-06, + "loss": 0.3301, + "step": 33140 + }, + { + "epoch": 3.315, + "grad_norm": 67.65629577636719, + "learning_rate": 3.3734e-06, + "loss": 0.5409, + "step": 33150 + }, + { + "epoch": 3.316, + "grad_norm": 27.215896606445312, + "learning_rate": 3.3714e-06, + "loss": 0.789, + "step": 33160 + }, + { + "epoch": 3.317, + "grad_norm": 33.35668182373047, + "learning_rate": 3.3694000000000003e-06, + "loss": 0.7049, + "step": 33170 + }, + { + "epoch": 3.318, + "grad_norm": 27.733373641967773, + "learning_rate": 3.3674e-06, + "loss": 0.5918, + "step": 33180 + }, + { + "epoch": 3.319, + "grad_norm": 21.185148239135742, + "learning_rate": 3.3654000000000004e-06, + "loss": 0.6805, + "step": 33190 + }, + { + "epoch": 3.32, + "grad_norm": 24.466445922851562, + "learning_rate": 3.3634e-06, + "loss": 0.5639, + "step": 33200 + }, + { + "epoch": 3.321, + "grad_norm": 36.54484558105469, + "learning_rate": 3.3614e-06, + "loss": 0.5826, + "step": 33210 + }, + { + "epoch": 3.322, + "grad_norm": 49.59003829956055, + "learning_rate": 3.3594000000000004e-06, + "loss": 0.6006, + "step": 33220 + }, + { + "epoch": 3.323, + "grad_norm": 28.964061737060547, + "learning_rate": 3.3574000000000002e-06, + "loss": 0.621, + "step": 33230 + }, + { + "epoch": 3.324, + "grad_norm": 41.858428955078125, + "learning_rate": 3.3554000000000005e-06, + "loss": 0.7396, + "step": 33240 + }, + { + "epoch": 3.325, + "grad_norm": 47.466148376464844, + "learning_rate": 3.3534000000000004e-06, + "loss": 0.774, + "step": 33250 + }, + { + "epoch": 3.326, + "grad_norm": 34.58932113647461, + "learning_rate": 3.3514000000000002e-06, + "loss": 0.5768, + "step": 33260 + }, + { + "epoch": 3.327, + "grad_norm": 30.386682510375977, + "learning_rate": 3.3494000000000005e-06, + "loss": 0.4744, + "step": 33270 + }, + { + "epoch": 3.328, + "grad_norm": 32.04347610473633, + "learning_rate": 3.3474e-06, + "loss": 0.5459, + "step": 33280 + }, + { + "epoch": 3.329, + "grad_norm": 34.02959442138672, + "learning_rate": 3.3454e-06, + "loss": 0.5957, + "step": 33290 + }, + { + "epoch": 3.33, + "grad_norm": 28.056381225585938, + "learning_rate": 3.3434000000000005e-06, + "loss": 0.4959, + "step": 33300 + }, + { + "epoch": 3.331, + "grad_norm": 11.723163604736328, + "learning_rate": 3.3414000000000003e-06, + "loss": 0.4785, + "step": 33310 + }, + { + "epoch": 3.332, + "grad_norm": 45.393795013427734, + "learning_rate": 3.3394000000000006e-06, + "loss": 0.6107, + "step": 33320 + }, + { + "epoch": 3.333, + "grad_norm": 195.7186737060547, + "learning_rate": 3.3374e-06, + "loss": 0.557, + "step": 33330 + }, + { + "epoch": 3.334, + "grad_norm": 36.78384017944336, + "learning_rate": 3.3354000000000003e-06, + "loss": 0.568, + "step": 33340 + }, + { + "epoch": 3.335, + "grad_norm": 37.96222686767578, + "learning_rate": 3.3334000000000006e-06, + "loss": 0.5528, + "step": 33350 + }, + { + "epoch": 3.336, + "grad_norm": 38.002418518066406, + "learning_rate": 3.3314e-06, + "loss": 0.5147, + "step": 33360 + }, + { + "epoch": 3.337, + "grad_norm": 35.465614318847656, + "learning_rate": 3.3294000000000003e-06, + "loss": 0.4127, + "step": 33370 + }, + { + "epoch": 3.338, + "grad_norm": 15.506857872009277, + "learning_rate": 3.3274e-06, + "loss": 0.4986, + "step": 33380 + }, + { + "epoch": 3.339, + "grad_norm": 29.845312118530273, + "learning_rate": 3.3254000000000004e-06, + "loss": 0.4565, + "step": 33390 + }, + { + "epoch": 3.34, + "grad_norm": 6.27945613861084, + "learning_rate": 3.3234000000000007e-06, + "loss": 0.62, + "step": 33400 + }, + { + "epoch": 3.341, + "grad_norm": 51.38041687011719, + "learning_rate": 3.3214e-06, + "loss": 0.8341, + "step": 33410 + }, + { + "epoch": 3.342, + "grad_norm": 37.613433837890625, + "learning_rate": 3.3194000000000004e-06, + "loss": 0.6525, + "step": 33420 + }, + { + "epoch": 3.343, + "grad_norm": 6.7415266036987305, + "learning_rate": 3.3174e-06, + "loss": 0.5533, + "step": 33430 + }, + { + "epoch": 3.344, + "grad_norm": 4.774569988250732, + "learning_rate": 3.3154000000000005e-06, + "loss": 0.5075, + "step": 33440 + }, + { + "epoch": 3.3449999999999998, + "grad_norm": 26.686466217041016, + "learning_rate": 3.3134000000000003e-06, + "loss": 0.6392, + "step": 33450 + }, + { + "epoch": 3.346, + "grad_norm": 28.92704200744629, + "learning_rate": 3.3114e-06, + "loss": 0.6978, + "step": 33460 + }, + { + "epoch": 3.347, + "grad_norm": 26.489171981811523, + "learning_rate": 3.3094000000000005e-06, + "loss": 0.5123, + "step": 33470 + }, + { + "epoch": 3.348, + "grad_norm": 69.60842895507812, + "learning_rate": 3.3074e-06, + "loss": 0.7418, + "step": 33480 + }, + { + "epoch": 3.349, + "grad_norm": 42.1660270690918, + "learning_rate": 3.3054e-06, + "loss": 0.7195, + "step": 33490 + }, + { + "epoch": 3.35, + "grad_norm": 53.14607620239258, + "learning_rate": 3.3034000000000004e-06, + "loss": 0.4782, + "step": 33500 + }, + { + "epoch": 3.351, + "grad_norm": 39.90643310546875, + "learning_rate": 3.3014000000000003e-06, + "loss": 0.6492, + "step": 33510 + }, + { + "epoch": 3.352, + "grad_norm": 37.26142501831055, + "learning_rate": 3.2994000000000005e-06, + "loss": 0.4781, + "step": 33520 + }, + { + "epoch": 3.3529999999999998, + "grad_norm": 59.464988708496094, + "learning_rate": 3.2974e-06, + "loss": 0.684, + "step": 33530 + }, + { + "epoch": 3.354, + "grad_norm": 43.060523986816406, + "learning_rate": 3.2954000000000002e-06, + "loss": 0.5223, + "step": 33540 + }, + { + "epoch": 3.355, + "grad_norm": 23.580224990844727, + "learning_rate": 3.2934000000000005e-06, + "loss": 0.4565, + "step": 33550 + }, + { + "epoch": 3.356, + "grad_norm": 54.410423278808594, + "learning_rate": 3.2914e-06, + "loss": 0.586, + "step": 33560 + }, + { + "epoch": 3.357, + "grad_norm": 8.084722518920898, + "learning_rate": 3.2894000000000002e-06, + "loss": 0.442, + "step": 33570 + }, + { + "epoch": 3.358, + "grad_norm": 21.548755645751953, + "learning_rate": 3.2874e-06, + "loss": 0.5678, + "step": 33580 + }, + { + "epoch": 3.359, + "grad_norm": 14.505980491638184, + "learning_rate": 3.2854000000000003e-06, + "loss": 0.539, + "step": 33590 + }, + { + "epoch": 3.36, + "grad_norm": 4.456478118896484, + "learning_rate": 3.2834000000000006e-06, + "loss": 0.4719, + "step": 33600 + }, + { + "epoch": 3.3609999999999998, + "grad_norm": 58.42336654663086, + "learning_rate": 3.2814e-06, + "loss": 0.5288, + "step": 33610 + }, + { + "epoch": 3.362, + "grad_norm": 36.164154052734375, + "learning_rate": 3.2794000000000003e-06, + "loss": 0.5947, + "step": 33620 + }, + { + "epoch": 3.363, + "grad_norm": 53.009727478027344, + "learning_rate": 3.2774e-06, + "loss": 0.4736, + "step": 33630 + }, + { + "epoch": 3.364, + "grad_norm": 33.75554656982422, + "learning_rate": 3.2754000000000004e-06, + "loss": 0.6622, + "step": 33640 + }, + { + "epoch": 3.365, + "grad_norm": 38.6405143737793, + "learning_rate": 3.2734000000000003e-06, + "loss": 0.5776, + "step": 33650 + }, + { + "epoch": 3.366, + "grad_norm": 5.115553379058838, + "learning_rate": 3.2714e-06, + "loss": 0.6407, + "step": 33660 + }, + { + "epoch": 3.367, + "grad_norm": 33.69319152832031, + "learning_rate": 3.2694000000000004e-06, + "loss": 0.5197, + "step": 33670 + }, + { + "epoch": 3.368, + "grad_norm": 37.610408782958984, + "learning_rate": 3.2674e-06, + "loss": 0.794, + "step": 33680 + }, + { + "epoch": 3.3689999999999998, + "grad_norm": 69.22075653076172, + "learning_rate": 3.2654e-06, + "loss": 0.4071, + "step": 33690 + }, + { + "epoch": 3.37, + "grad_norm": 33.58649444580078, + "learning_rate": 3.2634000000000004e-06, + "loss": 0.5019, + "step": 33700 + }, + { + "epoch": 3.371, + "grad_norm": 67.5018539428711, + "learning_rate": 3.2614000000000002e-06, + "loss": 0.5652, + "step": 33710 + }, + { + "epoch": 3.372, + "grad_norm": 45.005184173583984, + "learning_rate": 3.2594000000000005e-06, + "loss": 0.4798, + "step": 33720 + }, + { + "epoch": 3.373, + "grad_norm": 30.43639373779297, + "learning_rate": 3.2574e-06, + "loss": 0.559, + "step": 33730 + }, + { + "epoch": 3.374, + "grad_norm": 45.54741287231445, + "learning_rate": 3.2554e-06, + "loss": 0.6114, + "step": 33740 + }, + { + "epoch": 3.375, + "grad_norm": 16.796628952026367, + "learning_rate": 3.2534000000000005e-06, + "loss": 0.7101, + "step": 33750 + }, + { + "epoch": 3.376, + "grad_norm": 13.3946533203125, + "learning_rate": 3.2514e-06, + "loss": 0.6903, + "step": 33760 + }, + { + "epoch": 3.377, + "grad_norm": 69.04631805419922, + "learning_rate": 3.2494e-06, + "loss": 0.6845, + "step": 33770 + }, + { + "epoch": 3.378, + "grad_norm": 29.6506404876709, + "learning_rate": 3.2474e-06, + "loss": 0.9378, + "step": 33780 + }, + { + "epoch": 3.379, + "grad_norm": 47.310462951660156, + "learning_rate": 3.2454000000000003e-06, + "loss": 0.4969, + "step": 33790 + }, + { + "epoch": 3.38, + "grad_norm": 41.4119987487793, + "learning_rate": 3.2434000000000006e-06, + "loss": 0.479, + "step": 33800 + }, + { + "epoch": 3.3810000000000002, + "grad_norm": 51.03746032714844, + "learning_rate": 3.2414e-06, + "loss": 0.6456, + "step": 33810 + }, + { + "epoch": 3.382, + "grad_norm": 21.19719886779785, + "learning_rate": 3.2394000000000003e-06, + "loss": 0.4372, + "step": 33820 + }, + { + "epoch": 3.383, + "grad_norm": 6.403212547302246, + "learning_rate": 3.2374000000000005e-06, + "loss": 0.4273, + "step": 33830 + }, + { + "epoch": 3.384, + "grad_norm": 37.43207550048828, + "learning_rate": 3.2354000000000004e-06, + "loss": 0.6179, + "step": 33840 + }, + { + "epoch": 3.385, + "grad_norm": 1.2079825401306152, + "learning_rate": 3.2334000000000002e-06, + "loss": 0.2904, + "step": 33850 + }, + { + "epoch": 3.386, + "grad_norm": 29.27356719970703, + "learning_rate": 3.2314e-06, + "loss": 0.6627, + "step": 33860 + }, + { + "epoch": 3.387, + "grad_norm": 45.79487228393555, + "learning_rate": 3.2294000000000004e-06, + "loss": 0.8661, + "step": 33870 + }, + { + "epoch": 3.388, + "grad_norm": 17.66958999633789, + "learning_rate": 3.2274000000000006e-06, + "loss": 0.3627, + "step": 33880 + }, + { + "epoch": 3.3890000000000002, + "grad_norm": 35.9542236328125, + "learning_rate": 3.2254e-06, + "loss": 0.9139, + "step": 33890 + }, + { + "epoch": 3.39, + "grad_norm": 11.291265487670898, + "learning_rate": 3.2234000000000003e-06, + "loss": 0.4528, + "step": 33900 + }, + { + "epoch": 3.391, + "grad_norm": 78.31896209716797, + "learning_rate": 3.2214e-06, + "loss": 0.8227, + "step": 33910 + }, + { + "epoch": 3.392, + "grad_norm": 24.657066345214844, + "learning_rate": 3.2194000000000005e-06, + "loss": 0.4872, + "step": 33920 + }, + { + "epoch": 3.393, + "grad_norm": 45.37953567504883, + "learning_rate": 3.2174000000000003e-06, + "loss": 0.7539, + "step": 33930 + }, + { + "epoch": 3.394, + "grad_norm": 42.956459045410156, + "learning_rate": 3.2154e-06, + "loss": 0.8764, + "step": 33940 + }, + { + "epoch": 3.395, + "grad_norm": 9.4888277053833, + "learning_rate": 3.2134000000000004e-06, + "loss": 0.6115, + "step": 33950 + }, + { + "epoch": 3.396, + "grad_norm": 23.10115623474121, + "learning_rate": 3.2114e-06, + "loss": 0.6539, + "step": 33960 + }, + { + "epoch": 3.3970000000000002, + "grad_norm": 17.13572883605957, + "learning_rate": 3.2094e-06, + "loss": 0.5708, + "step": 33970 + }, + { + "epoch": 3.398, + "grad_norm": 15.064404487609863, + "learning_rate": 3.2074000000000004e-06, + "loss": 0.4718, + "step": 33980 + }, + { + "epoch": 3.399, + "grad_norm": 63.72896194458008, + "learning_rate": 3.2054000000000003e-06, + "loss": 0.4698, + "step": 33990 + }, + { + "epoch": 3.4, + "grad_norm": 12.187103271484375, + "learning_rate": 3.2034000000000005e-06, + "loss": 0.4326, + "step": 34000 + }, + { + "epoch": 3.401, + "grad_norm": 67.7624740600586, + "learning_rate": 3.2014e-06, + "loss": 0.6557, + "step": 34010 + }, + { + "epoch": 3.402, + "grad_norm": 34.88738250732422, + "learning_rate": 3.1994000000000002e-06, + "loss": 0.5402, + "step": 34020 + }, + { + "epoch": 3.403, + "grad_norm": 50.04561996459961, + "learning_rate": 3.1974000000000005e-06, + "loss": 0.6956, + "step": 34030 + }, + { + "epoch": 3.404, + "grad_norm": 48.32029342651367, + "learning_rate": 3.1954000000000004e-06, + "loss": 0.6118, + "step": 34040 + }, + { + "epoch": 3.4050000000000002, + "grad_norm": 60.040218353271484, + "learning_rate": 3.1934e-06, + "loss": 0.5796, + "step": 34050 + }, + { + "epoch": 3.406, + "grad_norm": 41.5383415222168, + "learning_rate": 3.1914e-06, + "loss": 0.5164, + "step": 34060 + }, + { + "epoch": 3.407, + "grad_norm": 22.694530487060547, + "learning_rate": 3.1894000000000003e-06, + "loss": 0.4927, + "step": 34070 + }, + { + "epoch": 3.408, + "grad_norm": 45.76729202270508, + "learning_rate": 3.1874000000000006e-06, + "loss": 0.6338, + "step": 34080 + }, + { + "epoch": 3.409, + "grad_norm": 2.770658493041992, + "learning_rate": 3.1854e-06, + "loss": 0.8599, + "step": 34090 + }, + { + "epoch": 3.41, + "grad_norm": 12.581628799438477, + "learning_rate": 3.1834000000000003e-06, + "loss": 0.4446, + "step": 34100 + }, + { + "epoch": 3.411, + "grad_norm": 15.565549850463867, + "learning_rate": 3.1814e-06, + "loss": 0.5537, + "step": 34110 + }, + { + "epoch": 3.412, + "grad_norm": 24.086557388305664, + "learning_rate": 3.1794000000000004e-06, + "loss": 0.5949, + "step": 34120 + }, + { + "epoch": 3.413, + "grad_norm": 18.560720443725586, + "learning_rate": 3.1774000000000003e-06, + "loss": 0.5971, + "step": 34130 + }, + { + "epoch": 3.414, + "grad_norm": 40.32215881347656, + "learning_rate": 3.1754e-06, + "loss": 0.5366, + "step": 34140 + }, + { + "epoch": 3.415, + "grad_norm": 42.73981857299805, + "learning_rate": 3.1734000000000004e-06, + "loss": 0.4257, + "step": 34150 + }, + { + "epoch": 3.416, + "grad_norm": 27.142242431640625, + "learning_rate": 3.1714e-06, + "loss": 0.7793, + "step": 34160 + }, + { + "epoch": 3.417, + "grad_norm": 31.951887130737305, + "learning_rate": 3.1694e-06, + "loss": 0.3895, + "step": 34170 + }, + { + "epoch": 3.418, + "grad_norm": 34.47846603393555, + "learning_rate": 3.1674000000000004e-06, + "loss": 0.6155, + "step": 34180 + }, + { + "epoch": 3.419, + "grad_norm": 33.36940383911133, + "learning_rate": 3.1654000000000002e-06, + "loss": 0.416, + "step": 34190 + }, + { + "epoch": 3.42, + "grad_norm": 71.54866027832031, + "learning_rate": 3.1634000000000005e-06, + "loss": 0.8407, + "step": 34200 + }, + { + "epoch": 3.421, + "grad_norm": 2.4726550579071045, + "learning_rate": 3.1614e-06, + "loss": 0.7122, + "step": 34210 + }, + { + "epoch": 3.422, + "grad_norm": 55.77376174926758, + "learning_rate": 3.1594e-06, + "loss": 0.543, + "step": 34220 + }, + { + "epoch": 3.423, + "grad_norm": 30.04120635986328, + "learning_rate": 3.1574000000000005e-06, + "loss": 0.7113, + "step": 34230 + }, + { + "epoch": 3.424, + "grad_norm": 7.0647993087768555, + "learning_rate": 3.1554000000000003e-06, + "loss": 0.6297, + "step": 34240 + }, + { + "epoch": 3.425, + "grad_norm": 44.25429916381836, + "learning_rate": 3.1534e-06, + "loss": 0.6442, + "step": 34250 + }, + { + "epoch": 3.426, + "grad_norm": 8.99540901184082, + "learning_rate": 3.1514e-06, + "loss": 0.63, + "step": 34260 + }, + { + "epoch": 3.427, + "grad_norm": 5.797199249267578, + "learning_rate": 3.1494000000000003e-06, + "loss": 0.3925, + "step": 34270 + }, + { + "epoch": 3.428, + "grad_norm": 33.43487548828125, + "learning_rate": 3.1474000000000006e-06, + "loss": 0.3708, + "step": 34280 + }, + { + "epoch": 3.429, + "grad_norm": 20.71714210510254, + "learning_rate": 3.1454e-06, + "loss": 0.6929, + "step": 34290 + }, + { + "epoch": 3.43, + "grad_norm": 32.383670806884766, + "learning_rate": 3.1434000000000003e-06, + "loss": 0.5878, + "step": 34300 + }, + { + "epoch": 3.431, + "grad_norm": 56.46757888793945, + "learning_rate": 3.1414e-06, + "loss": 0.6765, + "step": 34310 + }, + { + "epoch": 3.432, + "grad_norm": 5.454736709594727, + "learning_rate": 3.1394000000000004e-06, + "loss": 0.7627, + "step": 34320 + }, + { + "epoch": 3.433, + "grad_norm": 51.933231353759766, + "learning_rate": 3.1374000000000002e-06, + "loss": 0.4866, + "step": 34330 + }, + { + "epoch": 3.434, + "grad_norm": 41.372867584228516, + "learning_rate": 3.1354e-06, + "loss": 0.5965, + "step": 34340 + }, + { + "epoch": 3.435, + "grad_norm": 56.871681213378906, + "learning_rate": 3.1334000000000004e-06, + "loss": 0.6184, + "step": 34350 + }, + { + "epoch": 3.436, + "grad_norm": 6.995004177093506, + "learning_rate": 3.1314e-06, + "loss": 0.7549, + "step": 34360 + }, + { + "epoch": 3.437, + "grad_norm": 46.217041015625, + "learning_rate": 3.1294e-06, + "loss": 0.7235, + "step": 34370 + }, + { + "epoch": 3.438, + "grad_norm": 29.683107376098633, + "learning_rate": 3.1274000000000003e-06, + "loss": 0.7433, + "step": 34380 + }, + { + "epoch": 3.439, + "grad_norm": 26.741121292114258, + "learning_rate": 3.1254e-06, + "loss": 0.6973, + "step": 34390 + }, + { + "epoch": 3.44, + "grad_norm": 65.56108856201172, + "learning_rate": 3.1234000000000005e-06, + "loss": 0.5786, + "step": 34400 + }, + { + "epoch": 3.441, + "grad_norm": 13.837389945983887, + "learning_rate": 3.1214e-06, + "loss": 0.6994, + "step": 34410 + }, + { + "epoch": 3.442, + "grad_norm": 24.370201110839844, + "learning_rate": 3.1194e-06, + "loss": 0.5863, + "step": 34420 + }, + { + "epoch": 3.443, + "grad_norm": 1.192421555519104, + "learning_rate": 3.1174000000000004e-06, + "loss": 0.6617, + "step": 34430 + }, + { + "epoch": 3.444, + "grad_norm": 61.917198181152344, + "learning_rate": 3.1154000000000003e-06, + "loss": 0.5432, + "step": 34440 + }, + { + "epoch": 3.445, + "grad_norm": 26.635774612426758, + "learning_rate": 3.1134e-06, + "loss": 0.6587, + "step": 34450 + }, + { + "epoch": 3.446, + "grad_norm": 20.741262435913086, + "learning_rate": 3.1114000000000004e-06, + "loss": 0.5384, + "step": 34460 + }, + { + "epoch": 3.447, + "grad_norm": 56.38450622558594, + "learning_rate": 3.1094000000000002e-06, + "loss": 0.8204, + "step": 34470 + }, + { + "epoch": 3.448, + "grad_norm": 58.865074157714844, + "learning_rate": 3.1074000000000005e-06, + "loss": 0.679, + "step": 34480 + }, + { + "epoch": 3.449, + "grad_norm": 79.67637634277344, + "learning_rate": 3.1054e-06, + "loss": 0.6898, + "step": 34490 + }, + { + "epoch": 3.45, + "grad_norm": 52.565040588378906, + "learning_rate": 3.1034000000000002e-06, + "loss": 0.7095, + "step": 34500 + }, + { + "epoch": 3.451, + "grad_norm": 47.64340591430664, + "learning_rate": 3.1014000000000005e-06, + "loss": 0.5069, + "step": 34510 + }, + { + "epoch": 3.452, + "grad_norm": 54.49386978149414, + "learning_rate": 3.0994000000000003e-06, + "loss": 0.7183, + "step": 34520 + }, + { + "epoch": 3.453, + "grad_norm": 38.74276351928711, + "learning_rate": 3.0974000000000006e-06, + "loss": 0.4679, + "step": 34530 + }, + { + "epoch": 3.454, + "grad_norm": 21.229961395263672, + "learning_rate": 3.0954e-06, + "loss": 0.7752, + "step": 34540 + }, + { + "epoch": 3.455, + "grad_norm": 72.66622924804688, + "learning_rate": 3.0934000000000003e-06, + "loss": 0.5673, + "step": 34550 + }, + { + "epoch": 3.456, + "grad_norm": 40.880306243896484, + "learning_rate": 3.0914000000000006e-06, + "loss": 0.4422, + "step": 34560 + }, + { + "epoch": 3.457, + "grad_norm": 40.722320556640625, + "learning_rate": 3.0894e-06, + "loss": 0.6761, + "step": 34570 + }, + { + "epoch": 3.458, + "grad_norm": 63.91718292236328, + "learning_rate": 3.0874000000000003e-06, + "loss": 0.6881, + "step": 34580 + }, + { + "epoch": 3.459, + "grad_norm": 79.54600524902344, + "learning_rate": 3.0854e-06, + "loss": 0.5785, + "step": 34590 + }, + { + "epoch": 3.46, + "grad_norm": 38.31000518798828, + "learning_rate": 3.0834000000000004e-06, + "loss": 0.8471, + "step": 34600 + }, + { + "epoch": 3.461, + "grad_norm": 42.8505859375, + "learning_rate": 3.0814000000000007e-06, + "loss": 0.7023, + "step": 34610 + }, + { + "epoch": 3.462, + "grad_norm": 27.422082901000977, + "learning_rate": 3.0794e-06, + "loss": 0.7369, + "step": 34620 + }, + { + "epoch": 3.463, + "grad_norm": 45.18264389038086, + "learning_rate": 3.0774000000000004e-06, + "loss": 0.4083, + "step": 34630 + }, + { + "epoch": 3.464, + "grad_norm": 32.00255584716797, + "learning_rate": 3.0754000000000002e-06, + "loss": 0.6028, + "step": 34640 + }, + { + "epoch": 3.465, + "grad_norm": 10.15850830078125, + "learning_rate": 3.0734e-06, + "loss": 0.5721, + "step": 34650 + }, + { + "epoch": 3.466, + "grad_norm": 12.090837478637695, + "learning_rate": 3.0714000000000004e-06, + "loss": 0.6242, + "step": 34660 + }, + { + "epoch": 3.467, + "grad_norm": 55.292484283447266, + "learning_rate": 3.0694e-06, + "loss": 0.4978, + "step": 34670 + }, + { + "epoch": 3.468, + "grad_norm": 68.95906066894531, + "learning_rate": 3.0674000000000005e-06, + "loss": 0.7369, + "step": 34680 + }, + { + "epoch": 3.469, + "grad_norm": 53.9699821472168, + "learning_rate": 3.0654e-06, + "loss": 0.68, + "step": 34690 + }, + { + "epoch": 3.4699999999999998, + "grad_norm": 60.666229248046875, + "learning_rate": 3.0634e-06, + "loss": 0.855, + "step": 34700 + }, + { + "epoch": 3.471, + "grad_norm": 17.940872192382812, + "learning_rate": 3.0614000000000005e-06, + "loss": 0.3829, + "step": 34710 + }, + { + "epoch": 3.472, + "grad_norm": 38.6558723449707, + "learning_rate": 3.0594000000000003e-06, + "loss": 0.5485, + "step": 34720 + }, + { + "epoch": 3.473, + "grad_norm": 31.948421478271484, + "learning_rate": 3.0574000000000006e-06, + "loss": 0.664, + "step": 34730 + }, + { + "epoch": 3.474, + "grad_norm": 33.4412727355957, + "learning_rate": 3.0554e-06, + "loss": 0.5501, + "step": 34740 + }, + { + "epoch": 3.475, + "grad_norm": 42.890480041503906, + "learning_rate": 3.0534000000000003e-06, + "loss": 0.4292, + "step": 34750 + }, + { + "epoch": 3.476, + "grad_norm": 52.22254180908203, + "learning_rate": 3.0514000000000005e-06, + "loss": 0.6644, + "step": 34760 + }, + { + "epoch": 3.477, + "grad_norm": 3.8460352420806885, + "learning_rate": 3.0494e-06, + "loss": 0.6048, + "step": 34770 + }, + { + "epoch": 3.4779999999999998, + "grad_norm": 38.23897171020508, + "learning_rate": 3.0476e-06, + "loss": 0.6073, + "step": 34780 + }, + { + "epoch": 3.479, + "grad_norm": 39.942691802978516, + "learning_rate": 3.0456e-06, + "loss": 0.4298, + "step": 34790 + }, + { + "epoch": 3.48, + "grad_norm": 63.8907470703125, + "learning_rate": 3.0436000000000004e-06, + "loss": 0.6932, + "step": 34800 + }, + { + "epoch": 3.481, + "grad_norm": 30.509557723999023, + "learning_rate": 3.0416000000000002e-06, + "loss": 0.6961, + "step": 34810 + }, + { + "epoch": 3.482, + "grad_norm": 31.000228881835938, + "learning_rate": 3.0396000000000005e-06, + "loss": 0.5928, + "step": 34820 + }, + { + "epoch": 3.483, + "grad_norm": 15.010289192199707, + "learning_rate": 3.0376e-06, + "loss": 0.4602, + "step": 34830 + }, + { + "epoch": 3.484, + "grad_norm": 57.35770797729492, + "learning_rate": 3.0356e-06, + "loss": 0.6064, + "step": 34840 + }, + { + "epoch": 3.485, + "grad_norm": 28.138120651245117, + "learning_rate": 3.0336000000000005e-06, + "loss": 0.7436, + "step": 34850 + }, + { + "epoch": 3.4859999999999998, + "grad_norm": 44.76555252075195, + "learning_rate": 3.0316e-06, + "loss": 0.7035, + "step": 34860 + }, + { + "epoch": 3.487, + "grad_norm": 15.496353149414062, + "learning_rate": 3.0296e-06, + "loss": 0.5574, + "step": 34870 + }, + { + "epoch": 3.488, + "grad_norm": 9.24044132232666, + "learning_rate": 3.0276e-06, + "loss": 0.5043, + "step": 34880 + }, + { + "epoch": 3.489, + "grad_norm": 10.469659805297852, + "learning_rate": 3.0256000000000003e-06, + "loss": 0.4792, + "step": 34890 + }, + { + "epoch": 3.49, + "grad_norm": 23.2648983001709, + "learning_rate": 3.0236000000000006e-06, + "loss": 0.606, + "step": 34900 + }, + { + "epoch": 3.491, + "grad_norm": 60.732452392578125, + "learning_rate": 3.0216e-06, + "loss": 0.934, + "step": 34910 + }, + { + "epoch": 3.492, + "grad_norm": 10.405770301818848, + "learning_rate": 3.0196000000000003e-06, + "loss": 0.4321, + "step": 34920 + }, + { + "epoch": 3.493, + "grad_norm": 40.18657302856445, + "learning_rate": 3.0176e-06, + "loss": 0.4344, + "step": 34930 + }, + { + "epoch": 3.4939999999999998, + "grad_norm": 23.268260955810547, + "learning_rate": 3.0156000000000004e-06, + "loss": 0.4978, + "step": 34940 + }, + { + "epoch": 3.495, + "grad_norm": 24.141082763671875, + "learning_rate": 3.0136000000000003e-06, + "loss": 0.5395, + "step": 34950 + }, + { + "epoch": 3.496, + "grad_norm": 29.806568145751953, + "learning_rate": 3.0116e-06, + "loss": 0.3322, + "step": 34960 + }, + { + "epoch": 3.497, + "grad_norm": 51.73660659790039, + "learning_rate": 3.0096000000000004e-06, + "loss": 0.719, + "step": 34970 + }, + { + "epoch": 3.498, + "grad_norm": 15.9256591796875, + "learning_rate": 3.0076000000000006e-06, + "loss": 0.6364, + "step": 34980 + }, + { + "epoch": 3.499, + "grad_norm": 6.268895626068115, + "learning_rate": 3.0056e-06, + "loss": 0.6966, + "step": 34990 + }, + { + "epoch": 3.5, + "grad_norm": 64.48417663574219, + "learning_rate": 3.0036000000000003e-06, + "loss": 0.557, + "step": 35000 + }, + { + "epoch": 3.501, + "grad_norm": 26.548616409301758, + "learning_rate": 3.0016e-06, + "loss": 0.7495, + "step": 35010 + }, + { + "epoch": 3.502, + "grad_norm": 43.20073699951172, + "learning_rate": 2.9996000000000005e-06, + "loss": 0.3843, + "step": 35020 + }, + { + "epoch": 3.503, + "grad_norm": 29.13274383544922, + "learning_rate": 2.9976000000000003e-06, + "loss": 0.4718, + "step": 35030 + }, + { + "epoch": 3.504, + "grad_norm": 54.19246292114258, + "learning_rate": 2.9956e-06, + "loss": 0.6532, + "step": 35040 + }, + { + "epoch": 3.505, + "grad_norm": 42.29218673706055, + "learning_rate": 2.9936000000000004e-06, + "loss": 0.5487, + "step": 35050 + }, + { + "epoch": 3.5060000000000002, + "grad_norm": 42.01010513305664, + "learning_rate": 2.9916e-06, + "loss": 0.6537, + "step": 35060 + }, + { + "epoch": 3.507, + "grad_norm": 22.00259017944336, + "learning_rate": 2.9896e-06, + "loss": 0.5337, + "step": 35070 + }, + { + "epoch": 3.508, + "grad_norm": 47.87183380126953, + "learning_rate": 2.9876000000000004e-06, + "loss": 0.5742, + "step": 35080 + }, + { + "epoch": 3.509, + "grad_norm": 39.39622497558594, + "learning_rate": 2.9856000000000003e-06, + "loss": 0.4731, + "step": 35090 + }, + { + "epoch": 3.51, + "grad_norm": 12.731461524963379, + "learning_rate": 2.9836000000000005e-06, + "loss": 0.5601, + "step": 35100 + }, + { + "epoch": 3.511, + "grad_norm": 80.69575500488281, + "learning_rate": 2.9816e-06, + "loss": 1.0194, + "step": 35110 + }, + { + "epoch": 3.512, + "grad_norm": 26.40245246887207, + "learning_rate": 2.9796000000000002e-06, + "loss": 0.5556, + "step": 35120 + }, + { + "epoch": 3.513, + "grad_norm": 38.29823684692383, + "learning_rate": 2.9776000000000005e-06, + "loss": 0.5171, + "step": 35130 + }, + { + "epoch": 3.5140000000000002, + "grad_norm": 26.385711669921875, + "learning_rate": 2.9756000000000004e-06, + "loss": 0.6902, + "step": 35140 + }, + { + "epoch": 3.515, + "grad_norm": 46.439876556396484, + "learning_rate": 2.9736e-06, + "loss": 0.6092, + "step": 35150 + }, + { + "epoch": 3.516, + "grad_norm": 16.673818588256836, + "learning_rate": 2.9716e-06, + "loss": 0.5462, + "step": 35160 + }, + { + "epoch": 3.517, + "grad_norm": 23.730607986450195, + "learning_rate": 2.9696000000000003e-06, + "loss": 0.6426, + "step": 35170 + }, + { + "epoch": 3.518, + "grad_norm": 27.162260055541992, + "learning_rate": 2.9676000000000006e-06, + "loss": 0.5589, + "step": 35180 + }, + { + "epoch": 3.519, + "grad_norm": 47.120635986328125, + "learning_rate": 2.9656e-06, + "loss": 0.5825, + "step": 35190 + }, + { + "epoch": 3.52, + "grad_norm": 42.60955047607422, + "learning_rate": 2.9636000000000003e-06, + "loss": 0.4523, + "step": 35200 + }, + { + "epoch": 3.521, + "grad_norm": 60.848388671875, + "learning_rate": 2.9616e-06, + "loss": 0.7928, + "step": 35210 + }, + { + "epoch": 3.5220000000000002, + "grad_norm": 10.591123580932617, + "learning_rate": 2.9596000000000004e-06, + "loss": 0.3901, + "step": 35220 + }, + { + "epoch": 3.523, + "grad_norm": 57.188079833984375, + "learning_rate": 2.9576000000000003e-06, + "loss": 0.6111, + "step": 35230 + }, + { + "epoch": 3.524, + "grad_norm": 45.49052047729492, + "learning_rate": 2.9556e-06, + "loss": 0.6336, + "step": 35240 + }, + { + "epoch": 3.525, + "grad_norm": 59.25286102294922, + "learning_rate": 2.9536000000000004e-06, + "loss": 0.4022, + "step": 35250 + }, + { + "epoch": 3.526, + "grad_norm": 45.68498992919922, + "learning_rate": 2.9516e-06, + "loss": 1.1201, + "step": 35260 + }, + { + "epoch": 3.527, + "grad_norm": 12.463373184204102, + "learning_rate": 2.9496e-06, + "loss": 0.595, + "step": 35270 + }, + { + "epoch": 3.528, + "grad_norm": 50.46893310546875, + "learning_rate": 2.9476000000000004e-06, + "loss": 0.7465, + "step": 35280 + }, + { + "epoch": 3.529, + "grad_norm": 1.739295244216919, + "learning_rate": 2.9456000000000002e-06, + "loss": 0.6406, + "step": 35290 + }, + { + "epoch": 3.5300000000000002, + "grad_norm": 64.8057861328125, + "learning_rate": 2.9436000000000005e-06, + "loss": 0.6426, + "step": 35300 + }, + { + "epoch": 3.531, + "grad_norm": 7.33566427230835, + "learning_rate": 2.9416e-06, + "loss": 0.4964, + "step": 35310 + }, + { + "epoch": 3.532, + "grad_norm": 29.327688217163086, + "learning_rate": 2.9396e-06, + "loss": 0.5833, + "step": 35320 + }, + { + "epoch": 3.533, + "grad_norm": 56.58113479614258, + "learning_rate": 2.9376000000000005e-06, + "loss": 0.444, + "step": 35330 + }, + { + "epoch": 3.534, + "grad_norm": 23.588838577270508, + "learning_rate": 2.9356000000000003e-06, + "loss": 0.5028, + "step": 35340 + }, + { + "epoch": 3.535, + "grad_norm": 14.40968132019043, + "learning_rate": 2.9336e-06, + "loss": 0.2349, + "step": 35350 + }, + { + "epoch": 3.536, + "grad_norm": 58.149803161621094, + "learning_rate": 2.9316e-06, + "loss": 0.7023, + "step": 35360 + }, + { + "epoch": 3.537, + "grad_norm": 46.038902282714844, + "learning_rate": 2.9296000000000003e-06, + "loss": 0.7448, + "step": 35370 + }, + { + "epoch": 3.5380000000000003, + "grad_norm": 45.67920684814453, + "learning_rate": 2.9276000000000006e-06, + "loss": 0.6292, + "step": 35380 + }, + { + "epoch": 3.539, + "grad_norm": 14.963251113891602, + "learning_rate": 2.9256e-06, + "loss": 0.726, + "step": 35390 + }, + { + "epoch": 3.54, + "grad_norm": 30.30369758605957, + "learning_rate": 2.9236000000000003e-06, + "loss": 0.6129, + "step": 35400 + }, + { + "epoch": 3.541, + "grad_norm": 21.017148971557617, + "learning_rate": 2.9216e-06, + "loss": 0.8496, + "step": 35410 + }, + { + "epoch": 3.542, + "grad_norm": 61.78471374511719, + "learning_rate": 2.9196000000000004e-06, + "loss": 0.5446, + "step": 35420 + }, + { + "epoch": 3.543, + "grad_norm": 21.3669376373291, + "learning_rate": 2.9176000000000002e-06, + "loss": 0.3647, + "step": 35430 + }, + { + "epoch": 3.544, + "grad_norm": 19.953702926635742, + "learning_rate": 2.9156e-06, + "loss": 0.385, + "step": 35440 + }, + { + "epoch": 3.545, + "grad_norm": 46.433876037597656, + "learning_rate": 2.9136000000000004e-06, + "loss": 0.676, + "step": 35450 + }, + { + "epoch": 3.5460000000000003, + "grad_norm": 62.306941986083984, + "learning_rate": 2.9115999999999998e-06, + "loss": 0.6844, + "step": 35460 + }, + { + "epoch": 3.547, + "grad_norm": 42.35296630859375, + "learning_rate": 2.9096e-06, + "loss": 0.6106, + "step": 35470 + }, + { + "epoch": 3.548, + "grad_norm": 37.69320297241211, + "learning_rate": 2.9076000000000003e-06, + "loss": 0.6195, + "step": 35480 + }, + { + "epoch": 3.549, + "grad_norm": 46.660640716552734, + "learning_rate": 2.9056e-06, + "loss": 0.6841, + "step": 35490 + }, + { + "epoch": 3.55, + "grad_norm": 73.18340301513672, + "learning_rate": 2.9036000000000005e-06, + "loss": 0.6472, + "step": 35500 + }, + { + "epoch": 3.551, + "grad_norm": 44.00477600097656, + "learning_rate": 2.9016e-06, + "loss": 0.7367, + "step": 35510 + }, + { + "epoch": 3.552, + "grad_norm": 140.4091339111328, + "learning_rate": 2.8996e-06, + "loss": 0.6896, + "step": 35520 + }, + { + "epoch": 3.553, + "grad_norm": 9.06778621673584, + "learning_rate": 2.8976000000000004e-06, + "loss": 0.2333, + "step": 35530 + }, + { + "epoch": 3.5540000000000003, + "grad_norm": 48.09809494018555, + "learning_rate": 2.8956000000000003e-06, + "loss": 0.6038, + "step": 35540 + }, + { + "epoch": 3.555, + "grad_norm": 36.15306854248047, + "learning_rate": 2.8936e-06, + "loss": 0.4335, + "step": 35550 + }, + { + "epoch": 3.556, + "grad_norm": 9.26518726348877, + "learning_rate": 2.8916e-06, + "loss": 0.4004, + "step": 35560 + }, + { + "epoch": 3.557, + "grad_norm": 81.55866241455078, + "learning_rate": 2.8896000000000003e-06, + "loss": 0.5066, + "step": 35570 + }, + { + "epoch": 3.558, + "grad_norm": 8.793334007263184, + "learning_rate": 2.8876000000000005e-06, + "loss": 0.5561, + "step": 35580 + }, + { + "epoch": 3.559, + "grad_norm": 28.917333602905273, + "learning_rate": 2.8856e-06, + "loss": 0.7647, + "step": 35590 + }, + { + "epoch": 3.56, + "grad_norm": 36.710777282714844, + "learning_rate": 2.8836000000000002e-06, + "loss": 0.7894, + "step": 35600 + }, + { + "epoch": 3.561, + "grad_norm": 65.8742446899414, + "learning_rate": 2.8816000000000005e-06, + "loss": 0.7871, + "step": 35610 + }, + { + "epoch": 3.5620000000000003, + "grad_norm": 48.99974060058594, + "learning_rate": 2.8796000000000003e-06, + "loss": 0.6941, + "step": 35620 + }, + { + "epoch": 3.5629999999999997, + "grad_norm": 53.21778106689453, + "learning_rate": 2.8776e-06, + "loss": 0.8488, + "step": 35630 + }, + { + "epoch": 3.564, + "grad_norm": 18.24955177307129, + "learning_rate": 2.8756e-06, + "loss": 0.7039, + "step": 35640 + }, + { + "epoch": 3.565, + "grad_norm": 11.103310585021973, + "learning_rate": 2.8736000000000003e-06, + "loss": 0.5776, + "step": 35650 + }, + { + "epoch": 3.566, + "grad_norm": 21.356523513793945, + "learning_rate": 2.8716000000000006e-06, + "loss": 0.76, + "step": 35660 + }, + { + "epoch": 3.567, + "grad_norm": 61.79388427734375, + "learning_rate": 2.8696e-06, + "loss": 0.4611, + "step": 35670 + }, + { + "epoch": 3.568, + "grad_norm": 40.665496826171875, + "learning_rate": 2.8676000000000003e-06, + "loss": 0.4536, + "step": 35680 + }, + { + "epoch": 3.569, + "grad_norm": 42.91267776489258, + "learning_rate": 2.8656e-06, + "loss": 0.7923, + "step": 35690 + }, + { + "epoch": 3.57, + "grad_norm": 47.55757522583008, + "learning_rate": 2.8636000000000004e-06, + "loss": 0.924, + "step": 35700 + }, + { + "epoch": 3.5709999999999997, + "grad_norm": 51.175811767578125, + "learning_rate": 2.8616000000000007e-06, + "loss": 0.9656, + "step": 35710 + }, + { + "epoch": 3.572, + "grad_norm": 68.177734375, + "learning_rate": 2.8596e-06, + "loss": 0.645, + "step": 35720 + }, + { + "epoch": 3.573, + "grad_norm": 0.8722525835037231, + "learning_rate": 2.8576000000000004e-06, + "loss": 0.6464, + "step": 35730 + }, + { + "epoch": 3.574, + "grad_norm": 61.88508605957031, + "learning_rate": 2.8556000000000002e-06, + "loss": 0.6672, + "step": 35740 + }, + { + "epoch": 3.575, + "grad_norm": 31.792068481445312, + "learning_rate": 2.8536e-06, + "loss": 0.7106, + "step": 35750 + }, + { + "epoch": 3.576, + "grad_norm": 34.22478485107422, + "learning_rate": 2.8516000000000004e-06, + "loss": 0.5765, + "step": 35760 + }, + { + "epoch": 3.577, + "grad_norm": 31.679950714111328, + "learning_rate": 2.8496e-06, + "loss": 0.5261, + "step": 35770 + }, + { + "epoch": 3.578, + "grad_norm": 17.40015983581543, + "learning_rate": 2.8476000000000005e-06, + "loss": 0.5874, + "step": 35780 + }, + { + "epoch": 3.5789999999999997, + "grad_norm": 15.593846321105957, + "learning_rate": 2.8456e-06, + "loss": 0.4362, + "step": 35790 + }, + { + "epoch": 3.58, + "grad_norm": 27.332752227783203, + "learning_rate": 2.8436e-06, + "loss": 0.6304, + "step": 35800 + }, + { + "epoch": 3.581, + "grad_norm": 26.46154022216797, + "learning_rate": 2.8416000000000005e-06, + "loss": 0.6574, + "step": 35810 + }, + { + "epoch": 3.582, + "grad_norm": 22.900833129882812, + "learning_rate": 2.8396000000000003e-06, + "loss": 0.6414, + "step": 35820 + }, + { + "epoch": 3.583, + "grad_norm": 40.177005767822266, + "learning_rate": 2.8376e-06, + "loss": 0.6131, + "step": 35830 + }, + { + "epoch": 3.584, + "grad_norm": 29.940670013427734, + "learning_rate": 2.8356e-06, + "loss": 0.7146, + "step": 35840 + }, + { + "epoch": 3.585, + "grad_norm": 40.88385772705078, + "learning_rate": 2.8336000000000003e-06, + "loss": 0.4976, + "step": 35850 + }, + { + "epoch": 3.586, + "grad_norm": 20.683185577392578, + "learning_rate": 2.8316000000000006e-06, + "loss": 0.9411, + "step": 35860 + }, + { + "epoch": 3.5869999999999997, + "grad_norm": 4.140332221984863, + "learning_rate": 2.8296e-06, + "loss": 0.9062, + "step": 35870 + }, + { + "epoch": 3.588, + "grad_norm": 46.50490188598633, + "learning_rate": 2.8276000000000003e-06, + "loss": 0.7577, + "step": 35880 + }, + { + "epoch": 3.589, + "grad_norm": 38.72293472290039, + "learning_rate": 2.8256e-06, + "loss": 0.8583, + "step": 35890 + }, + { + "epoch": 3.59, + "grad_norm": 37.587318420410156, + "learning_rate": 2.8236000000000004e-06, + "loss": 0.6687, + "step": 35900 + }, + { + "epoch": 3.591, + "grad_norm": 42.93622970581055, + "learning_rate": 2.8216000000000006e-06, + "loss": 0.7726, + "step": 35910 + }, + { + "epoch": 3.592, + "grad_norm": 5.569044589996338, + "learning_rate": 2.8196e-06, + "loss": 0.4084, + "step": 35920 + }, + { + "epoch": 3.593, + "grad_norm": 18.586912155151367, + "learning_rate": 2.8176000000000003e-06, + "loss": 0.6104, + "step": 35930 + }, + { + "epoch": 3.594, + "grad_norm": 50.00092697143555, + "learning_rate": 2.8156e-06, + "loss": 0.5622, + "step": 35940 + }, + { + "epoch": 3.5949999999999998, + "grad_norm": 30.04667091369629, + "learning_rate": 2.8136e-06, + "loss": 0.5719, + "step": 35950 + }, + { + "epoch": 3.596, + "grad_norm": 13.743998527526855, + "learning_rate": 2.8116000000000003e-06, + "loss": 0.5752, + "step": 35960 + }, + { + "epoch": 3.597, + "grad_norm": 39.8201904296875, + "learning_rate": 2.8096e-06, + "loss": 0.4261, + "step": 35970 + }, + { + "epoch": 3.598, + "grad_norm": 31.560199737548828, + "learning_rate": 2.8076000000000004e-06, + "loss": 0.5454, + "step": 35980 + }, + { + "epoch": 3.599, + "grad_norm": 55.95412063598633, + "learning_rate": 2.8056e-06, + "loss": 0.7822, + "step": 35990 + }, + { + "epoch": 3.6, + "grad_norm": 27.65061378479004, + "learning_rate": 2.8036e-06, + "loss": 0.5083, + "step": 36000 + }, + { + "epoch": 3.601, + "grad_norm": 33.482330322265625, + "learning_rate": 2.8016000000000004e-06, + "loss": 0.5487, + "step": 36010 + }, + { + "epoch": 3.602, + "grad_norm": 56.06214904785156, + "learning_rate": 2.7996000000000003e-06, + "loss": 0.6015, + "step": 36020 + }, + { + "epoch": 3.6029999999999998, + "grad_norm": 26.832622528076172, + "learning_rate": 2.7976e-06, + "loss": 0.5467, + "step": 36030 + }, + { + "epoch": 3.604, + "grad_norm": 9.77216625213623, + "learning_rate": 2.7956e-06, + "loss": 0.3149, + "step": 36040 + }, + { + "epoch": 3.605, + "grad_norm": 45.25501251220703, + "learning_rate": 2.7936000000000002e-06, + "loss": 0.5729, + "step": 36050 + }, + { + "epoch": 3.606, + "grad_norm": 47.72624969482422, + "learning_rate": 2.7916000000000005e-06, + "loss": 0.7686, + "step": 36060 + }, + { + "epoch": 3.607, + "grad_norm": 37.49579620361328, + "learning_rate": 2.7896e-06, + "loss": 0.473, + "step": 36070 + }, + { + "epoch": 3.608, + "grad_norm": 34.11646270751953, + "learning_rate": 2.7876000000000002e-06, + "loss": 0.5037, + "step": 36080 + }, + { + "epoch": 3.609, + "grad_norm": 7.480967044830322, + "learning_rate": 2.7856e-06, + "loss": 0.4404, + "step": 36090 + }, + { + "epoch": 3.61, + "grad_norm": 12.523974418640137, + "learning_rate": 2.7836000000000003e-06, + "loss": 0.5428, + "step": 36100 + }, + { + "epoch": 3.6109999999999998, + "grad_norm": 27.516239166259766, + "learning_rate": 2.7816000000000006e-06, + "loss": 0.4199, + "step": 36110 + }, + { + "epoch": 3.612, + "grad_norm": 14.997575759887695, + "learning_rate": 2.7796e-06, + "loss": 0.3202, + "step": 36120 + }, + { + "epoch": 3.613, + "grad_norm": 23.749460220336914, + "learning_rate": 2.7776000000000003e-06, + "loss": 0.5419, + "step": 36130 + }, + { + "epoch": 3.614, + "grad_norm": 18.59351348876953, + "learning_rate": 2.7756e-06, + "loss": 0.5096, + "step": 36140 + }, + { + "epoch": 3.615, + "grad_norm": 36.554222106933594, + "learning_rate": 2.7736e-06, + "loss": 0.4627, + "step": 36150 + }, + { + "epoch": 3.616, + "grad_norm": 21.801746368408203, + "learning_rate": 2.7716000000000003e-06, + "loss": 0.5231, + "step": 36160 + }, + { + "epoch": 3.617, + "grad_norm": 9.660021781921387, + "learning_rate": 2.7696e-06, + "loss": 0.6033, + "step": 36170 + }, + { + "epoch": 3.618, + "grad_norm": 52.21128845214844, + "learning_rate": 2.7676000000000004e-06, + "loss": 0.679, + "step": 36180 + }, + { + "epoch": 3.6189999999999998, + "grad_norm": 56.615543365478516, + "learning_rate": 2.7656e-06, + "loss": 0.4759, + "step": 36190 + }, + { + "epoch": 3.62, + "grad_norm": 17.087657928466797, + "learning_rate": 2.7636e-06, + "loss": 0.6594, + "step": 36200 + }, + { + "epoch": 3.621, + "grad_norm": 1.8077119588851929, + "learning_rate": 2.7616000000000004e-06, + "loss": 0.456, + "step": 36210 + }, + { + "epoch": 3.622, + "grad_norm": 53.12010192871094, + "learning_rate": 2.7596000000000002e-06, + "loss": 0.3854, + "step": 36220 + }, + { + "epoch": 3.623, + "grad_norm": 46.78509521484375, + "learning_rate": 2.7576e-06, + "loss": 0.5449, + "step": 36230 + }, + { + "epoch": 3.624, + "grad_norm": 50.42572021484375, + "learning_rate": 2.7556000000000003e-06, + "loss": 0.6184, + "step": 36240 + }, + { + "epoch": 3.625, + "grad_norm": 40.22068405151367, + "learning_rate": 2.7536e-06, + "loss": 0.5953, + "step": 36250 + }, + { + "epoch": 3.626, + "grad_norm": 29.53036117553711, + "learning_rate": 2.7516000000000005e-06, + "loss": 0.499, + "step": 36260 + }, + { + "epoch": 3.627, + "grad_norm": 18.926544189453125, + "learning_rate": 2.7496e-06, + "loss": 0.3777, + "step": 36270 + }, + { + "epoch": 3.628, + "grad_norm": 40.2806282043457, + "learning_rate": 2.7476e-06, + "loss": 0.6982, + "step": 36280 + }, + { + "epoch": 3.629, + "grad_norm": 20.797760009765625, + "learning_rate": 2.7456000000000004e-06, + "loss": 0.7362, + "step": 36290 + }, + { + "epoch": 3.63, + "grad_norm": 6.7826032638549805, + "learning_rate": 2.7436000000000003e-06, + "loss": 0.5419, + "step": 36300 + }, + { + "epoch": 3.6310000000000002, + "grad_norm": 6.89370059967041, + "learning_rate": 2.7416000000000006e-06, + "loss": 0.6738, + "step": 36310 + }, + { + "epoch": 3.632, + "grad_norm": 18.179012298583984, + "learning_rate": 2.7396e-06, + "loss": 0.5437, + "step": 36320 + }, + { + "epoch": 3.633, + "grad_norm": 60.44831848144531, + "learning_rate": 2.7376000000000003e-06, + "loss": 0.4036, + "step": 36330 + }, + { + "epoch": 3.634, + "grad_norm": 26.54794692993164, + "learning_rate": 2.7356000000000005e-06, + "loss": 0.3875, + "step": 36340 + }, + { + "epoch": 3.635, + "grad_norm": 36.836124420166016, + "learning_rate": 2.7336e-06, + "loss": 0.5515, + "step": 36350 + }, + { + "epoch": 3.636, + "grad_norm": 66.89555358886719, + "learning_rate": 2.7316000000000002e-06, + "loss": 0.5732, + "step": 36360 + }, + { + "epoch": 3.637, + "grad_norm": 43.293033599853516, + "learning_rate": 2.7296e-06, + "loss": 0.5602, + "step": 36370 + }, + { + "epoch": 3.638, + "grad_norm": 31.94192886352539, + "learning_rate": 2.7276000000000004e-06, + "loss": 0.522, + "step": 36380 + }, + { + "epoch": 3.6390000000000002, + "grad_norm": 31.9433650970459, + "learning_rate": 2.7256000000000006e-06, + "loss": 0.4046, + "step": 36390 + }, + { + "epoch": 3.64, + "grad_norm": 49.40196990966797, + "learning_rate": 2.7236e-06, + "loss": 0.7042, + "step": 36400 + }, + { + "epoch": 3.641, + "grad_norm": 33.30634307861328, + "learning_rate": 2.7216000000000003e-06, + "loss": 0.6307, + "step": 36410 + }, + { + "epoch": 3.642, + "grad_norm": 55.43235778808594, + "learning_rate": 2.7196e-06, + "loss": 0.5498, + "step": 36420 + }, + { + "epoch": 3.643, + "grad_norm": 45.17135238647461, + "learning_rate": 2.7176e-06, + "loss": 0.5377, + "step": 36430 + }, + { + "epoch": 3.644, + "grad_norm": 52.483306884765625, + "learning_rate": 2.7156000000000003e-06, + "loss": 0.5028, + "step": 36440 + }, + { + "epoch": 3.645, + "grad_norm": 48.63310623168945, + "learning_rate": 2.7136e-06, + "loss": 0.6501, + "step": 36450 + }, + { + "epoch": 3.646, + "grad_norm": 32.639774322509766, + "learning_rate": 2.7116000000000004e-06, + "loss": 0.7266, + "step": 36460 + }, + { + "epoch": 3.6470000000000002, + "grad_norm": 12.3084135055542, + "learning_rate": 2.7096e-06, + "loss": 0.56, + "step": 36470 + }, + { + "epoch": 3.648, + "grad_norm": 3.4930202960968018, + "learning_rate": 2.7076e-06, + "loss": 0.6945, + "step": 36480 + }, + { + "epoch": 3.649, + "grad_norm": 55.61498260498047, + "learning_rate": 2.7056000000000004e-06, + "loss": 0.3165, + "step": 36490 + }, + { + "epoch": 3.65, + "grad_norm": 56.448055267333984, + "learning_rate": 2.7036000000000003e-06, + "loss": 0.3924, + "step": 36500 + }, + { + "epoch": 3.651, + "grad_norm": 7.915106773376465, + "learning_rate": 2.7016000000000005e-06, + "loss": 0.4978, + "step": 36510 + }, + { + "epoch": 3.652, + "grad_norm": 13.365342140197754, + "learning_rate": 2.6996e-06, + "loss": 0.5635, + "step": 36520 + }, + { + "epoch": 3.653, + "grad_norm": 19.216312408447266, + "learning_rate": 2.6976000000000002e-06, + "loss": 0.4758, + "step": 36530 + }, + { + "epoch": 3.654, + "grad_norm": 45.420921325683594, + "learning_rate": 2.6956000000000005e-06, + "loss": 0.7895, + "step": 36540 + }, + { + "epoch": 3.6550000000000002, + "grad_norm": 78.061279296875, + "learning_rate": 2.6936e-06, + "loss": 0.4275, + "step": 36550 + }, + { + "epoch": 3.656, + "grad_norm": 1.6079028844833374, + "learning_rate": 2.6916e-06, + "loss": 0.5539, + "step": 36560 + }, + { + "epoch": 3.657, + "grad_norm": 58.142704010009766, + "learning_rate": 2.6896e-06, + "loss": 0.9741, + "step": 36570 + }, + { + "epoch": 3.658, + "grad_norm": 8.05836009979248, + "learning_rate": 2.6876000000000003e-06, + "loss": 0.4898, + "step": 36580 + }, + { + "epoch": 3.659, + "grad_norm": 31.095792770385742, + "learning_rate": 2.6856000000000006e-06, + "loss": 0.5192, + "step": 36590 + }, + { + "epoch": 3.66, + "grad_norm": 29.871849060058594, + "learning_rate": 2.6836e-06, + "loss": 0.7613, + "step": 36600 + }, + { + "epoch": 3.661, + "grad_norm": 63.34565353393555, + "learning_rate": 2.6816000000000003e-06, + "loss": 0.5743, + "step": 36610 + }, + { + "epoch": 3.662, + "grad_norm": 7.080972671508789, + "learning_rate": 2.6796e-06, + "loss": 0.7588, + "step": 36620 + }, + { + "epoch": 3.6630000000000003, + "grad_norm": 28.010923385620117, + "learning_rate": 2.6776e-06, + "loss": 0.4268, + "step": 36630 + }, + { + "epoch": 3.664, + "grad_norm": 50.9543342590332, + "learning_rate": 2.6756000000000003e-06, + "loss": 0.6525, + "step": 36640 + }, + { + "epoch": 3.665, + "grad_norm": 2.2711169719696045, + "learning_rate": 2.6736e-06, + "loss": 0.4292, + "step": 36650 + }, + { + "epoch": 3.666, + "grad_norm": 58.64751052856445, + "learning_rate": 2.6716000000000004e-06, + "loss": 0.7755, + "step": 36660 + }, + { + "epoch": 3.667, + "grad_norm": 40.33562088012695, + "learning_rate": 2.6696e-06, + "loss": 0.6762, + "step": 36670 + }, + { + "epoch": 3.668, + "grad_norm": 8.384110450744629, + "learning_rate": 2.6676e-06, + "loss": 0.5171, + "step": 36680 + }, + { + "epoch": 3.669, + "grad_norm": 29.39927101135254, + "learning_rate": 2.6656000000000004e-06, + "loss": 0.5208, + "step": 36690 + }, + { + "epoch": 3.67, + "grad_norm": 35.47958755493164, + "learning_rate": 2.6636000000000002e-06, + "loss": 0.4768, + "step": 36700 + }, + { + "epoch": 3.6710000000000003, + "grad_norm": 49.37161636352539, + "learning_rate": 2.6616000000000005e-06, + "loss": 0.7873, + "step": 36710 + }, + { + "epoch": 3.672, + "grad_norm": 53.93315887451172, + "learning_rate": 2.6596e-06, + "loss": 0.559, + "step": 36720 + }, + { + "epoch": 3.673, + "grad_norm": 32.21564483642578, + "learning_rate": 2.6576e-06, + "loss": 0.5429, + "step": 36730 + }, + { + "epoch": 3.674, + "grad_norm": 0.6642956137657166, + "learning_rate": 2.6556000000000005e-06, + "loss": 0.5247, + "step": 36740 + }, + { + "epoch": 3.675, + "grad_norm": 43.1329460144043, + "learning_rate": 2.6536e-06, + "loss": 0.6105, + "step": 36750 + }, + { + "epoch": 3.676, + "grad_norm": 71.51045989990234, + "learning_rate": 2.6516e-06, + "loss": 0.6286, + "step": 36760 + }, + { + "epoch": 3.677, + "grad_norm": 35.833473205566406, + "learning_rate": 2.6496e-06, + "loss": 0.3915, + "step": 36770 + }, + { + "epoch": 3.678, + "grad_norm": 35.758827209472656, + "learning_rate": 2.6476000000000003e-06, + "loss": 0.4121, + "step": 36780 + }, + { + "epoch": 3.6790000000000003, + "grad_norm": 38.435794830322266, + "learning_rate": 2.6456000000000006e-06, + "loss": 0.6117, + "step": 36790 + }, + { + "epoch": 3.68, + "grad_norm": 66.92293548583984, + "learning_rate": 2.6436e-06, + "loss": 0.6519, + "step": 36800 + }, + { + "epoch": 3.681, + "grad_norm": 60.6037712097168, + "learning_rate": 2.6416000000000003e-06, + "loss": 0.5819, + "step": 36810 + }, + { + "epoch": 3.682, + "grad_norm": 79.33817291259766, + "learning_rate": 2.6396e-06, + "loss": 0.7536, + "step": 36820 + }, + { + "epoch": 3.683, + "grad_norm": 28.001657485961914, + "learning_rate": 2.6376e-06, + "loss": 0.5932, + "step": 36830 + }, + { + "epoch": 3.684, + "grad_norm": 25.109481811523438, + "learning_rate": 2.6356000000000002e-06, + "loss": 0.5357, + "step": 36840 + }, + { + "epoch": 3.685, + "grad_norm": 36.05546188354492, + "learning_rate": 2.6336e-06, + "loss": 0.7685, + "step": 36850 + }, + { + "epoch": 3.686, + "grad_norm": 14.421683311462402, + "learning_rate": 2.6316000000000004e-06, + "loss": 0.7285, + "step": 36860 + }, + { + "epoch": 3.6870000000000003, + "grad_norm": 41.31953048706055, + "learning_rate": 2.6296000000000006e-06, + "loss": 0.7947, + "step": 36870 + }, + { + "epoch": 3.6879999999999997, + "grad_norm": 18.700454711914062, + "learning_rate": 2.6276e-06, + "loss": 0.8291, + "step": 36880 + }, + { + "epoch": 3.689, + "grad_norm": 47.96348190307617, + "learning_rate": 2.6256000000000003e-06, + "loss": 0.5966, + "step": 36890 + }, + { + "epoch": 3.69, + "grad_norm": 26.686315536499023, + "learning_rate": 2.6236e-06, + "loss": 0.6098, + "step": 36900 + }, + { + "epoch": 3.691, + "grad_norm": 47.52505111694336, + "learning_rate": 2.6216000000000004e-06, + "loss": 0.7313, + "step": 36910 + }, + { + "epoch": 3.692, + "grad_norm": 75.91033935546875, + "learning_rate": 2.6196000000000003e-06, + "loss": 0.6876, + "step": 36920 + }, + { + "epoch": 3.693, + "grad_norm": 57.06541442871094, + "learning_rate": 2.6176e-06, + "loss": 0.5015, + "step": 36930 + }, + { + "epoch": 3.694, + "grad_norm": 2.7463643550872803, + "learning_rate": 2.6156000000000004e-06, + "loss": 0.5357, + "step": 36940 + }, + { + "epoch": 3.695, + "grad_norm": 92.55400848388672, + "learning_rate": 2.6136e-06, + "loss": 0.4267, + "step": 36950 + }, + { + "epoch": 3.6959999999999997, + "grad_norm": 47.57724380493164, + "learning_rate": 2.6116e-06, + "loss": 0.7616, + "step": 36960 + }, + { + "epoch": 3.697, + "grad_norm": 72.47662353515625, + "learning_rate": 2.6096000000000004e-06, + "loss": 0.6118, + "step": 36970 + }, + { + "epoch": 3.698, + "grad_norm": 66.8961410522461, + "learning_rate": 2.6076000000000002e-06, + "loss": 0.7714, + "step": 36980 + }, + { + "epoch": 3.699, + "grad_norm": 7.666031360626221, + "learning_rate": 2.6056000000000005e-06, + "loss": 0.9331, + "step": 36990 + }, + { + "epoch": 3.7, + "grad_norm": 6.739872932434082, + "learning_rate": 2.6036e-06, + "loss": 0.6918, + "step": 37000 + }, + { + "epoch": 3.701, + "grad_norm": 30.467918395996094, + "learning_rate": 2.6016000000000002e-06, + "loss": 0.5595, + "step": 37010 + }, + { + "epoch": 3.702, + "grad_norm": 6.347413063049316, + "learning_rate": 2.5996000000000005e-06, + "loss": 0.3831, + "step": 37020 + }, + { + "epoch": 3.703, + "grad_norm": 37.63015365600586, + "learning_rate": 2.5976e-06, + "loss": 0.4942, + "step": 37030 + }, + { + "epoch": 3.7039999999999997, + "grad_norm": 28.430194854736328, + "learning_rate": 2.5956e-06, + "loss": 0.7804, + "step": 37040 + }, + { + "epoch": 3.705, + "grad_norm": 18.33820343017578, + "learning_rate": 2.5936e-06, + "loss": 0.5023, + "step": 37050 + }, + { + "epoch": 3.706, + "grad_norm": 40.74871826171875, + "learning_rate": 2.5916000000000003e-06, + "loss": 0.6542, + "step": 37060 + }, + { + "epoch": 3.707, + "grad_norm": 77.77648162841797, + "learning_rate": 2.5896000000000006e-06, + "loss": 0.5121, + "step": 37070 + }, + { + "epoch": 3.708, + "grad_norm": 70.61822509765625, + "learning_rate": 2.5876e-06, + "loss": 0.6368, + "step": 37080 + }, + { + "epoch": 3.709, + "grad_norm": 34.1216926574707, + "learning_rate": 2.5856000000000003e-06, + "loss": 0.5764, + "step": 37090 + }, + { + "epoch": 3.71, + "grad_norm": 9.746734619140625, + "learning_rate": 2.5836e-06, + "loss": 0.6669, + "step": 37100 + }, + { + "epoch": 3.711, + "grad_norm": 31.46854591369629, + "learning_rate": 2.5816000000000004e-06, + "loss": 0.6934, + "step": 37110 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 28.925506591796875, + "learning_rate": 2.5796000000000003e-06, + "loss": 0.3579, + "step": 37120 + }, + { + "epoch": 3.713, + "grad_norm": 48.56563186645508, + "learning_rate": 2.5776e-06, + "loss": 0.5464, + "step": 37130 + }, + { + "epoch": 3.714, + "grad_norm": 48.421627044677734, + "learning_rate": 2.5756000000000004e-06, + "loss": 0.999, + "step": 37140 + }, + { + "epoch": 3.715, + "grad_norm": 35.519554138183594, + "learning_rate": 2.5736e-06, + "loss": 0.5991, + "step": 37150 + }, + { + "epoch": 3.716, + "grad_norm": 49.56758117675781, + "learning_rate": 2.5716e-06, + "loss": 0.6012, + "step": 37160 + }, + { + "epoch": 3.717, + "grad_norm": 32.166297912597656, + "learning_rate": 2.5696000000000004e-06, + "loss": 0.6095, + "step": 37170 + }, + { + "epoch": 3.718, + "grad_norm": 4.212522506713867, + "learning_rate": 2.5676e-06, + "loss": 0.5991, + "step": 37180 + }, + { + "epoch": 3.719, + "grad_norm": 23.69797706604004, + "learning_rate": 2.5656000000000005e-06, + "loss": 0.8162, + "step": 37190 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 36.765499114990234, + "learning_rate": 2.5636e-06, + "loss": 0.535, + "step": 37200 + }, + { + "epoch": 3.721, + "grad_norm": 27.32730484008789, + "learning_rate": 2.5616e-06, + "loss": 0.558, + "step": 37210 + }, + { + "epoch": 3.722, + "grad_norm": 0.6880661845207214, + "learning_rate": 2.5596000000000004e-06, + "loss": 0.5504, + "step": 37220 + }, + { + "epoch": 3.723, + "grad_norm": 37.06109619140625, + "learning_rate": 2.5576e-06, + "loss": 0.5269, + "step": 37230 + }, + { + "epoch": 3.724, + "grad_norm": 63.31182861328125, + "learning_rate": 2.5556e-06, + "loss": 0.627, + "step": 37240 + }, + { + "epoch": 3.725, + "grad_norm": 2.2652759552001953, + "learning_rate": 2.5536e-06, + "loss": 0.5961, + "step": 37250 + }, + { + "epoch": 3.726, + "grad_norm": 34.18130874633789, + "learning_rate": 2.5516000000000003e-06, + "loss": 0.5139, + "step": 37260 + }, + { + "epoch": 3.727, + "grad_norm": 17.9378662109375, + "learning_rate": 2.5496000000000005e-06, + "loss": 0.4794, + "step": 37270 + }, + { + "epoch": 3.7279999999999998, + "grad_norm": 5.379673004150391, + "learning_rate": 2.5476e-06, + "loss": 0.3321, + "step": 37280 + }, + { + "epoch": 3.729, + "grad_norm": 14.593806266784668, + "learning_rate": 2.5456000000000002e-06, + "loss": 0.4052, + "step": 37290 + }, + { + "epoch": 3.73, + "grad_norm": 33.24942398071289, + "learning_rate": 2.5436e-06, + "loss": 0.8694, + "step": 37300 + }, + { + "epoch": 3.731, + "grad_norm": 11.167141914367676, + "learning_rate": 2.5416000000000004e-06, + "loss": 0.4076, + "step": 37310 + }, + { + "epoch": 3.732, + "grad_norm": 6.8190388679504395, + "learning_rate": 2.5396000000000002e-06, + "loss": 0.6243, + "step": 37320 + }, + { + "epoch": 3.733, + "grad_norm": 49.4874382019043, + "learning_rate": 2.5376e-06, + "loss": 0.3881, + "step": 37330 + }, + { + "epoch": 3.734, + "grad_norm": 10.311324119567871, + "learning_rate": 2.5356000000000003e-06, + "loss": 0.264, + "step": 37340 + }, + { + "epoch": 3.735, + "grad_norm": 10.175442695617676, + "learning_rate": 2.5335999999999998e-06, + "loss": 0.5226, + "step": 37350 + }, + { + "epoch": 3.7359999999999998, + "grad_norm": 25.542966842651367, + "learning_rate": 2.5316e-06, + "loss": 0.6076, + "step": 37360 + }, + { + "epoch": 3.737, + "grad_norm": 6.893240928649902, + "learning_rate": 2.5296000000000003e-06, + "loss": 0.4515, + "step": 37370 + }, + { + "epoch": 3.738, + "grad_norm": 12.86868667602539, + "learning_rate": 2.5276e-06, + "loss": 0.778, + "step": 37380 + }, + { + "epoch": 3.739, + "grad_norm": 51.52862548828125, + "learning_rate": 2.5256000000000004e-06, + "loss": 0.7111, + "step": 37390 + }, + { + "epoch": 3.74, + "grad_norm": 50.969703674316406, + "learning_rate": 2.5236e-06, + "loss": 0.5213, + "step": 37400 + }, + { + "epoch": 3.741, + "grad_norm": 40.725242614746094, + "learning_rate": 2.5216e-06, + "loss": 0.5364, + "step": 37410 + }, + { + "epoch": 3.742, + "grad_norm": 36.41386795043945, + "learning_rate": 2.5196000000000004e-06, + "loss": 0.6574, + "step": 37420 + }, + { + "epoch": 3.743, + "grad_norm": 23.86066246032715, + "learning_rate": 2.5176e-06, + "loss": 0.3526, + "step": 37430 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 26.9005069732666, + "learning_rate": 2.5156e-06, + "loss": 0.5263, + "step": 37440 + }, + { + "epoch": 3.745, + "grad_norm": 53.60567855834961, + "learning_rate": 2.5136e-06, + "loss": 0.5625, + "step": 37450 + }, + { + "epoch": 3.746, + "grad_norm": 39.05552291870117, + "learning_rate": 2.5116000000000002e-06, + "loss": 1.0419, + "step": 37460 + }, + { + "epoch": 3.747, + "grad_norm": 57.375396728515625, + "learning_rate": 2.5096000000000005e-06, + "loss": 0.6052, + "step": 37470 + }, + { + "epoch": 3.748, + "grad_norm": 61.77552795410156, + "learning_rate": 2.5076e-06, + "loss": 0.6023, + "step": 37480 + }, + { + "epoch": 3.749, + "grad_norm": 4.251457691192627, + "learning_rate": 2.5056e-06, + "loss": 0.7081, + "step": 37490 + }, + { + "epoch": 3.75, + "grad_norm": 10.436452865600586, + "learning_rate": 2.5036000000000005e-06, + "loss": 0.5283, + "step": 37500 + }, + { + "epoch": 3.751, + "grad_norm": 6.220841884613037, + "learning_rate": 2.5016000000000003e-06, + "loss": 0.8396, + "step": 37510 + }, + { + "epoch": 3.752, + "grad_norm": 46.980194091796875, + "learning_rate": 2.4996e-06, + "loss": 0.5082, + "step": 37520 + }, + { + "epoch": 3.753, + "grad_norm": 26.87305450439453, + "learning_rate": 2.4976000000000004e-06, + "loss": 0.4748, + "step": 37530 + }, + { + "epoch": 3.754, + "grad_norm": 118.64867401123047, + "learning_rate": 2.4956000000000003e-06, + "loss": 0.7019, + "step": 37540 + }, + { + "epoch": 3.755, + "grad_norm": 33.75884246826172, + "learning_rate": 2.4936e-06, + "loss": 0.6893, + "step": 37550 + }, + { + "epoch": 3.7560000000000002, + "grad_norm": 6.014034271240234, + "learning_rate": 2.4916e-06, + "loss": 0.4001, + "step": 37560 + }, + { + "epoch": 3.757, + "grad_norm": 66.10906982421875, + "learning_rate": 2.4896000000000003e-06, + "loss": 0.6762, + "step": 37570 + }, + { + "epoch": 3.758, + "grad_norm": 55.02159881591797, + "learning_rate": 2.4876e-06, + "loss": 0.6555, + "step": 37580 + }, + { + "epoch": 3.759, + "grad_norm": 5.82243013381958, + "learning_rate": 2.4856000000000004e-06, + "loss": 0.4801, + "step": 37590 + }, + { + "epoch": 3.76, + "grad_norm": 50.23249053955078, + "learning_rate": 2.4836000000000002e-06, + "loss": 0.5579, + "step": 37600 + }, + { + "epoch": 3.761, + "grad_norm": 5.414667129516602, + "learning_rate": 2.4816e-06, + "loss": 0.7534, + "step": 37610 + }, + { + "epoch": 3.762, + "grad_norm": 47.884521484375, + "learning_rate": 2.4796000000000004e-06, + "loss": 0.9884, + "step": 37620 + }, + { + "epoch": 3.763, + "grad_norm": 59.99955749511719, + "learning_rate": 2.4776000000000002e-06, + "loss": 0.8165, + "step": 37630 + }, + { + "epoch": 3.7640000000000002, + "grad_norm": 30.009929656982422, + "learning_rate": 2.4756e-06, + "loss": 0.5666, + "step": 37640 + }, + { + "epoch": 3.765, + "grad_norm": 57.524356842041016, + "learning_rate": 2.4736000000000003e-06, + "loss": 0.6281, + "step": 37650 + }, + { + "epoch": 3.766, + "grad_norm": 5.745800018310547, + "learning_rate": 2.4716e-06, + "loss": 0.6068, + "step": 37660 + }, + { + "epoch": 3.767, + "grad_norm": 52.89751052856445, + "learning_rate": 2.4696000000000005e-06, + "loss": 0.4946, + "step": 37670 + }, + { + "epoch": 3.768, + "grad_norm": 60.40765380859375, + "learning_rate": 2.4676000000000003e-06, + "loss": 0.5985, + "step": 37680 + }, + { + "epoch": 3.769, + "grad_norm": 33.4071159362793, + "learning_rate": 2.4656e-06, + "loss": 0.4445, + "step": 37690 + }, + { + "epoch": 3.77, + "grad_norm": 7.831329345703125, + "learning_rate": 2.4636e-06, + "loss": 0.4106, + "step": 37700 + }, + { + "epoch": 3.771, + "grad_norm": 31.423812866210938, + "learning_rate": 2.4616000000000003e-06, + "loss": 0.6481, + "step": 37710 + }, + { + "epoch": 3.7720000000000002, + "grad_norm": 58.97570037841797, + "learning_rate": 2.4596e-06, + "loss": 0.6293, + "step": 37720 + }, + { + "epoch": 3.773, + "grad_norm": 45.02798843383789, + "learning_rate": 2.4576000000000004e-06, + "loss": 0.4715, + "step": 37730 + }, + { + "epoch": 3.774, + "grad_norm": 44.902523040771484, + "learning_rate": 2.4556000000000003e-06, + "loss": 0.6748, + "step": 37740 + }, + { + "epoch": 3.775, + "grad_norm": 25.11823081970215, + "learning_rate": 2.4536e-06, + "loss": 0.5346, + "step": 37750 + }, + { + "epoch": 3.776, + "grad_norm": 24.013734817504883, + "learning_rate": 2.4516e-06, + "loss": 0.776, + "step": 37760 + }, + { + "epoch": 3.777, + "grad_norm": 38.328773498535156, + "learning_rate": 2.4496000000000002e-06, + "loss": 0.7096, + "step": 37770 + }, + { + "epoch": 3.778, + "grad_norm": 27.20368003845215, + "learning_rate": 2.4476e-06, + "loss": 0.7903, + "step": 37780 + }, + { + "epoch": 3.779, + "grad_norm": 20.354225158691406, + "learning_rate": 2.4456000000000004e-06, + "loss": 0.9121, + "step": 37790 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 32.40660095214844, + "learning_rate": 2.4436e-06, + "loss": 0.5785, + "step": 37800 + }, + { + "epoch": 3.781, + "grad_norm": 47.86864471435547, + "learning_rate": 2.4416e-06, + "loss": 0.7947, + "step": 37810 + }, + { + "epoch": 3.782, + "grad_norm": 3.703575372695923, + "learning_rate": 2.4396000000000003e-06, + "loss": 0.5196, + "step": 37820 + }, + { + "epoch": 3.783, + "grad_norm": 44.8717041015625, + "learning_rate": 2.4376e-06, + "loss": 0.6596, + "step": 37830 + }, + { + "epoch": 3.784, + "grad_norm": 37.06995391845703, + "learning_rate": 2.4356e-06, + "loss": 0.7149, + "step": 37840 + }, + { + "epoch": 3.785, + "grad_norm": 21.56605339050293, + "learning_rate": 2.4336000000000003e-06, + "loss": 0.4211, + "step": 37850 + }, + { + "epoch": 3.786, + "grad_norm": 21.40250015258789, + "learning_rate": 2.4316e-06, + "loss": 0.5447, + "step": 37860 + }, + { + "epoch": 3.787, + "grad_norm": 64.63874053955078, + "learning_rate": 2.4296000000000004e-06, + "loss": 0.4935, + "step": 37870 + }, + { + "epoch": 3.7880000000000003, + "grad_norm": 50.10771179199219, + "learning_rate": 2.4276000000000003e-06, + "loss": 0.6575, + "step": 37880 + }, + { + "epoch": 3.789, + "grad_norm": 21.860170364379883, + "learning_rate": 2.4256e-06, + "loss": 0.6575, + "step": 37890 + }, + { + "epoch": 3.79, + "grad_norm": 28.551694869995117, + "learning_rate": 2.4236e-06, + "loss": 0.5472, + "step": 37900 + }, + { + "epoch": 3.791, + "grad_norm": 51.53792190551758, + "learning_rate": 2.4216000000000002e-06, + "loss": 0.3918, + "step": 37910 + }, + { + "epoch": 3.792, + "grad_norm": 2.6405768394470215, + "learning_rate": 2.4196e-06, + "loss": 0.5887, + "step": 37920 + }, + { + "epoch": 3.793, + "grad_norm": 37.74142837524414, + "learning_rate": 2.4176000000000004e-06, + "loss": 0.9124, + "step": 37930 + }, + { + "epoch": 3.794, + "grad_norm": 1.126764178276062, + "learning_rate": 2.4156000000000002e-06, + "loss": 0.3395, + "step": 37940 + }, + { + "epoch": 3.795, + "grad_norm": 50.959747314453125, + "learning_rate": 2.4136e-06, + "loss": 0.5296, + "step": 37950 + }, + { + "epoch": 3.7960000000000003, + "grad_norm": 53.771331787109375, + "learning_rate": 2.4116000000000003e-06, + "loss": 0.6108, + "step": 37960 + }, + { + "epoch": 3.797, + "grad_norm": 75.44929504394531, + "learning_rate": 2.4096e-06, + "loss": 1.0028, + "step": 37970 + }, + { + "epoch": 3.798, + "grad_norm": 14.876928329467773, + "learning_rate": 2.4076e-06, + "loss": 0.4708, + "step": 37980 + }, + { + "epoch": 3.799, + "grad_norm": 56.54835510253906, + "learning_rate": 2.4056000000000003e-06, + "loss": 0.4433, + "step": 37990 + }, + { + "epoch": 3.8, + "grad_norm": 55.80618667602539, + "learning_rate": 2.4036e-06, + "loss": 0.4473, + "step": 38000 + }, + { + "epoch": 3.801, + "grad_norm": 32.187435150146484, + "learning_rate": 2.4016000000000004e-06, + "loss": 0.6997, + "step": 38010 + }, + { + "epoch": 3.802, + "grad_norm": 80.04641723632812, + "learning_rate": 2.3996000000000003e-06, + "loss": 0.604, + "step": 38020 + }, + { + "epoch": 3.803, + "grad_norm": 45.173240661621094, + "learning_rate": 2.3976e-06, + "loss": 0.5006, + "step": 38030 + }, + { + "epoch": 3.8040000000000003, + "grad_norm": 65.73089599609375, + "learning_rate": 2.3956e-06, + "loss": 1.1262, + "step": 38040 + }, + { + "epoch": 3.805, + "grad_norm": 39.5977897644043, + "learning_rate": 2.3936000000000003e-06, + "loss": 0.5203, + "step": 38050 + }, + { + "epoch": 3.806, + "grad_norm": 13.670785903930664, + "learning_rate": 2.3916e-06, + "loss": 0.4645, + "step": 38060 + }, + { + "epoch": 3.807, + "grad_norm": 5.6472039222717285, + "learning_rate": 2.3896000000000004e-06, + "loss": 0.7319, + "step": 38070 + }, + { + "epoch": 3.808, + "grad_norm": 28.19403839111328, + "learning_rate": 2.3876000000000002e-06, + "loss": 0.6628, + "step": 38080 + }, + { + "epoch": 3.809, + "grad_norm": 19.916555404663086, + "learning_rate": 2.3856e-06, + "loss": 0.5419, + "step": 38090 + }, + { + "epoch": 3.81, + "grad_norm": 55.237937927246094, + "learning_rate": 2.3836e-06, + "loss": 0.4992, + "step": 38100 + }, + { + "epoch": 3.811, + "grad_norm": 8.9008150100708, + "learning_rate": 2.3816e-06, + "loss": 0.6025, + "step": 38110 + }, + { + "epoch": 3.8120000000000003, + "grad_norm": 10.064616203308105, + "learning_rate": 2.3796e-06, + "loss": 0.5929, + "step": 38120 + }, + { + "epoch": 3.8129999999999997, + "grad_norm": 40.20355224609375, + "learning_rate": 2.3776000000000003e-06, + "loss": 0.6161, + "step": 38130 + }, + { + "epoch": 3.814, + "grad_norm": 87.54853057861328, + "learning_rate": 2.3756e-06, + "loss": 0.8959, + "step": 38140 + }, + { + "epoch": 3.815, + "grad_norm": 20.800928115844727, + "learning_rate": 2.3736e-06, + "loss": 0.5493, + "step": 38150 + }, + { + "epoch": 3.816, + "grad_norm": 45.83055877685547, + "learning_rate": 2.3716000000000003e-06, + "loss": 0.5517, + "step": 38160 + }, + { + "epoch": 3.817, + "grad_norm": 37.758338928222656, + "learning_rate": 2.3696e-06, + "loss": 0.7703, + "step": 38170 + }, + { + "epoch": 3.818, + "grad_norm": 43.727909088134766, + "learning_rate": 2.3676e-06, + "loss": 0.752, + "step": 38180 + }, + { + "epoch": 3.819, + "grad_norm": 15.53500747680664, + "learning_rate": 2.3656000000000003e-06, + "loss": 0.7634, + "step": 38190 + }, + { + "epoch": 3.82, + "grad_norm": 15.645066261291504, + "learning_rate": 2.3636e-06, + "loss": 0.8673, + "step": 38200 + }, + { + "epoch": 3.8209999999999997, + "grad_norm": 24.320234298706055, + "learning_rate": 2.3616000000000004e-06, + "loss": 0.525, + "step": 38210 + }, + { + "epoch": 3.822, + "grad_norm": 48.00284957885742, + "learning_rate": 2.3596000000000002e-06, + "loss": 0.4945, + "step": 38220 + }, + { + "epoch": 3.823, + "grad_norm": 104.06059265136719, + "learning_rate": 2.3576e-06, + "loss": 0.5866, + "step": 38230 + }, + { + "epoch": 3.824, + "grad_norm": 47.05179214477539, + "learning_rate": 2.3556e-06, + "loss": 0.6431, + "step": 38240 + }, + { + "epoch": 3.825, + "grad_norm": 26.590356826782227, + "learning_rate": 2.3536000000000002e-06, + "loss": 0.6083, + "step": 38250 + }, + { + "epoch": 3.826, + "grad_norm": 6.2458295822143555, + "learning_rate": 2.3516e-06, + "loss": 0.7281, + "step": 38260 + }, + { + "epoch": 3.827, + "grad_norm": 33.54754638671875, + "learning_rate": 2.3496000000000003e-06, + "loss": 0.5375, + "step": 38270 + }, + { + "epoch": 3.828, + "grad_norm": 55.06610107421875, + "learning_rate": 2.3476e-06, + "loss": 0.398, + "step": 38280 + }, + { + "epoch": 3.8289999999999997, + "grad_norm": 61.530181884765625, + "learning_rate": 2.3456e-06, + "loss": 0.9226, + "step": 38290 + }, + { + "epoch": 3.83, + "grad_norm": 67.4300537109375, + "learning_rate": 2.3436000000000003e-06, + "loss": 0.7849, + "step": 38300 + }, + { + "epoch": 3.831, + "grad_norm": 17.95315170288086, + "learning_rate": 2.3416e-06, + "loss": 0.574, + "step": 38310 + }, + { + "epoch": 3.832, + "grad_norm": 26.313135147094727, + "learning_rate": 2.3396e-06, + "loss": 0.4189, + "step": 38320 + }, + { + "epoch": 3.833, + "grad_norm": 35.44434356689453, + "learning_rate": 2.3376000000000003e-06, + "loss": 0.7136, + "step": 38330 + }, + { + "epoch": 3.834, + "grad_norm": 100.7297592163086, + "learning_rate": 2.3356e-06, + "loss": 1.1248, + "step": 38340 + }, + { + "epoch": 3.835, + "grad_norm": 42.63737106323242, + "learning_rate": 2.3336000000000004e-06, + "loss": 0.8059, + "step": 38350 + }, + { + "epoch": 3.836, + "grad_norm": 28.802104949951172, + "learning_rate": 2.3316000000000003e-06, + "loss": 0.4204, + "step": 38360 + }, + { + "epoch": 3.8369999999999997, + "grad_norm": 36.844635009765625, + "learning_rate": 2.3296e-06, + "loss": 0.6372, + "step": 38370 + }, + { + "epoch": 3.838, + "grad_norm": 38.515411376953125, + "learning_rate": 2.3276e-06, + "loss": 0.4195, + "step": 38380 + }, + { + "epoch": 3.839, + "grad_norm": 44.74578857421875, + "learning_rate": 2.3256000000000002e-06, + "loss": 0.6128, + "step": 38390 + }, + { + "epoch": 3.84, + "grad_norm": 48.676204681396484, + "learning_rate": 2.3236000000000005e-06, + "loss": 0.4594, + "step": 38400 + }, + { + "epoch": 3.841, + "grad_norm": 32.13726043701172, + "learning_rate": 2.3216000000000004e-06, + "loss": 0.4666, + "step": 38410 + }, + { + "epoch": 3.842, + "grad_norm": 54.24015426635742, + "learning_rate": 2.3196e-06, + "loss": 0.578, + "step": 38420 + }, + { + "epoch": 3.843, + "grad_norm": 53.41969299316406, + "learning_rate": 2.3176e-06, + "loss": 0.7449, + "step": 38430 + }, + { + "epoch": 3.844, + "grad_norm": 57.150970458984375, + "learning_rate": 2.3156e-06, + "loss": 0.6208, + "step": 38440 + }, + { + "epoch": 3.8449999999999998, + "grad_norm": 29.20799446105957, + "learning_rate": 2.3136e-06, + "loss": 0.6802, + "step": 38450 + }, + { + "epoch": 3.846, + "grad_norm": 49.80082702636719, + "learning_rate": 2.3116e-06, + "loss": 0.4248, + "step": 38460 + }, + { + "epoch": 3.847, + "grad_norm": 38.34299087524414, + "learning_rate": 2.3096000000000003e-06, + "loss": 0.4581, + "step": 38470 + }, + { + "epoch": 3.848, + "grad_norm": 39.137451171875, + "learning_rate": 2.3076e-06, + "loss": 0.5434, + "step": 38480 + }, + { + "epoch": 3.849, + "grad_norm": 2.7317628860473633, + "learning_rate": 2.3056e-06, + "loss": 0.3184, + "step": 38490 + }, + { + "epoch": 3.85, + "grad_norm": 24.794809341430664, + "learning_rate": 2.3036000000000003e-06, + "loss": 0.4862, + "step": 38500 + }, + { + "epoch": 3.851, + "grad_norm": 43.81166076660156, + "learning_rate": 2.3016e-06, + "loss": 0.6984, + "step": 38510 + }, + { + "epoch": 3.852, + "grad_norm": 46.706764221191406, + "learning_rate": 2.2996e-06, + "loss": 0.6245, + "step": 38520 + }, + { + "epoch": 3.8529999999999998, + "grad_norm": 32.64492416381836, + "learning_rate": 2.2976000000000002e-06, + "loss": 0.5363, + "step": 38530 + }, + { + "epoch": 3.854, + "grad_norm": 43.35952377319336, + "learning_rate": 2.2956e-06, + "loss": 0.9169, + "step": 38540 + }, + { + "epoch": 3.855, + "grad_norm": 56.83952713012695, + "learning_rate": 2.2936000000000004e-06, + "loss": 0.5871, + "step": 38550 + }, + { + "epoch": 3.856, + "grad_norm": 24.13530921936035, + "learning_rate": 2.2916000000000002e-06, + "loss": 0.7488, + "step": 38560 + }, + { + "epoch": 3.857, + "grad_norm": 4.121044635772705, + "learning_rate": 2.2896e-06, + "loss": 0.687, + "step": 38570 + }, + { + "epoch": 3.858, + "grad_norm": 3.334587812423706, + "learning_rate": 2.2876e-06, + "loss": 0.764, + "step": 38580 + }, + { + "epoch": 3.859, + "grad_norm": 15.22492790222168, + "learning_rate": 2.2856e-06, + "loss": 0.5746, + "step": 38590 + }, + { + "epoch": 3.86, + "grad_norm": 39.12681198120117, + "learning_rate": 2.2836000000000005e-06, + "loss": 0.5068, + "step": 38600 + }, + { + "epoch": 3.8609999999999998, + "grad_norm": 59.579742431640625, + "learning_rate": 2.2816000000000003e-06, + "loss": 0.7578, + "step": 38610 + }, + { + "epoch": 3.862, + "grad_norm": 4.269464015960693, + "learning_rate": 2.2796e-06, + "loss": 0.5282, + "step": 38620 + }, + { + "epoch": 3.863, + "grad_norm": 11.499958992004395, + "learning_rate": 2.2776e-06, + "loss": 0.6187, + "step": 38630 + }, + { + "epoch": 3.864, + "grad_norm": 15.392281532287598, + "learning_rate": 2.2756000000000003e-06, + "loss": 0.5032, + "step": 38640 + }, + { + "epoch": 3.865, + "grad_norm": 35.44303512573242, + "learning_rate": 2.2736e-06, + "loss": 0.4503, + "step": 38650 + }, + { + "epoch": 3.866, + "grad_norm": 38.298221588134766, + "learning_rate": 2.2716e-06, + "loss": 0.6979, + "step": 38660 + }, + { + "epoch": 3.867, + "grad_norm": 40.04734420776367, + "learning_rate": 2.2696000000000003e-06, + "loss": 0.8594, + "step": 38670 + }, + { + "epoch": 3.868, + "grad_norm": 22.192415237426758, + "learning_rate": 2.2676e-06, + "loss": 0.641, + "step": 38680 + }, + { + "epoch": 3.8689999999999998, + "grad_norm": 41.32742691040039, + "learning_rate": 2.2656000000000004e-06, + "loss": 0.5788, + "step": 38690 + }, + { + "epoch": 3.87, + "grad_norm": 8.416460037231445, + "learning_rate": 2.2636000000000002e-06, + "loss": 0.4709, + "step": 38700 + }, + { + "epoch": 3.871, + "grad_norm": 16.503379821777344, + "learning_rate": 2.2616e-06, + "loss": 0.5078, + "step": 38710 + }, + { + "epoch": 3.872, + "grad_norm": 12.669565200805664, + "learning_rate": 2.2596e-06, + "loss": 0.5737, + "step": 38720 + }, + { + "epoch": 3.873, + "grad_norm": 16.520822525024414, + "learning_rate": 2.2576e-06, + "loss": 0.4941, + "step": 38730 + }, + { + "epoch": 3.874, + "grad_norm": 29.02339744567871, + "learning_rate": 2.2556000000000005e-06, + "loss": 0.6283, + "step": 38740 + }, + { + "epoch": 3.875, + "grad_norm": 4.977586269378662, + "learning_rate": 2.2536000000000003e-06, + "loss": 0.4775, + "step": 38750 + }, + { + "epoch": 3.876, + "grad_norm": 64.50590515136719, + "learning_rate": 2.2516e-06, + "loss": 0.6762, + "step": 38760 + }, + { + "epoch": 3.877, + "grad_norm": 41.48358917236328, + "learning_rate": 2.2496e-06, + "loss": 0.8202, + "step": 38770 + }, + { + "epoch": 3.878, + "grad_norm": 54.37251663208008, + "learning_rate": 2.2476e-06, + "loss": 0.6263, + "step": 38780 + }, + { + "epoch": 3.879, + "grad_norm": 37.51151657104492, + "learning_rate": 2.2456e-06, + "loss": 0.5234, + "step": 38790 + }, + { + "epoch": 3.88, + "grad_norm": 8.200417518615723, + "learning_rate": 2.2436000000000004e-06, + "loss": 0.4962, + "step": 38800 + }, + { + "epoch": 3.8810000000000002, + "grad_norm": 5.233102798461914, + "learning_rate": 2.2416000000000003e-06, + "loss": 0.6761, + "step": 38810 + }, + { + "epoch": 3.882, + "grad_norm": 1.343250036239624, + "learning_rate": 2.2396e-06, + "loss": 0.4689, + "step": 38820 + }, + { + "epoch": 3.883, + "grad_norm": 42.027427673339844, + "learning_rate": 2.2376e-06, + "loss": 0.8863, + "step": 38830 + }, + { + "epoch": 3.884, + "grad_norm": 7.038177967071533, + "learning_rate": 2.2356000000000002e-06, + "loss": 0.8442, + "step": 38840 + }, + { + "epoch": 3.885, + "grad_norm": 23.81315803527832, + "learning_rate": 2.2336e-06, + "loss": 0.5624, + "step": 38850 + }, + { + "epoch": 3.886, + "grad_norm": 43.30754470825195, + "learning_rate": 2.2316e-06, + "loss": 0.5765, + "step": 38860 + }, + { + "epoch": 3.887, + "grad_norm": 34.442176818847656, + "learning_rate": 2.2296000000000002e-06, + "loss": 0.9356, + "step": 38870 + }, + { + "epoch": 3.888, + "grad_norm": 41.87175369262695, + "learning_rate": 2.2276000000000005e-06, + "loss": 1.1155, + "step": 38880 + }, + { + "epoch": 3.8890000000000002, + "grad_norm": 34.54471206665039, + "learning_rate": 2.2256000000000003e-06, + "loss": 0.4604, + "step": 38890 + }, + { + "epoch": 3.89, + "grad_norm": 50.42923355102539, + "learning_rate": 2.2236e-06, + "loss": 0.6437, + "step": 38900 + }, + { + "epoch": 3.891, + "grad_norm": 39.492305755615234, + "learning_rate": 2.2216e-06, + "loss": 0.4618, + "step": 38910 + }, + { + "epoch": 3.892, + "grad_norm": 9.663975715637207, + "learning_rate": 2.2196e-06, + "loss": 0.5767, + "step": 38920 + }, + { + "epoch": 3.893, + "grad_norm": 46.55671310424805, + "learning_rate": 2.2176e-06, + "loss": 0.513, + "step": 38930 + }, + { + "epoch": 3.894, + "grad_norm": 65.58306121826172, + "learning_rate": 2.2156000000000004e-06, + "loss": 0.6941, + "step": 38940 + }, + { + "epoch": 3.895, + "grad_norm": 78.95509338378906, + "learning_rate": 2.2136000000000003e-06, + "loss": 0.7419, + "step": 38950 + }, + { + "epoch": 3.896, + "grad_norm": 16.70928955078125, + "learning_rate": 2.2116e-06, + "loss": 0.736, + "step": 38960 + }, + { + "epoch": 3.8970000000000002, + "grad_norm": 3.3220157623291016, + "learning_rate": 2.2096e-06, + "loss": 0.2733, + "step": 38970 + }, + { + "epoch": 3.898, + "grad_norm": 14.191826820373535, + "learning_rate": 2.2076000000000003e-06, + "loss": 0.5306, + "step": 38980 + }, + { + "epoch": 3.899, + "grad_norm": 35.20226287841797, + "learning_rate": 2.2056e-06, + "loss": 0.2515, + "step": 38990 + }, + { + "epoch": 3.9, + "grad_norm": 26.15009880065918, + "learning_rate": 2.2036000000000004e-06, + "loss": 0.596, + "step": 39000 + }, + { + "epoch": 3.901, + "grad_norm": 47.9098014831543, + "learning_rate": 2.2016000000000002e-06, + "loss": 0.4469, + "step": 39010 + }, + { + "epoch": 3.902, + "grad_norm": 68.36515045166016, + "learning_rate": 2.1996e-06, + "loss": 0.6961, + "step": 39020 + }, + { + "epoch": 3.903, + "grad_norm": 18.675844192504883, + "learning_rate": 2.1976000000000004e-06, + "loss": 0.4955, + "step": 39030 + }, + { + "epoch": 3.904, + "grad_norm": 2.813723087310791, + "learning_rate": 2.1956e-06, + "loss": 0.6415, + "step": 39040 + }, + { + "epoch": 3.9050000000000002, + "grad_norm": 40.3653564453125, + "learning_rate": 2.1936e-06, + "loss": 0.7362, + "step": 39050 + }, + { + "epoch": 3.906, + "grad_norm": 43.43606185913086, + "learning_rate": 2.1916e-06, + "loss": 0.7509, + "step": 39060 + }, + { + "epoch": 3.907, + "grad_norm": 40.229148864746094, + "learning_rate": 2.1896e-06, + "loss": 0.54, + "step": 39070 + }, + { + "epoch": 3.908, + "grad_norm": 2.370300531387329, + "learning_rate": 2.1876000000000005e-06, + "loss": 0.4631, + "step": 39080 + }, + { + "epoch": 3.909, + "grad_norm": 66.48926544189453, + "learning_rate": 2.1856000000000003e-06, + "loss": 0.8684, + "step": 39090 + }, + { + "epoch": 3.91, + "grad_norm": 71.65203094482422, + "learning_rate": 2.1836e-06, + "loss": 0.5653, + "step": 39100 + }, + { + "epoch": 3.911, + "grad_norm": 67.22386169433594, + "learning_rate": 2.1816e-06, + "loss": 0.577, + "step": 39110 + }, + { + "epoch": 3.912, + "grad_norm": 11.789389610290527, + "learning_rate": 2.1796e-06, + "loss": 0.6232, + "step": 39120 + }, + { + "epoch": 3.9130000000000003, + "grad_norm": 22.935331344604492, + "learning_rate": 2.1776e-06, + "loss": 0.4742, + "step": 39130 + }, + { + "epoch": 3.914, + "grad_norm": 49.52204513549805, + "learning_rate": 2.1756000000000004e-06, + "loss": 0.6813, + "step": 39140 + }, + { + "epoch": 3.915, + "grad_norm": 54.53312683105469, + "learning_rate": 2.1736000000000002e-06, + "loss": 0.6947, + "step": 39150 + }, + { + "epoch": 3.916, + "grad_norm": 23.98177146911621, + "learning_rate": 2.1716e-06, + "loss": 0.4229, + "step": 39160 + }, + { + "epoch": 3.917, + "grad_norm": 39.71488571166992, + "learning_rate": 2.1696e-06, + "loss": 0.3979, + "step": 39170 + }, + { + "epoch": 3.918, + "grad_norm": 80.45480346679688, + "learning_rate": 2.1676000000000002e-06, + "loss": 0.6924, + "step": 39180 + }, + { + "epoch": 3.919, + "grad_norm": 10.103964805603027, + "learning_rate": 2.1656e-06, + "loss": 0.7003, + "step": 39190 + }, + { + "epoch": 3.92, + "grad_norm": 27.01572036743164, + "learning_rate": 2.1636000000000003e-06, + "loss": 0.6469, + "step": 39200 + }, + { + "epoch": 3.9210000000000003, + "grad_norm": 61.562255859375, + "learning_rate": 2.1616e-06, + "loss": 0.5936, + "step": 39210 + }, + { + "epoch": 3.922, + "grad_norm": 48.38149642944336, + "learning_rate": 2.1596000000000005e-06, + "loss": 0.4467, + "step": 39220 + }, + { + "epoch": 3.923, + "grad_norm": 39.5386962890625, + "learning_rate": 2.1576000000000003e-06, + "loss": 0.4228, + "step": 39230 + }, + { + "epoch": 3.924, + "grad_norm": 30.5782527923584, + "learning_rate": 2.1556e-06, + "loss": 0.6259, + "step": 39240 + }, + { + "epoch": 3.925, + "grad_norm": 41.15312576293945, + "learning_rate": 2.1536e-06, + "loss": 0.5045, + "step": 39250 + }, + { + "epoch": 3.926, + "grad_norm": 18.565710067749023, + "learning_rate": 2.1516000000000003e-06, + "loss": 0.5476, + "step": 39260 + }, + { + "epoch": 3.927, + "grad_norm": 39.46208953857422, + "learning_rate": 2.1496e-06, + "loss": 0.3473, + "step": 39270 + }, + { + "epoch": 3.928, + "grad_norm": 28.1165771484375, + "learning_rate": 2.1478e-06, + "loss": 0.8228, + "step": 39280 + }, + { + "epoch": 3.9290000000000003, + "grad_norm": 19.74481964111328, + "learning_rate": 2.1458000000000003e-06, + "loss": 0.6381, + "step": 39290 + }, + { + "epoch": 3.93, + "grad_norm": 36.65999221801758, + "learning_rate": 2.1438e-06, + "loss": 0.7628, + "step": 39300 + }, + { + "epoch": 3.931, + "grad_norm": 28.818679809570312, + "learning_rate": 2.1418000000000004e-06, + "loss": 0.4013, + "step": 39310 + }, + { + "epoch": 3.932, + "grad_norm": 2.3065967559814453, + "learning_rate": 2.1398000000000002e-06, + "loss": 0.4747, + "step": 39320 + }, + { + "epoch": 3.933, + "grad_norm": 66.51754760742188, + "learning_rate": 2.1378e-06, + "loss": 0.59, + "step": 39330 + }, + { + "epoch": 3.934, + "grad_norm": 18.615644454956055, + "learning_rate": 2.1358e-06, + "loss": 0.4391, + "step": 39340 + }, + { + "epoch": 3.935, + "grad_norm": 37.48296356201172, + "learning_rate": 2.1338000000000002e-06, + "loss": 0.5004, + "step": 39350 + }, + { + "epoch": 3.936, + "grad_norm": 24.665191650390625, + "learning_rate": 2.1318e-06, + "loss": 0.6554, + "step": 39360 + }, + { + "epoch": 3.9370000000000003, + "grad_norm": 44.627471923828125, + "learning_rate": 2.1298000000000003e-06, + "loss": 0.5226, + "step": 39370 + }, + { + "epoch": 3.9379999999999997, + "grad_norm": 42.4708137512207, + "learning_rate": 2.1278e-06, + "loss": 0.503, + "step": 39380 + }, + { + "epoch": 3.939, + "grad_norm": 69.4271469116211, + "learning_rate": 2.1258e-06, + "loss": 0.8558, + "step": 39390 + }, + { + "epoch": 3.94, + "grad_norm": 54.04352569580078, + "learning_rate": 2.1238000000000003e-06, + "loss": 0.6508, + "step": 39400 + }, + { + "epoch": 3.941, + "grad_norm": 52.962120056152344, + "learning_rate": 2.1218e-06, + "loss": 0.4722, + "step": 39410 + }, + { + "epoch": 3.942, + "grad_norm": 55.27985382080078, + "learning_rate": 2.1198e-06, + "loss": 0.7672, + "step": 39420 + }, + { + "epoch": 3.943, + "grad_norm": 45.97237014770508, + "learning_rate": 2.1178000000000003e-06, + "loss": 0.7393, + "step": 39430 + }, + { + "epoch": 3.944, + "grad_norm": 37.7562255859375, + "learning_rate": 2.1158e-06, + "loss": 0.5531, + "step": 39440 + }, + { + "epoch": 3.945, + "grad_norm": 2.4542977809906006, + "learning_rate": 2.1138000000000004e-06, + "loss": 0.3792, + "step": 39450 + }, + { + "epoch": 3.9459999999999997, + "grad_norm": 60.25086975097656, + "learning_rate": 2.1118000000000003e-06, + "loss": 0.7512, + "step": 39460 + }, + { + "epoch": 3.947, + "grad_norm": 47.01914978027344, + "learning_rate": 2.1098e-06, + "loss": 0.5358, + "step": 39470 + }, + { + "epoch": 3.948, + "grad_norm": 7.820398807525635, + "learning_rate": 2.1078e-06, + "loss": 0.7304, + "step": 39480 + }, + { + "epoch": 3.949, + "grad_norm": 4.501594066619873, + "learning_rate": 2.1058000000000002e-06, + "loss": 0.7448, + "step": 39490 + }, + { + "epoch": 3.95, + "grad_norm": 21.622995376586914, + "learning_rate": 2.1038e-06, + "loss": 0.5995, + "step": 39500 + }, + { + "epoch": 3.951, + "grad_norm": 9.188790321350098, + "learning_rate": 2.1018000000000004e-06, + "loss": 0.4282, + "step": 39510 + }, + { + "epoch": 3.952, + "grad_norm": 46.63243103027344, + "learning_rate": 2.0998e-06, + "loss": 0.9215, + "step": 39520 + }, + { + "epoch": 3.953, + "grad_norm": 32.74306106567383, + "learning_rate": 2.0978e-06, + "loss": 0.5652, + "step": 39530 + }, + { + "epoch": 3.9539999999999997, + "grad_norm": 14.57941722869873, + "learning_rate": 2.0958e-06, + "loss": 0.4371, + "step": 39540 + }, + { + "epoch": 3.955, + "grad_norm": 31.52217674255371, + "learning_rate": 2.0938e-06, + "loss": 0.3675, + "step": 39550 + }, + { + "epoch": 3.956, + "grad_norm": 2.353820323944092, + "learning_rate": 2.0918e-06, + "loss": 0.4438, + "step": 39560 + }, + { + "epoch": 3.957, + "grad_norm": 6.106788635253906, + "learning_rate": 2.0898000000000003e-06, + "loss": 0.5282, + "step": 39570 + }, + { + "epoch": 3.958, + "grad_norm": 34.10426712036133, + "learning_rate": 2.0878e-06, + "loss": 0.3473, + "step": 39580 + }, + { + "epoch": 3.959, + "grad_norm": 3.547370672225952, + "learning_rate": 2.0858e-06, + "loss": 0.8163, + "step": 39590 + }, + { + "epoch": 3.96, + "grad_norm": 11.41083812713623, + "learning_rate": 2.0838000000000003e-06, + "loss": 0.4964, + "step": 39600 + }, + { + "epoch": 3.961, + "grad_norm": 20.70975112915039, + "learning_rate": 2.0818e-06, + "loss": 0.6299, + "step": 39610 + }, + { + "epoch": 3.9619999999999997, + "grad_norm": 40.6129264831543, + "learning_rate": 2.0798e-06, + "loss": 0.6077, + "step": 39620 + }, + { + "epoch": 3.963, + "grad_norm": 49.65641403198242, + "learning_rate": 2.0778000000000002e-06, + "loss": 0.7575, + "step": 39630 + }, + { + "epoch": 3.964, + "grad_norm": 43.09720230102539, + "learning_rate": 2.0758e-06, + "loss": 0.5197, + "step": 39640 + }, + { + "epoch": 3.965, + "grad_norm": 4.485574245452881, + "learning_rate": 2.0738000000000004e-06, + "loss": 0.8253, + "step": 39650 + }, + { + "epoch": 3.966, + "grad_norm": 24.993141174316406, + "learning_rate": 2.0718000000000002e-06, + "loss": 0.5464, + "step": 39660 + }, + { + "epoch": 3.967, + "grad_norm": 63.31475830078125, + "learning_rate": 2.0698e-06, + "loss": 0.7333, + "step": 39670 + }, + { + "epoch": 3.968, + "grad_norm": 53.776859283447266, + "learning_rate": 2.0678e-06, + "loss": 0.5568, + "step": 39680 + }, + { + "epoch": 3.969, + "grad_norm": 18.64484214782715, + "learning_rate": 2.0658e-06, + "loss": 0.3912, + "step": 39690 + }, + { + "epoch": 3.9699999999999998, + "grad_norm": 68.29088592529297, + "learning_rate": 2.0638e-06, + "loss": 0.5753, + "step": 39700 + }, + { + "epoch": 3.971, + "grad_norm": 53.469207763671875, + "learning_rate": 2.0618000000000003e-06, + "loss": 0.4962, + "step": 39710 + }, + { + "epoch": 3.972, + "grad_norm": 22.82310676574707, + "learning_rate": 2.0598e-06, + "loss": 0.7009, + "step": 39720 + }, + { + "epoch": 3.973, + "grad_norm": 47.31079864501953, + "learning_rate": 2.0578e-06, + "loss": 0.8921, + "step": 39730 + }, + { + "epoch": 3.974, + "grad_norm": 12.895943641662598, + "learning_rate": 2.0558000000000003e-06, + "loss": 0.736, + "step": 39740 + }, + { + "epoch": 3.975, + "grad_norm": 21.22369384765625, + "learning_rate": 2.0538e-06, + "loss": 0.4136, + "step": 39750 + }, + { + "epoch": 3.976, + "grad_norm": 1.816238522529602, + "learning_rate": 2.0518e-06, + "loss": 0.3451, + "step": 39760 + }, + { + "epoch": 3.977, + "grad_norm": 22.588109970092773, + "learning_rate": 2.0498000000000003e-06, + "loss": 0.7551, + "step": 39770 + }, + { + "epoch": 3.9779999999999998, + "grad_norm": 15.299209594726562, + "learning_rate": 2.0478e-06, + "loss": 0.79, + "step": 39780 + }, + { + "epoch": 3.979, + "grad_norm": 49.50896453857422, + "learning_rate": 2.0458000000000004e-06, + "loss": 0.6644, + "step": 39790 + }, + { + "epoch": 3.98, + "grad_norm": 21.569507598876953, + "learning_rate": 2.0438000000000002e-06, + "loss": 0.6802, + "step": 39800 + }, + { + "epoch": 3.981, + "grad_norm": 45.40610885620117, + "learning_rate": 2.0418e-06, + "loss": 1.0853, + "step": 39810 + }, + { + "epoch": 3.982, + "grad_norm": 17.77871322631836, + "learning_rate": 2.0398e-06, + "loss": 0.4756, + "step": 39820 + }, + { + "epoch": 3.983, + "grad_norm": 33.64976119995117, + "learning_rate": 2.0378e-06, + "loss": 0.9198, + "step": 39830 + }, + { + "epoch": 3.984, + "grad_norm": 51.46430587768555, + "learning_rate": 2.0358000000000005e-06, + "loss": 0.7289, + "step": 39840 + }, + { + "epoch": 3.985, + "grad_norm": 49.01710510253906, + "learning_rate": 2.0338000000000003e-06, + "loss": 0.7091, + "step": 39850 + }, + { + "epoch": 3.9859999999999998, + "grad_norm": 7.222930431365967, + "learning_rate": 2.0318e-06, + "loss": 0.5029, + "step": 39860 + }, + { + "epoch": 3.987, + "grad_norm": 50.691001892089844, + "learning_rate": 2.0298e-06, + "loss": 0.8107, + "step": 39870 + }, + { + "epoch": 3.988, + "grad_norm": 6.308470249176025, + "learning_rate": 2.0278e-06, + "loss": 0.4074, + "step": 39880 + }, + { + "epoch": 3.989, + "grad_norm": 12.49143123626709, + "learning_rate": 2.0258e-06, + "loss": 0.6216, + "step": 39890 + }, + { + "epoch": 3.99, + "grad_norm": 44.38469696044922, + "learning_rate": 2.0238e-06, + "loss": 0.6535, + "step": 39900 + }, + { + "epoch": 3.991, + "grad_norm": 26.98875617980957, + "learning_rate": 2.0218000000000003e-06, + "loss": 0.3996, + "step": 39910 + }, + { + "epoch": 3.992, + "grad_norm": 78.4300537109375, + "learning_rate": 2.0198e-06, + "loss": 0.5802, + "step": 39920 + }, + { + "epoch": 3.993, + "grad_norm": 49.365081787109375, + "learning_rate": 2.0178e-06, + "loss": 0.6089, + "step": 39930 + }, + { + "epoch": 3.9939999999999998, + "grad_norm": 49.79063415527344, + "learning_rate": 2.0158000000000002e-06, + "loss": 0.7299, + "step": 39940 + }, + { + "epoch": 3.995, + "grad_norm": 56.70253372192383, + "learning_rate": 2.0138e-06, + "loss": 0.5325, + "step": 39950 + }, + { + "epoch": 3.996, + "grad_norm": 35.089202880859375, + "learning_rate": 2.0118e-06, + "loss": 0.5027, + "step": 39960 + }, + { + "epoch": 3.997, + "grad_norm": 31.567665100097656, + "learning_rate": 2.0098000000000002e-06, + "loss": 0.7014, + "step": 39970 + }, + { + "epoch": 3.998, + "grad_norm": 14.97442626953125, + "learning_rate": 2.0078e-06, + "loss": 0.6241, + "step": 39980 + }, + { + "epoch": 3.999, + "grad_norm": 20.558170318603516, + "learning_rate": 2.0058000000000003e-06, + "loss": 0.6377, + "step": 39990 + }, + { + "epoch": 4.0, + "grad_norm": 25.69771385192871, + "learning_rate": 2.0038e-06, + "loss": 0.5495, + "step": 40000 + }, + { + "epoch": 4.001, + "grad_norm": 2.9470462799072266, + "learning_rate": 2.0018e-06, + "loss": 0.624, + "step": 40010 + }, + { + "epoch": 4.002, + "grad_norm": 7.803587436676025, + "learning_rate": 1.9998e-06, + "loss": 0.6188, + "step": 40020 + }, + { + "epoch": 4.003, + "grad_norm": 58.450340270996094, + "learning_rate": 1.9978e-06, + "loss": 0.6628, + "step": 40030 + }, + { + "epoch": 4.004, + "grad_norm": 2.4178032875061035, + "learning_rate": 1.9958000000000004e-06, + "loss": 0.3826, + "step": 40040 + }, + { + "epoch": 4.005, + "grad_norm": 28.441543579101562, + "learning_rate": 1.9938000000000003e-06, + "loss": 0.6182, + "step": 40050 + }, + { + "epoch": 4.006, + "grad_norm": 4.209710121154785, + "learning_rate": 1.9918e-06, + "loss": 0.5498, + "step": 40060 + }, + { + "epoch": 4.007, + "grad_norm": 6.908935070037842, + "learning_rate": 1.9898e-06, + "loss": 0.6298, + "step": 40070 + }, + { + "epoch": 4.008, + "grad_norm": 49.947784423828125, + "learning_rate": 1.9878000000000003e-06, + "loss": 0.6013, + "step": 40080 + }, + { + "epoch": 4.009, + "grad_norm": 32.38127517700195, + "learning_rate": 1.9858e-06, + "loss": 0.3015, + "step": 40090 + }, + { + "epoch": 4.01, + "grad_norm": 47.0320930480957, + "learning_rate": 1.9838e-06, + "loss": 0.5804, + "step": 40100 + }, + { + "epoch": 4.011, + "grad_norm": 56.43113708496094, + "learning_rate": 1.9818000000000002e-06, + "loss": 0.7144, + "step": 40110 + }, + { + "epoch": 4.012, + "grad_norm": 41.424720764160156, + "learning_rate": 1.9798e-06, + "loss": 0.6213, + "step": 40120 + }, + { + "epoch": 4.013, + "grad_norm": 35.13981628417969, + "learning_rate": 1.9778000000000004e-06, + "loss": 0.5925, + "step": 40130 + }, + { + "epoch": 4.014, + "grad_norm": 23.2755126953125, + "learning_rate": 1.9758e-06, + "loss": 0.7576, + "step": 40140 + }, + { + "epoch": 4.015, + "grad_norm": 41.35113525390625, + "learning_rate": 1.9738e-06, + "loss": 0.7327, + "step": 40150 + }, + { + "epoch": 4.016, + "grad_norm": 33.476600646972656, + "learning_rate": 1.9718e-06, + "loss": 0.5042, + "step": 40160 + }, + { + "epoch": 4.017, + "grad_norm": 47.169395446777344, + "learning_rate": 1.9698e-06, + "loss": 0.7718, + "step": 40170 + }, + { + "epoch": 4.018, + "grad_norm": 34.22056198120117, + "learning_rate": 1.9678000000000005e-06, + "loss": 0.6289, + "step": 40180 + }, + { + "epoch": 4.019, + "grad_norm": 14.578975677490234, + "learning_rate": 1.9658000000000003e-06, + "loss": 0.4055, + "step": 40190 + }, + { + "epoch": 4.02, + "grad_norm": 18.142715454101562, + "learning_rate": 1.9638e-06, + "loss": 0.4437, + "step": 40200 + }, + { + "epoch": 4.021, + "grad_norm": 56.41261291503906, + "learning_rate": 1.9618e-06, + "loss": 0.684, + "step": 40210 + }, + { + "epoch": 4.022, + "grad_norm": 95.6523666381836, + "learning_rate": 1.9598e-06, + "loss": 0.8494, + "step": 40220 + }, + { + "epoch": 4.023, + "grad_norm": 21.764108657836914, + "learning_rate": 1.9578e-06, + "loss": 0.501, + "step": 40230 + }, + { + "epoch": 4.024, + "grad_norm": 26.407747268676758, + "learning_rate": 1.9558000000000004e-06, + "loss": 0.5673, + "step": 40240 + }, + { + "epoch": 4.025, + "grad_norm": 37.53788757324219, + "learning_rate": 1.9538000000000003e-06, + "loss": 0.5227, + "step": 40250 + }, + { + "epoch": 4.026, + "grad_norm": 24.256620407104492, + "learning_rate": 1.9518e-06, + "loss": 0.4052, + "step": 40260 + }, + { + "epoch": 4.027, + "grad_norm": 39.48225021362305, + "learning_rate": 1.9498e-06, + "loss": 0.405, + "step": 40270 + }, + { + "epoch": 4.028, + "grad_norm": 14.27397346496582, + "learning_rate": 1.9478000000000002e-06, + "loss": 0.6717, + "step": 40280 + }, + { + "epoch": 4.029, + "grad_norm": 2.7721076011657715, + "learning_rate": 1.9458e-06, + "loss": 0.2324, + "step": 40290 + }, + { + "epoch": 4.03, + "grad_norm": 22.514490127563477, + "learning_rate": 1.9438e-06, + "loss": 0.7317, + "step": 40300 + }, + { + "epoch": 4.031, + "grad_norm": 41.21731948852539, + "learning_rate": 1.9418e-06, + "loss": 0.4756, + "step": 40310 + }, + { + "epoch": 4.032, + "grad_norm": 27.539142608642578, + "learning_rate": 1.9398000000000005e-06, + "loss": 0.4482, + "step": 40320 + }, + { + "epoch": 4.033, + "grad_norm": 11.524066925048828, + "learning_rate": 1.9378000000000003e-06, + "loss": 0.4907, + "step": 40330 + }, + { + "epoch": 4.034, + "grad_norm": 30.499555587768555, + "learning_rate": 1.9358e-06, + "loss": 0.4553, + "step": 40340 + }, + { + "epoch": 4.035, + "grad_norm": 1.9976459741592407, + "learning_rate": 1.9338e-06, + "loss": 0.4783, + "step": 40350 + }, + { + "epoch": 4.036, + "grad_norm": 29.816242218017578, + "learning_rate": 1.9318e-06, + "loss": 0.6985, + "step": 40360 + }, + { + "epoch": 4.037, + "grad_norm": 46.56698989868164, + "learning_rate": 1.9298e-06, + "loss": 0.5885, + "step": 40370 + }, + { + "epoch": 4.038, + "grad_norm": 52.0035514831543, + "learning_rate": 1.9278000000000004e-06, + "loss": 0.6332, + "step": 40380 + }, + { + "epoch": 4.039, + "grad_norm": 40.463199615478516, + "learning_rate": 1.9258000000000003e-06, + "loss": 0.8216, + "step": 40390 + }, + { + "epoch": 4.04, + "grad_norm": 7.643418788909912, + "learning_rate": 1.9238e-06, + "loss": 0.578, + "step": 40400 + }, + { + "epoch": 4.041, + "grad_norm": 4.566254615783691, + "learning_rate": 1.9218e-06, + "loss": 0.4617, + "step": 40410 + }, + { + "epoch": 4.042, + "grad_norm": 6.617149353027344, + "learning_rate": 1.9198000000000002e-06, + "loss": 0.4563, + "step": 40420 + }, + { + "epoch": 4.043, + "grad_norm": 12.791312217712402, + "learning_rate": 1.9178e-06, + "loss": 0.5765, + "step": 40430 + }, + { + "epoch": 4.044, + "grad_norm": 4.998269081115723, + "learning_rate": 1.9158000000000004e-06, + "loss": 0.3831, + "step": 40440 + }, + { + "epoch": 4.045, + "grad_norm": 100.47882080078125, + "learning_rate": 1.9138e-06, + "loss": 0.7214, + "step": 40450 + }, + { + "epoch": 4.046, + "grad_norm": 30.22802734375, + "learning_rate": 1.9118e-06, + "loss": 0.6184, + "step": 40460 + }, + { + "epoch": 4.047, + "grad_norm": 25.04363441467285, + "learning_rate": 1.9098000000000003e-06, + "loss": 0.5158, + "step": 40470 + }, + { + "epoch": 4.048, + "grad_norm": 21.190914154052734, + "learning_rate": 1.9078e-06, + "loss": 0.5553, + "step": 40480 + }, + { + "epoch": 4.049, + "grad_norm": 68.58704376220703, + "learning_rate": 1.9058000000000002e-06, + "loss": 0.4504, + "step": 40490 + }, + { + "epoch": 4.05, + "grad_norm": 59.41545486450195, + "learning_rate": 1.9038e-06, + "loss": 0.33, + "step": 40500 + }, + { + "epoch": 4.051, + "grad_norm": 44.25145721435547, + "learning_rate": 1.9018e-06, + "loss": 0.5571, + "step": 40510 + }, + { + "epoch": 4.052, + "grad_norm": 47.63776779174805, + "learning_rate": 1.8998000000000002e-06, + "loss": 0.6659, + "step": 40520 + }, + { + "epoch": 4.053, + "grad_norm": 3.7169432640075684, + "learning_rate": 1.8978000000000003e-06, + "loss": 0.6236, + "step": 40530 + }, + { + "epoch": 4.054, + "grad_norm": 28.970352172851562, + "learning_rate": 1.8958000000000001e-06, + "loss": 0.371, + "step": 40540 + }, + { + "epoch": 4.055, + "grad_norm": 20.295238494873047, + "learning_rate": 1.8938e-06, + "loss": 0.2769, + "step": 40550 + }, + { + "epoch": 4.056, + "grad_norm": 55.8317756652832, + "learning_rate": 1.8918e-06, + "loss": 0.4959, + "step": 40560 + }, + { + "epoch": 4.057, + "grad_norm": 42.73490524291992, + "learning_rate": 1.8898000000000003e-06, + "loss": 0.5813, + "step": 40570 + }, + { + "epoch": 4.058, + "grad_norm": 82.32331848144531, + "learning_rate": 1.8878000000000002e-06, + "loss": 0.7, + "step": 40580 + }, + { + "epoch": 4.059, + "grad_norm": 61.270931243896484, + "learning_rate": 1.8858000000000002e-06, + "loss": 0.5079, + "step": 40590 + }, + { + "epoch": 4.06, + "grad_norm": 2.542825937271118, + "learning_rate": 1.8838e-06, + "loss": 0.4273, + "step": 40600 + }, + { + "epoch": 4.061, + "grad_norm": 35.910396575927734, + "learning_rate": 1.8818e-06, + "loss": 0.6405, + "step": 40610 + }, + { + "epoch": 4.062, + "grad_norm": 67.07048797607422, + "learning_rate": 1.8798000000000002e-06, + "loss": 0.2777, + "step": 40620 + }, + { + "epoch": 4.063, + "grad_norm": 42.202327728271484, + "learning_rate": 1.8778000000000003e-06, + "loss": 0.4228, + "step": 40630 + }, + { + "epoch": 4.064, + "grad_norm": 34.79640579223633, + "learning_rate": 1.8758000000000001e-06, + "loss": 0.8618, + "step": 40640 + }, + { + "epoch": 4.065, + "grad_norm": 19.536710739135742, + "learning_rate": 1.8738000000000002e-06, + "loss": 0.5667, + "step": 40650 + }, + { + "epoch": 4.066, + "grad_norm": 45.35587692260742, + "learning_rate": 1.8718000000000002e-06, + "loss": 0.7089, + "step": 40660 + }, + { + "epoch": 4.067, + "grad_norm": 56.838645935058594, + "learning_rate": 1.8698000000000003e-06, + "loss": 0.5213, + "step": 40670 + }, + { + "epoch": 4.068, + "grad_norm": 33.80918884277344, + "learning_rate": 1.8678000000000001e-06, + "loss": 0.8448, + "step": 40680 + }, + { + "epoch": 4.069, + "grad_norm": 10.72282600402832, + "learning_rate": 1.8658000000000002e-06, + "loss": 0.5146, + "step": 40690 + }, + { + "epoch": 4.07, + "grad_norm": 31.536436080932617, + "learning_rate": 1.8638e-06, + "loss": 0.4951, + "step": 40700 + }, + { + "epoch": 4.071, + "grad_norm": 51.59791946411133, + "learning_rate": 1.8618000000000003e-06, + "loss": 0.4358, + "step": 40710 + }, + { + "epoch": 4.072, + "grad_norm": 71.39259338378906, + "learning_rate": 1.8598000000000002e-06, + "loss": 0.6669, + "step": 40720 + }, + { + "epoch": 4.073, + "grad_norm": 13.469985008239746, + "learning_rate": 1.8578000000000002e-06, + "loss": 0.5264, + "step": 40730 + }, + { + "epoch": 4.074, + "grad_norm": 45.90078353881836, + "learning_rate": 1.8558e-06, + "loss": 0.9989, + "step": 40740 + }, + { + "epoch": 4.075, + "grad_norm": 71.98507690429688, + "learning_rate": 1.8538000000000001e-06, + "loss": 0.8605, + "step": 40750 + }, + { + "epoch": 4.076, + "grad_norm": 42.841304779052734, + "learning_rate": 1.8518000000000002e-06, + "loss": 0.7456, + "step": 40760 + }, + { + "epoch": 4.077, + "grad_norm": 43.334720611572266, + "learning_rate": 1.8498000000000003e-06, + "loss": 0.6266, + "step": 40770 + }, + { + "epoch": 4.078, + "grad_norm": 37.730350494384766, + "learning_rate": 1.8478000000000001e-06, + "loss": 0.5659, + "step": 40780 + }, + { + "epoch": 4.079, + "grad_norm": 40.119144439697266, + "learning_rate": 1.8458000000000002e-06, + "loss": 0.4387, + "step": 40790 + }, + { + "epoch": 4.08, + "grad_norm": 3.083294630050659, + "learning_rate": 1.8438e-06, + "loss": 0.6261, + "step": 40800 + }, + { + "epoch": 4.081, + "grad_norm": 28.532621383666992, + "learning_rate": 1.8418000000000003e-06, + "loss": 0.5223, + "step": 40810 + }, + { + "epoch": 4.082, + "grad_norm": 41.42976379394531, + "learning_rate": 1.8398000000000002e-06, + "loss": 0.6217, + "step": 40820 + }, + { + "epoch": 4.083, + "grad_norm": 17.437150955200195, + "learning_rate": 1.8378000000000002e-06, + "loss": 0.7407, + "step": 40830 + }, + { + "epoch": 4.084, + "grad_norm": 20.878398895263672, + "learning_rate": 1.8358e-06, + "loss": 0.6634, + "step": 40840 + }, + { + "epoch": 4.085, + "grad_norm": 74.67936706542969, + "learning_rate": 1.8338000000000001e-06, + "loss": 0.8132, + "step": 40850 + }, + { + "epoch": 4.086, + "grad_norm": 24.71967124938965, + "learning_rate": 1.8318000000000002e-06, + "loss": 0.8061, + "step": 40860 + }, + { + "epoch": 4.087, + "grad_norm": 76.58692932128906, + "learning_rate": 1.8298000000000003e-06, + "loss": 0.5159, + "step": 40870 + }, + { + "epoch": 4.088, + "grad_norm": 49.60830307006836, + "learning_rate": 1.8278e-06, + "loss": 0.5036, + "step": 40880 + }, + { + "epoch": 4.089, + "grad_norm": 33.77963638305664, + "learning_rate": 1.8258000000000002e-06, + "loss": 0.6621, + "step": 40890 + }, + { + "epoch": 4.09, + "grad_norm": 25.869617462158203, + "learning_rate": 1.8238e-06, + "loss": 0.7384, + "step": 40900 + }, + { + "epoch": 4.091, + "grad_norm": 35.43751907348633, + "learning_rate": 1.8218000000000003e-06, + "loss": 0.7112, + "step": 40910 + }, + { + "epoch": 4.092, + "grad_norm": 66.21663665771484, + "learning_rate": 1.8198000000000001e-06, + "loss": 0.5357, + "step": 40920 + }, + { + "epoch": 4.093, + "grad_norm": 18.179227828979492, + "learning_rate": 1.8178000000000002e-06, + "loss": 0.4763, + "step": 40930 + }, + { + "epoch": 4.094, + "grad_norm": 17.802780151367188, + "learning_rate": 1.8158e-06, + "loss": 0.5537, + "step": 40940 + }, + { + "epoch": 4.095, + "grad_norm": 53.57586669921875, + "learning_rate": 1.8138000000000003e-06, + "loss": 0.6408, + "step": 40950 + }, + { + "epoch": 4.096, + "grad_norm": 51.94672775268555, + "learning_rate": 1.8118000000000002e-06, + "loss": 0.7398, + "step": 40960 + }, + { + "epoch": 4.097, + "grad_norm": 15.93326473236084, + "learning_rate": 1.8098000000000002e-06, + "loss": 0.6094, + "step": 40970 + }, + { + "epoch": 4.098, + "grad_norm": 27.715545654296875, + "learning_rate": 1.8078e-06, + "loss": 0.4746, + "step": 40980 + }, + { + "epoch": 4.099, + "grad_norm": 15.6454496383667, + "learning_rate": 1.8058000000000001e-06, + "loss": 0.5093, + "step": 40990 + }, + { + "epoch": 4.1, + "grad_norm": 26.22260093688965, + "learning_rate": 1.8038000000000002e-06, + "loss": 0.6028, + "step": 41000 + }, + { + "epoch": 4.101, + "grad_norm": 1.7048978805541992, + "learning_rate": 1.8018000000000003e-06, + "loss": 0.7598, + "step": 41010 + }, + { + "epoch": 4.102, + "grad_norm": 31.586240768432617, + "learning_rate": 1.7998000000000001e-06, + "loss": 0.4546, + "step": 41020 + }, + { + "epoch": 4.103, + "grad_norm": 63.79600143432617, + "learning_rate": 1.7978000000000002e-06, + "loss": 0.9011, + "step": 41030 + }, + { + "epoch": 4.104, + "grad_norm": 55.758968353271484, + "learning_rate": 1.7958e-06, + "loss": 0.5311, + "step": 41040 + }, + { + "epoch": 4.105, + "grad_norm": 13.822869300842285, + "learning_rate": 1.7938000000000003e-06, + "loss": 0.6063, + "step": 41050 + }, + { + "epoch": 4.106, + "grad_norm": 24.2291202545166, + "learning_rate": 1.7918000000000001e-06, + "loss": 0.662, + "step": 41060 + }, + { + "epoch": 4.107, + "grad_norm": 3.7248871326446533, + "learning_rate": 1.7898000000000002e-06, + "loss": 0.7849, + "step": 41070 + }, + { + "epoch": 4.108, + "grad_norm": 89.2628173828125, + "learning_rate": 1.7878e-06, + "loss": 0.6894, + "step": 41080 + }, + { + "epoch": 4.109, + "grad_norm": 41.21138381958008, + "learning_rate": 1.7858000000000001e-06, + "loss": 0.6289, + "step": 41090 + }, + { + "epoch": 4.11, + "grad_norm": 43.779544830322266, + "learning_rate": 1.7838000000000002e-06, + "loss": 0.7203, + "step": 41100 + }, + { + "epoch": 4.111, + "grad_norm": 57.70969772338867, + "learning_rate": 1.7818000000000002e-06, + "loss": 0.5379, + "step": 41110 + }, + { + "epoch": 4.112, + "grad_norm": 18.52546501159668, + "learning_rate": 1.7798e-06, + "loss": 0.6473, + "step": 41120 + }, + { + "epoch": 4.113, + "grad_norm": 27.89804458618164, + "learning_rate": 1.7778000000000002e-06, + "loss": 0.5624, + "step": 41130 + }, + { + "epoch": 4.114, + "grad_norm": 29.954504013061523, + "learning_rate": 1.7758e-06, + "loss": 0.6228, + "step": 41140 + }, + { + "epoch": 4.115, + "grad_norm": 13.705012321472168, + "learning_rate": 1.7738000000000003e-06, + "loss": 0.4084, + "step": 41150 + }, + { + "epoch": 4.116, + "grad_norm": 3.7398502826690674, + "learning_rate": 1.7718000000000001e-06, + "loss": 0.3762, + "step": 41160 + }, + { + "epoch": 4.117, + "grad_norm": 20.722627639770508, + "learning_rate": 1.7698000000000002e-06, + "loss": 0.4139, + "step": 41170 + }, + { + "epoch": 4.118, + "grad_norm": 6.001708507537842, + "learning_rate": 1.7678e-06, + "loss": 0.6918, + "step": 41180 + }, + { + "epoch": 4.119, + "grad_norm": 47.202796936035156, + "learning_rate": 1.7658e-06, + "loss": 0.3182, + "step": 41190 + }, + { + "epoch": 4.12, + "grad_norm": 59.22598648071289, + "learning_rate": 1.7638000000000002e-06, + "loss": 0.4179, + "step": 41200 + }, + { + "epoch": 4.121, + "grad_norm": 25.26884651184082, + "learning_rate": 1.7618000000000002e-06, + "loss": 0.4623, + "step": 41210 + }, + { + "epoch": 4.122, + "grad_norm": 21.561010360717773, + "learning_rate": 1.7598e-06, + "loss": 0.4474, + "step": 41220 + }, + { + "epoch": 4.123, + "grad_norm": 38.01866149902344, + "learning_rate": 1.7578000000000001e-06, + "loss": 0.6089, + "step": 41230 + }, + { + "epoch": 4.124, + "grad_norm": 23.74958038330078, + "learning_rate": 1.7558000000000002e-06, + "loss": 0.5146, + "step": 41240 + }, + { + "epoch": 4.125, + "grad_norm": 42.2169075012207, + "learning_rate": 1.7538000000000003e-06, + "loss": 0.6283, + "step": 41250 + }, + { + "epoch": 4.126, + "grad_norm": 30.261451721191406, + "learning_rate": 1.7518000000000001e-06, + "loss": 0.3849, + "step": 41260 + }, + { + "epoch": 4.127, + "grad_norm": 16.222307205200195, + "learning_rate": 1.7498000000000002e-06, + "loss": 0.477, + "step": 41270 + }, + { + "epoch": 4.128, + "grad_norm": 54.456966400146484, + "learning_rate": 1.7478e-06, + "loss": 0.9572, + "step": 41280 + }, + { + "epoch": 4.129, + "grad_norm": 8.039139747619629, + "learning_rate": 1.7458000000000003e-06, + "loss": 0.5364, + "step": 41290 + }, + { + "epoch": 4.13, + "grad_norm": 24.38113784790039, + "learning_rate": 1.7438000000000001e-06, + "loss": 0.7153, + "step": 41300 + }, + { + "epoch": 4.131, + "grad_norm": 53.22614669799805, + "learning_rate": 1.7418000000000002e-06, + "loss": 0.6344, + "step": 41310 + }, + { + "epoch": 4.132, + "grad_norm": 38.1357536315918, + "learning_rate": 1.7398e-06, + "loss": 0.7408, + "step": 41320 + }, + { + "epoch": 4.133, + "grad_norm": 5.696017265319824, + "learning_rate": 1.7378000000000001e-06, + "loss": 0.4738, + "step": 41330 + }, + { + "epoch": 4.134, + "grad_norm": 43.506187438964844, + "learning_rate": 1.7358000000000002e-06, + "loss": 0.6412, + "step": 41340 + }, + { + "epoch": 4.135, + "grad_norm": 19.607364654541016, + "learning_rate": 1.7338000000000002e-06, + "loss": 0.4156, + "step": 41350 + }, + { + "epoch": 4.136, + "grad_norm": 9.471464157104492, + "learning_rate": 1.7318e-06, + "loss": 0.5737, + "step": 41360 + }, + { + "epoch": 4.1370000000000005, + "grad_norm": 44.34270095825195, + "learning_rate": 1.7298000000000002e-06, + "loss": 0.7362, + "step": 41370 + }, + { + "epoch": 4.138, + "grad_norm": 17.764965057373047, + "learning_rate": 1.7278e-06, + "loss": 0.3977, + "step": 41380 + }, + { + "epoch": 4.139, + "grad_norm": 17.33642578125, + "learning_rate": 1.7258000000000003e-06, + "loss": 0.5518, + "step": 41390 + }, + { + "epoch": 4.14, + "grad_norm": 56.447975158691406, + "learning_rate": 1.7238000000000001e-06, + "loss": 0.7362, + "step": 41400 + }, + { + "epoch": 4.141, + "grad_norm": 40.42461395263672, + "learning_rate": 1.7218000000000002e-06, + "loss": 0.4879, + "step": 41410 + }, + { + "epoch": 4.142, + "grad_norm": 44.50244140625, + "learning_rate": 1.7198e-06, + "loss": 0.4516, + "step": 41420 + }, + { + "epoch": 4.143, + "grad_norm": 60.044456481933594, + "learning_rate": 1.7178e-06, + "loss": 0.7069, + "step": 41430 + }, + { + "epoch": 4.144, + "grad_norm": 9.243979454040527, + "learning_rate": 1.7158000000000002e-06, + "loss": 0.5722, + "step": 41440 + }, + { + "epoch": 4.145, + "grad_norm": 29.480884552001953, + "learning_rate": 1.7138000000000002e-06, + "loss": 0.5234, + "step": 41450 + }, + { + "epoch": 4.146, + "grad_norm": 23.5086669921875, + "learning_rate": 1.7118e-06, + "loss": 0.5544, + "step": 41460 + }, + { + "epoch": 4.147, + "grad_norm": 34.623836517333984, + "learning_rate": 1.7098000000000001e-06, + "loss": 0.6038, + "step": 41470 + }, + { + "epoch": 4.148, + "grad_norm": 14.842521667480469, + "learning_rate": 1.7078e-06, + "loss": 0.7382, + "step": 41480 + }, + { + "epoch": 4.149, + "grad_norm": 38.99134826660156, + "learning_rate": 1.7058000000000003e-06, + "loss": 0.6176, + "step": 41490 + }, + { + "epoch": 4.15, + "grad_norm": 20.720136642456055, + "learning_rate": 1.7038e-06, + "loss": 0.7344, + "step": 41500 + }, + { + "epoch": 4.151, + "grad_norm": 42.731204986572266, + "learning_rate": 1.7018000000000002e-06, + "loss": 0.8725, + "step": 41510 + }, + { + "epoch": 4.152, + "grad_norm": 35.921173095703125, + "learning_rate": 1.6998e-06, + "loss": 0.7057, + "step": 41520 + }, + { + "epoch": 4.153, + "grad_norm": 33.33821105957031, + "learning_rate": 1.6978e-06, + "loss": 0.6767, + "step": 41530 + }, + { + "epoch": 4.154, + "grad_norm": 51.7625617980957, + "learning_rate": 1.6958000000000001e-06, + "loss": 0.5458, + "step": 41540 + }, + { + "epoch": 4.155, + "grad_norm": 48.38890838623047, + "learning_rate": 1.6938000000000002e-06, + "loss": 0.4643, + "step": 41550 + }, + { + "epoch": 4.156, + "grad_norm": 3.2673356533050537, + "learning_rate": 1.6918e-06, + "loss": 0.7652, + "step": 41560 + }, + { + "epoch": 4.157, + "grad_norm": 3.0207319259643555, + "learning_rate": 1.6898000000000001e-06, + "loss": 0.3428, + "step": 41570 + }, + { + "epoch": 4.158, + "grad_norm": 8.048981666564941, + "learning_rate": 1.6878000000000002e-06, + "loss": 0.674, + "step": 41580 + }, + { + "epoch": 4.159, + "grad_norm": 41.031864166259766, + "learning_rate": 1.6858000000000002e-06, + "loss": 0.6165, + "step": 41590 + }, + { + "epoch": 4.16, + "grad_norm": 20.935832977294922, + "learning_rate": 1.6838e-06, + "loss": 0.3379, + "step": 41600 + }, + { + "epoch": 4.161, + "grad_norm": 5.029835224151611, + "learning_rate": 1.6818000000000001e-06, + "loss": 0.3947, + "step": 41610 + }, + { + "epoch": 4.162, + "grad_norm": 36.44365692138672, + "learning_rate": 1.6798e-06, + "loss": 0.6268, + "step": 41620 + }, + { + "epoch": 4.163, + "grad_norm": 13.238295555114746, + "learning_rate": 1.6778000000000003e-06, + "loss": 0.8674, + "step": 41630 + }, + { + "epoch": 4.164, + "grad_norm": 30.991918563842773, + "learning_rate": 1.6758000000000001e-06, + "loss": 0.6755, + "step": 41640 + }, + { + "epoch": 4.165, + "grad_norm": 14.726704597473145, + "learning_rate": 1.6738000000000002e-06, + "loss": 0.3431, + "step": 41650 + }, + { + "epoch": 4.166, + "grad_norm": 43.18964385986328, + "learning_rate": 1.6718e-06, + "loss": 0.9822, + "step": 41660 + }, + { + "epoch": 4.167, + "grad_norm": 7.514678478240967, + "learning_rate": 1.6698e-06, + "loss": 0.3684, + "step": 41670 + }, + { + "epoch": 4.168, + "grad_norm": 42.86118698120117, + "learning_rate": 1.6678000000000002e-06, + "loss": 0.6661, + "step": 41680 + }, + { + "epoch": 4.169, + "grad_norm": 46.86203384399414, + "learning_rate": 1.6658000000000002e-06, + "loss": 0.3551, + "step": 41690 + }, + { + "epoch": 4.17, + "grad_norm": 49.551971435546875, + "learning_rate": 1.6638e-06, + "loss": 0.7988, + "step": 41700 + }, + { + "epoch": 4.171, + "grad_norm": 48.88741683959961, + "learning_rate": 1.6618000000000001e-06, + "loss": 0.4471, + "step": 41710 + }, + { + "epoch": 4.172, + "grad_norm": 11.72259521484375, + "learning_rate": 1.6598e-06, + "loss": 0.4281, + "step": 41720 + }, + { + "epoch": 4.173, + "grad_norm": 8.203263282775879, + "learning_rate": 1.6578000000000002e-06, + "loss": 0.3705, + "step": 41730 + }, + { + "epoch": 4.174, + "grad_norm": 87.56128692626953, + "learning_rate": 1.6558e-06, + "loss": 0.8434, + "step": 41740 + }, + { + "epoch": 4.175, + "grad_norm": 34.857051849365234, + "learning_rate": 1.6538000000000002e-06, + "loss": 0.5267, + "step": 41750 + }, + { + "epoch": 4.176, + "grad_norm": 7.201389789581299, + "learning_rate": 1.6518e-06, + "loss": 0.3331, + "step": 41760 + }, + { + "epoch": 4.177, + "grad_norm": 26.428733825683594, + "learning_rate": 1.6498e-06, + "loss": 0.616, + "step": 41770 + }, + { + "epoch": 4.178, + "grad_norm": 39.79397201538086, + "learning_rate": 1.6478000000000001e-06, + "loss": 0.4973, + "step": 41780 + }, + { + "epoch": 4.179, + "grad_norm": 41.15972900390625, + "learning_rate": 1.6458000000000002e-06, + "loss": 0.4637, + "step": 41790 + }, + { + "epoch": 4.18, + "grad_norm": 91.5743179321289, + "learning_rate": 1.6438e-06, + "loss": 0.7188, + "step": 41800 + }, + { + "epoch": 4.181, + "grad_norm": 35.621795654296875, + "learning_rate": 1.6418e-06, + "loss": 0.565, + "step": 41810 + }, + { + "epoch": 4.182, + "grad_norm": 27.934715270996094, + "learning_rate": 1.6398e-06, + "loss": 0.6133, + "step": 41820 + }, + { + "epoch": 4.183, + "grad_norm": 42.21739959716797, + "learning_rate": 1.6378000000000002e-06, + "loss": 0.479, + "step": 41830 + }, + { + "epoch": 4.184, + "grad_norm": 50.77339172363281, + "learning_rate": 1.6358e-06, + "loss": 0.4843, + "step": 41840 + }, + { + "epoch": 4.185, + "grad_norm": 10.044069290161133, + "learning_rate": 1.6338000000000001e-06, + "loss": 0.4999, + "step": 41850 + }, + { + "epoch": 4.186, + "grad_norm": 10.109586715698242, + "learning_rate": 1.6318e-06, + "loss": 0.4976, + "step": 41860 + }, + { + "epoch": 4.187, + "grad_norm": 58.6328125, + "learning_rate": 1.6298000000000003e-06, + "loss": 0.4399, + "step": 41870 + }, + { + "epoch": 4.188, + "grad_norm": 24.95362663269043, + "learning_rate": 1.6278000000000001e-06, + "loss": 0.5809, + "step": 41880 + }, + { + "epoch": 4.189, + "grad_norm": 40.617855072021484, + "learning_rate": 1.6258000000000002e-06, + "loss": 0.431, + "step": 41890 + }, + { + "epoch": 4.19, + "grad_norm": 37.19096374511719, + "learning_rate": 1.6238e-06, + "loss": 0.5244, + "step": 41900 + }, + { + "epoch": 4.191, + "grad_norm": 53.91977310180664, + "learning_rate": 1.6218e-06, + "loss": 0.6796, + "step": 41910 + }, + { + "epoch": 4.192, + "grad_norm": 3.6687369346618652, + "learning_rate": 1.6198000000000004e-06, + "loss": 0.6903, + "step": 41920 + }, + { + "epoch": 4.193, + "grad_norm": 22.804624557495117, + "learning_rate": 1.6178000000000002e-06, + "loss": 0.4569, + "step": 41930 + }, + { + "epoch": 4.194, + "grad_norm": 56.276893615722656, + "learning_rate": 1.6158e-06, + "loss": 0.6385, + "step": 41940 + }, + { + "epoch": 4.195, + "grad_norm": 41.6296272277832, + "learning_rate": 1.6138000000000001e-06, + "loss": 0.3963, + "step": 41950 + }, + { + "epoch": 4.196, + "grad_norm": 66.05530548095703, + "learning_rate": 1.6118e-06, + "loss": 0.8673, + "step": 41960 + }, + { + "epoch": 4.197, + "grad_norm": 47.94367218017578, + "learning_rate": 1.6098000000000002e-06, + "loss": 0.815, + "step": 41970 + }, + { + "epoch": 4.198, + "grad_norm": 0.4343576431274414, + "learning_rate": 1.6078e-06, + "loss": 0.39, + "step": 41980 + }, + { + "epoch": 4.199, + "grad_norm": 8.591513633728027, + "learning_rate": 1.6058000000000002e-06, + "loss": 0.66, + "step": 41990 + }, + { + "epoch": 4.2, + "grad_norm": 66.63551330566406, + "learning_rate": 1.6038e-06, + "loss": 0.4365, + "step": 42000 + }, + { + "epoch": 4.201, + "grad_norm": 34.489219665527344, + "learning_rate": 1.6018e-06, + "loss": 0.6739, + "step": 42010 + }, + { + "epoch": 4.202, + "grad_norm": 45.61314392089844, + "learning_rate": 1.5998000000000003e-06, + "loss": 0.848, + "step": 42020 + }, + { + "epoch": 4.203, + "grad_norm": 24.58756446838379, + "learning_rate": 1.5978000000000002e-06, + "loss": 0.6043, + "step": 42030 + }, + { + "epoch": 4.204, + "grad_norm": 22.763195037841797, + "learning_rate": 1.5958e-06, + "loss": 0.4617, + "step": 42040 + }, + { + "epoch": 4.205, + "grad_norm": 6.496039867401123, + "learning_rate": 1.5938e-06, + "loss": 0.6126, + "step": 42050 + }, + { + "epoch": 4.206, + "grad_norm": 36.29051971435547, + "learning_rate": 1.5918e-06, + "loss": 0.5598, + "step": 42060 + }, + { + "epoch": 4.207, + "grad_norm": 28.738428115844727, + "learning_rate": 1.5898000000000002e-06, + "loss": 0.7017, + "step": 42070 + }, + { + "epoch": 4.208, + "grad_norm": 60.276710510253906, + "learning_rate": 1.5878e-06, + "loss": 0.8548, + "step": 42080 + }, + { + "epoch": 4.209, + "grad_norm": 44.32135772705078, + "learning_rate": 1.5858000000000001e-06, + "loss": 0.6744, + "step": 42090 + }, + { + "epoch": 4.21, + "grad_norm": 35.0414924621582, + "learning_rate": 1.5838e-06, + "loss": 0.6921, + "step": 42100 + }, + { + "epoch": 4.211, + "grad_norm": 44.21099090576172, + "learning_rate": 1.5818e-06, + "loss": 0.5771, + "step": 42110 + }, + { + "epoch": 4.212, + "grad_norm": 38.59548568725586, + "learning_rate": 1.5798000000000003e-06, + "loss": 0.4153, + "step": 42120 + }, + { + "epoch": 4.213, + "grad_norm": 30.115324020385742, + "learning_rate": 1.5778000000000002e-06, + "loss": 0.363, + "step": 42130 + }, + { + "epoch": 4.214, + "grad_norm": 31.779865264892578, + "learning_rate": 1.5758e-06, + "loss": 0.7121, + "step": 42140 + }, + { + "epoch": 4.215, + "grad_norm": 55.575439453125, + "learning_rate": 1.5738e-06, + "loss": 0.7864, + "step": 42150 + }, + { + "epoch": 4.216, + "grad_norm": 49.964656829833984, + "learning_rate": 1.5718e-06, + "loss": 0.5532, + "step": 42160 + }, + { + "epoch": 4.217, + "grad_norm": 72.00129699707031, + "learning_rate": 1.5698000000000002e-06, + "loss": 0.5505, + "step": 42170 + }, + { + "epoch": 4.218, + "grad_norm": 49.91716003417969, + "learning_rate": 1.5678e-06, + "loss": 0.5834, + "step": 42180 + }, + { + "epoch": 4.219, + "grad_norm": 36.31462860107422, + "learning_rate": 1.5658000000000001e-06, + "loss": 0.478, + "step": 42190 + }, + { + "epoch": 4.22, + "grad_norm": 47.1279182434082, + "learning_rate": 1.5638e-06, + "loss": 0.6494, + "step": 42200 + }, + { + "epoch": 4.221, + "grad_norm": 7.2586517333984375, + "learning_rate": 1.5618000000000002e-06, + "loss": 0.5058, + "step": 42210 + }, + { + "epoch": 4.222, + "grad_norm": 56.2879638671875, + "learning_rate": 1.5598000000000003e-06, + "loss": 0.5478, + "step": 42220 + }, + { + "epoch": 4.223, + "grad_norm": 32.43406295776367, + "learning_rate": 1.5578000000000001e-06, + "loss": 0.5427, + "step": 42230 + }, + { + "epoch": 4.224, + "grad_norm": 23.466623306274414, + "learning_rate": 1.5558e-06, + "loss": 0.5276, + "step": 42240 + }, + { + "epoch": 4.225, + "grad_norm": 42.27043533325195, + "learning_rate": 1.5538e-06, + "loss": 0.5317, + "step": 42250 + }, + { + "epoch": 4.226, + "grad_norm": 60.1855583190918, + "learning_rate": 1.5518000000000003e-06, + "loss": 0.7668, + "step": 42260 + }, + { + "epoch": 4.227, + "grad_norm": 58.870296478271484, + "learning_rate": 1.5498000000000002e-06, + "loss": 0.6913, + "step": 42270 + }, + { + "epoch": 4.228, + "grad_norm": 13.989805221557617, + "learning_rate": 1.5478000000000002e-06, + "loss": 0.4032, + "step": 42280 + }, + { + "epoch": 4.229, + "grad_norm": 18.71600914001465, + "learning_rate": 1.5458e-06, + "loss": 0.542, + "step": 42290 + }, + { + "epoch": 4.23, + "grad_norm": 11.906790733337402, + "learning_rate": 1.5438e-06, + "loss": 0.6872, + "step": 42300 + }, + { + "epoch": 4.231, + "grad_norm": 19.292734146118164, + "learning_rate": 1.5418000000000002e-06, + "loss": 0.4605, + "step": 42310 + }, + { + "epoch": 4.232, + "grad_norm": 65.98567199707031, + "learning_rate": 1.5398000000000003e-06, + "loss": 0.4226, + "step": 42320 + }, + { + "epoch": 4.233, + "grad_norm": 12.095321655273438, + "learning_rate": 1.5378000000000001e-06, + "loss": 0.8225, + "step": 42330 + }, + { + "epoch": 4.234, + "grad_norm": 15.595905303955078, + "learning_rate": 1.5358e-06, + "loss": 0.4639, + "step": 42340 + }, + { + "epoch": 4.235, + "grad_norm": 48.75836944580078, + "learning_rate": 1.5338e-06, + "loss": 0.2764, + "step": 42350 + }, + { + "epoch": 4.236, + "grad_norm": 31.910316467285156, + "learning_rate": 1.5318000000000003e-06, + "loss": 0.6411, + "step": 42360 + }, + { + "epoch": 4.237, + "grad_norm": 47.41012954711914, + "learning_rate": 1.5298000000000002e-06, + "loss": 0.8059, + "step": 42370 + }, + { + "epoch": 4.2379999999999995, + "grad_norm": 47.09851837158203, + "learning_rate": 1.5278000000000002e-06, + "loss": 0.685, + "step": 42380 + }, + { + "epoch": 4.239, + "grad_norm": 61.549922943115234, + "learning_rate": 1.5258e-06, + "loss": 0.7116, + "step": 42390 + }, + { + "epoch": 4.24, + "grad_norm": 41.90022659301758, + "learning_rate": 1.5238e-06, + "loss": 0.8182, + "step": 42400 + }, + { + "epoch": 4.241, + "grad_norm": 19.784992218017578, + "learning_rate": 1.5218000000000002e-06, + "loss": 0.6319, + "step": 42410 + }, + { + "epoch": 4.242, + "grad_norm": 14.872299194335938, + "learning_rate": 1.5198000000000003e-06, + "loss": 0.9768, + "step": 42420 + }, + { + "epoch": 4.243, + "grad_norm": 14.719264030456543, + "learning_rate": 1.5178e-06, + "loss": 0.5052, + "step": 42430 + }, + { + "epoch": 4.244, + "grad_norm": 24.612985610961914, + "learning_rate": 1.5158e-06, + "loss": 0.5563, + "step": 42440 + }, + { + "epoch": 4.245, + "grad_norm": 60.18811798095703, + "learning_rate": 1.5138e-06, + "loss": 0.5981, + "step": 42450 + }, + { + "epoch": 4.246, + "grad_norm": 80.66825103759766, + "learning_rate": 1.5118000000000003e-06, + "loss": 0.4633, + "step": 42460 + }, + { + "epoch": 4.247, + "grad_norm": 13.229582786560059, + "learning_rate": 1.5098000000000001e-06, + "loss": 0.6477, + "step": 42470 + }, + { + "epoch": 4.248, + "grad_norm": 2.7987141609191895, + "learning_rate": 1.5078000000000002e-06, + "loss": 0.3514, + "step": 42480 + }, + { + "epoch": 4.249, + "grad_norm": 44.73332214355469, + "learning_rate": 1.5058e-06, + "loss": 0.5872, + "step": 42490 + }, + { + "epoch": 4.25, + "grad_norm": 51.61971664428711, + "learning_rate": 1.5038000000000003e-06, + "loss": 0.4737, + "step": 42500 + }, + { + "epoch": 4.251, + "grad_norm": 5.696041107177734, + "learning_rate": 1.5018000000000002e-06, + "loss": 0.6114, + "step": 42510 + }, + { + "epoch": 4.252, + "grad_norm": 19.12189483642578, + "learning_rate": 1.4998000000000002e-06, + "loss": 0.64, + "step": 42520 + }, + { + "epoch": 4.253, + "grad_norm": 54.96479797363281, + "learning_rate": 1.4978e-06, + "loss": 0.628, + "step": 42530 + }, + { + "epoch": 4.254, + "grad_norm": 13.414827346801758, + "learning_rate": 1.4958e-06, + "loss": 0.7258, + "step": 42540 + }, + { + "epoch": 4.255, + "grad_norm": 20.74048614501953, + "learning_rate": 1.4938000000000002e-06, + "loss": 0.6399, + "step": 42550 + }, + { + "epoch": 4.256, + "grad_norm": 17.05641746520996, + "learning_rate": 1.4918000000000003e-06, + "loss": 0.6818, + "step": 42560 + }, + { + "epoch": 4.257, + "grad_norm": 3.655008554458618, + "learning_rate": 1.4898000000000001e-06, + "loss": 0.6916, + "step": 42570 + }, + { + "epoch": 4.258, + "grad_norm": 29.011837005615234, + "learning_rate": 1.4878000000000002e-06, + "loss": 0.5478, + "step": 42580 + }, + { + "epoch": 4.259, + "grad_norm": 6.549126148223877, + "learning_rate": 1.4858e-06, + "loss": 0.3863, + "step": 42590 + }, + { + "epoch": 4.26, + "grad_norm": 59.96338653564453, + "learning_rate": 1.4838000000000003e-06, + "loss": 0.7019, + "step": 42600 + }, + { + "epoch": 4.261, + "grad_norm": 3.527003526687622, + "learning_rate": 1.4818000000000002e-06, + "loss": 0.5075, + "step": 42610 + }, + { + "epoch": 4.2620000000000005, + "grad_norm": 66.73096466064453, + "learning_rate": 1.4798000000000002e-06, + "loss": 0.6057, + "step": 42620 + }, + { + "epoch": 4.263, + "grad_norm": 51.33205795288086, + "learning_rate": 1.4778e-06, + "loss": 0.5798, + "step": 42630 + }, + { + "epoch": 4.264, + "grad_norm": 61.584869384765625, + "learning_rate": 1.4758e-06, + "loss": 0.4595, + "step": 42640 + }, + { + "epoch": 4.265, + "grad_norm": 43.708984375, + "learning_rate": 1.4738000000000002e-06, + "loss": 0.5689, + "step": 42650 + }, + { + "epoch": 4.266, + "grad_norm": 16.016572952270508, + "learning_rate": 1.4718000000000002e-06, + "loss": 0.5122, + "step": 42660 + }, + { + "epoch": 4.267, + "grad_norm": 42.98859405517578, + "learning_rate": 1.4698e-06, + "loss": 0.4487, + "step": 42670 + }, + { + "epoch": 4.268, + "grad_norm": 48.663185119628906, + "learning_rate": 1.4678000000000002e-06, + "loss": 0.4685, + "step": 42680 + }, + { + "epoch": 4.269, + "grad_norm": 55.1309700012207, + "learning_rate": 1.4658e-06, + "loss": 0.4818, + "step": 42690 + }, + { + "epoch": 4.27, + "grad_norm": 46.402252197265625, + "learning_rate": 1.4638000000000003e-06, + "loss": 0.52, + "step": 42700 + }, + { + "epoch": 4.271, + "grad_norm": 19.3892822265625, + "learning_rate": 1.4618000000000001e-06, + "loss": 0.5317, + "step": 42710 + }, + { + "epoch": 4.272, + "grad_norm": 40.837039947509766, + "learning_rate": 1.4598000000000002e-06, + "loss": 0.5411, + "step": 42720 + }, + { + "epoch": 4.273, + "grad_norm": 31.86037254333496, + "learning_rate": 1.4578e-06, + "loss": 0.5173, + "step": 42730 + }, + { + "epoch": 4.274, + "grad_norm": 62.265621185302734, + "learning_rate": 1.4557999999999999e-06, + "loss": 0.6121, + "step": 42740 + }, + { + "epoch": 4.275, + "grad_norm": 44.93138122558594, + "learning_rate": 1.4538000000000002e-06, + "loss": 0.54, + "step": 42750 + }, + { + "epoch": 4.276, + "grad_norm": 67.72014617919922, + "learning_rate": 1.4518000000000002e-06, + "loss": 0.784, + "step": 42760 + }, + { + "epoch": 4.277, + "grad_norm": 40.16845703125, + "learning_rate": 1.4498e-06, + "loss": 0.7528, + "step": 42770 + }, + { + "epoch": 4.2780000000000005, + "grad_norm": 61.454490661621094, + "learning_rate": 1.4478000000000001e-06, + "loss": 0.5798, + "step": 42780 + }, + { + "epoch": 4.279, + "grad_norm": 55.03683853149414, + "learning_rate": 1.4458e-06, + "loss": 0.756, + "step": 42790 + }, + { + "epoch": 4.28, + "grad_norm": 42.04933547973633, + "learning_rate": 1.4438000000000003e-06, + "loss": 0.7275, + "step": 42800 + }, + { + "epoch": 4.281, + "grad_norm": 56.68568801879883, + "learning_rate": 1.4418000000000001e-06, + "loss": 0.5234, + "step": 42810 + }, + { + "epoch": 4.282, + "grad_norm": 6.141942501068115, + "learning_rate": 1.4398000000000002e-06, + "loss": 0.311, + "step": 42820 + }, + { + "epoch": 4.283, + "grad_norm": 36.63520050048828, + "learning_rate": 1.4378e-06, + "loss": 0.6934, + "step": 42830 + }, + { + "epoch": 4.284, + "grad_norm": 40.52085494995117, + "learning_rate": 1.4358000000000003e-06, + "loss": 0.5693, + "step": 42840 + }, + { + "epoch": 4.285, + "grad_norm": 25.857606887817383, + "learning_rate": 1.4338000000000001e-06, + "loss": 0.4119, + "step": 42850 + }, + { + "epoch": 4.286, + "grad_norm": 34.83437728881836, + "learning_rate": 1.4318000000000002e-06, + "loss": 0.6163, + "step": 42860 + }, + { + "epoch": 4.287, + "grad_norm": 9.676780700683594, + "learning_rate": 1.4298e-06, + "loss": 0.5552, + "step": 42870 + }, + { + "epoch": 4.288, + "grad_norm": 7.326340198516846, + "learning_rate": 1.4278000000000001e-06, + "loss": 0.4313, + "step": 42880 + }, + { + "epoch": 4.289, + "grad_norm": 45.25532913208008, + "learning_rate": 1.4258000000000002e-06, + "loss": 0.7117, + "step": 42890 + }, + { + "epoch": 4.29, + "grad_norm": 9.764056205749512, + "learning_rate": 1.4238000000000002e-06, + "loss": 0.3815, + "step": 42900 + }, + { + "epoch": 4.291, + "grad_norm": 20.11690902709961, + "learning_rate": 1.4218e-06, + "loss": 0.5441, + "step": 42910 + }, + { + "epoch": 4.292, + "grad_norm": 26.8229923248291, + "learning_rate": 1.4198000000000002e-06, + "loss": 0.7206, + "step": 42920 + }, + { + "epoch": 4.293, + "grad_norm": 30.303131103515625, + "learning_rate": 1.4178e-06, + "loss": 0.3032, + "step": 42930 + }, + { + "epoch": 4.294, + "grad_norm": 47.463016510009766, + "learning_rate": 1.4158000000000003e-06, + "loss": 0.4759, + "step": 42940 + }, + { + "epoch": 4.295, + "grad_norm": 57.69182586669922, + "learning_rate": 1.4138000000000001e-06, + "loss": 0.6244, + "step": 42950 + }, + { + "epoch": 4.296, + "grad_norm": 40.416038513183594, + "learning_rate": 1.4118000000000002e-06, + "loss": 0.5296, + "step": 42960 + }, + { + "epoch": 4.297, + "grad_norm": 36.48316955566406, + "learning_rate": 1.4098e-06, + "loss": 0.6653, + "step": 42970 + }, + { + "epoch": 4.298, + "grad_norm": 4.3902764320373535, + "learning_rate": 1.4078e-06, + "loss": 0.5357, + "step": 42980 + }, + { + "epoch": 4.299, + "grad_norm": 11.479517936706543, + "learning_rate": 1.4058000000000002e-06, + "loss": 0.5215, + "step": 42990 + }, + { + "epoch": 4.3, + "grad_norm": 26.62324333190918, + "learning_rate": 1.4038000000000002e-06, + "loss": 0.6729, + "step": 43000 + }, + { + "epoch": 4.301, + "grad_norm": 79.1596908569336, + "learning_rate": 1.4018e-06, + "loss": 0.7622, + "step": 43010 + }, + { + "epoch": 4.302, + "grad_norm": 43.866886138916016, + "learning_rate": 1.3998000000000001e-06, + "loss": 0.5798, + "step": 43020 + }, + { + "epoch": 4.303, + "grad_norm": 75.57432556152344, + "learning_rate": 1.3978e-06, + "loss": 0.7021, + "step": 43030 + }, + { + "epoch": 4.304, + "grad_norm": 55.48227310180664, + "learning_rate": 1.3958000000000003e-06, + "loss": 0.5015, + "step": 43040 + }, + { + "epoch": 4.305, + "grad_norm": 64.40435791015625, + "learning_rate": 1.3938000000000001e-06, + "loss": 0.787, + "step": 43050 + }, + { + "epoch": 4.306, + "grad_norm": 32.22929000854492, + "learning_rate": 1.3918000000000002e-06, + "loss": 0.6516, + "step": 43060 + }, + { + "epoch": 4.307, + "grad_norm": 11.515064239501953, + "learning_rate": 1.3898e-06, + "loss": 0.3711, + "step": 43070 + }, + { + "epoch": 4.308, + "grad_norm": 68.9782485961914, + "learning_rate": 1.3878e-06, + "loss": 0.522, + "step": 43080 + }, + { + "epoch": 4.309, + "grad_norm": 35.55900573730469, + "learning_rate": 1.3858000000000001e-06, + "loss": 0.4953, + "step": 43090 + }, + { + "epoch": 4.31, + "grad_norm": 49.99945831298828, + "learning_rate": 1.3838000000000002e-06, + "loss": 0.5207, + "step": 43100 + }, + { + "epoch": 4.311, + "grad_norm": 3.150254726409912, + "learning_rate": 1.3818e-06, + "loss": 0.4207, + "step": 43110 + }, + { + "epoch": 4.312, + "grad_norm": 6.824671745300293, + "learning_rate": 1.3798000000000001e-06, + "loss": 0.66, + "step": 43120 + }, + { + "epoch": 4.313, + "grad_norm": 7.719470500946045, + "learning_rate": 1.3778000000000002e-06, + "loss": 0.5685, + "step": 43130 + }, + { + "epoch": 4.314, + "grad_norm": 10.207037925720215, + "learning_rate": 1.3758000000000002e-06, + "loss": 0.5784, + "step": 43140 + }, + { + "epoch": 4.315, + "grad_norm": 43.468528747558594, + "learning_rate": 1.3738e-06, + "loss": 0.4621, + "step": 43150 + }, + { + "epoch": 4.316, + "grad_norm": 35.88478469848633, + "learning_rate": 1.3718000000000001e-06, + "loss": 0.3395, + "step": 43160 + }, + { + "epoch": 4.317, + "grad_norm": 4.721527576446533, + "learning_rate": 1.3698e-06, + "loss": 0.3421, + "step": 43170 + }, + { + "epoch": 4.318, + "grad_norm": 60.953182220458984, + "learning_rate": 1.3678000000000003e-06, + "loss": 0.6947, + "step": 43180 + }, + { + "epoch": 4.319, + "grad_norm": 11.123573303222656, + "learning_rate": 1.3658000000000001e-06, + "loss": 0.5041, + "step": 43190 + }, + { + "epoch": 4.32, + "grad_norm": 66.29830932617188, + "learning_rate": 1.3638000000000002e-06, + "loss": 0.9367, + "step": 43200 + }, + { + "epoch": 4.321, + "grad_norm": 24.80149269104004, + "learning_rate": 1.3618e-06, + "loss": 0.5205, + "step": 43210 + }, + { + "epoch": 4.322, + "grad_norm": 41.04637145996094, + "learning_rate": 1.3598e-06, + "loss": 0.4256, + "step": 43220 + }, + { + "epoch": 4.323, + "grad_norm": 52.53520202636719, + "learning_rate": 1.3578000000000002e-06, + "loss": 0.6093, + "step": 43230 + }, + { + "epoch": 4.324, + "grad_norm": 2.6135032176971436, + "learning_rate": 1.3558000000000002e-06, + "loss": 0.2796, + "step": 43240 + }, + { + "epoch": 4.325, + "grad_norm": 59.64257049560547, + "learning_rate": 1.3538e-06, + "loss": 0.7455, + "step": 43250 + }, + { + "epoch": 4.326, + "grad_norm": 26.32259750366211, + "learning_rate": 1.3518000000000001e-06, + "loss": 0.3787, + "step": 43260 + }, + { + "epoch": 4.327, + "grad_norm": 7.690916538238525, + "learning_rate": 1.3498e-06, + "loss": 0.5234, + "step": 43270 + }, + { + "epoch": 4.328, + "grad_norm": 41.93429946899414, + "learning_rate": 1.3478000000000003e-06, + "loss": 0.5841, + "step": 43280 + }, + { + "epoch": 4.329, + "grad_norm": 15.757827758789062, + "learning_rate": 1.3458e-06, + "loss": 0.3595, + "step": 43290 + }, + { + "epoch": 4.33, + "grad_norm": 15.618181228637695, + "learning_rate": 1.3438000000000002e-06, + "loss": 0.6405, + "step": 43300 + }, + { + "epoch": 4.331, + "grad_norm": 35.363834381103516, + "learning_rate": 1.3418e-06, + "loss": 0.5346, + "step": 43310 + }, + { + "epoch": 4.332, + "grad_norm": 18.901430130004883, + "learning_rate": 1.3398e-06, + "loss": 0.7658, + "step": 43320 + }, + { + "epoch": 4.333, + "grad_norm": 15.545169830322266, + "learning_rate": 1.3378000000000001e-06, + "loss": 0.3699, + "step": 43330 + }, + { + "epoch": 4.334, + "grad_norm": 16.735557556152344, + "learning_rate": 1.3358000000000002e-06, + "loss": 0.3499, + "step": 43340 + }, + { + "epoch": 4.335, + "grad_norm": 13.97559928894043, + "learning_rate": 1.3338e-06, + "loss": 0.522, + "step": 43350 + }, + { + "epoch": 4.336, + "grad_norm": 58.62332534790039, + "learning_rate": 1.3318000000000001e-06, + "loss": 0.6854, + "step": 43360 + }, + { + "epoch": 4.337, + "grad_norm": 38.00271224975586, + "learning_rate": 1.3298e-06, + "loss": 0.4987, + "step": 43370 + }, + { + "epoch": 4.338, + "grad_norm": 48.7909049987793, + "learning_rate": 1.3278000000000002e-06, + "loss": 0.536, + "step": 43380 + }, + { + "epoch": 4.339, + "grad_norm": 10.43138599395752, + "learning_rate": 1.3258e-06, + "loss": 0.568, + "step": 43390 + }, + { + "epoch": 4.34, + "grad_norm": 47.01791000366211, + "learning_rate": 1.324e-06, + "loss": 0.4428, + "step": 43400 + }, + { + "epoch": 4.341, + "grad_norm": 2.758685350418091, + "learning_rate": 1.3220000000000002e-06, + "loss": 0.4351, + "step": 43410 + }, + { + "epoch": 4.342, + "grad_norm": 9.140447616577148, + "learning_rate": 1.32e-06, + "loss": 0.3583, + "step": 43420 + }, + { + "epoch": 4.343, + "grad_norm": 21.019405364990234, + "learning_rate": 1.3180000000000001e-06, + "loss": 0.6468, + "step": 43430 + }, + { + "epoch": 4.344, + "grad_norm": 10.030232429504395, + "learning_rate": 1.316e-06, + "loss": 0.6114, + "step": 43440 + }, + { + "epoch": 4.345, + "grad_norm": 21.26093101501465, + "learning_rate": 1.314e-06, + "loss": 0.3142, + "step": 43450 + }, + { + "epoch": 4.346, + "grad_norm": 31.151533126831055, + "learning_rate": 1.3120000000000003e-06, + "loss": 0.3533, + "step": 43460 + }, + { + "epoch": 4.3469999999999995, + "grad_norm": 28.893701553344727, + "learning_rate": 1.3100000000000002e-06, + "loss": 0.614, + "step": 43470 + }, + { + "epoch": 4.348, + "grad_norm": 67.40365600585938, + "learning_rate": 1.308e-06, + "loss": 0.4243, + "step": 43480 + }, + { + "epoch": 4.349, + "grad_norm": 28.138734817504883, + "learning_rate": 1.306e-06, + "loss": 0.4, + "step": 43490 + }, + { + "epoch": 4.35, + "grad_norm": 36.38271713256836, + "learning_rate": 1.304e-06, + "loss": 0.7892, + "step": 43500 + }, + { + "epoch": 4.351, + "grad_norm": 36.84843826293945, + "learning_rate": 1.3020000000000002e-06, + "loss": 0.5398, + "step": 43510 + }, + { + "epoch": 4.352, + "grad_norm": 50.559635162353516, + "learning_rate": 1.3e-06, + "loss": 0.7597, + "step": 43520 + }, + { + "epoch": 4.353, + "grad_norm": 18.063173294067383, + "learning_rate": 1.2980000000000001e-06, + "loss": 0.5857, + "step": 43530 + }, + { + "epoch": 4.354, + "grad_norm": 55.33283615112305, + "learning_rate": 1.296e-06, + "loss": 0.6993, + "step": 43540 + }, + { + "epoch": 4.355, + "grad_norm": 48.159889221191406, + "learning_rate": 1.294e-06, + "loss": 0.6422, + "step": 43550 + }, + { + "epoch": 4.356, + "grad_norm": 56.55753707885742, + "learning_rate": 1.2920000000000003e-06, + "loss": 0.5918, + "step": 43560 + }, + { + "epoch": 4.357, + "grad_norm": 28.358430862426758, + "learning_rate": 1.2900000000000001e-06, + "loss": 0.5238, + "step": 43570 + }, + { + "epoch": 4.358, + "grad_norm": 16.447559356689453, + "learning_rate": 1.288e-06, + "loss": 0.8666, + "step": 43580 + }, + { + "epoch": 4.359, + "grad_norm": 22.63037872314453, + "learning_rate": 1.286e-06, + "loss": 0.5798, + "step": 43590 + }, + { + "epoch": 4.36, + "grad_norm": 6.575098991394043, + "learning_rate": 1.284e-06, + "loss": 0.5513, + "step": 43600 + }, + { + "epoch": 4.361, + "grad_norm": 30.44253158569336, + "learning_rate": 1.2820000000000002e-06, + "loss": 0.651, + "step": 43610 + }, + { + "epoch": 4.362, + "grad_norm": 62.634090423583984, + "learning_rate": 1.28e-06, + "loss": 0.9522, + "step": 43620 + }, + { + "epoch": 4.3629999999999995, + "grad_norm": 42.4155158996582, + "learning_rate": 1.278e-06, + "loss": 0.681, + "step": 43630 + }, + { + "epoch": 4.364, + "grad_norm": 7.64183235168457, + "learning_rate": 1.276e-06, + "loss": 0.3704, + "step": 43640 + }, + { + "epoch": 4.365, + "grad_norm": 55.91519546508789, + "learning_rate": 1.2740000000000002e-06, + "loss": 0.956, + "step": 43650 + }, + { + "epoch": 4.366, + "grad_norm": 43.432830810546875, + "learning_rate": 1.2720000000000003e-06, + "loss": 0.5726, + "step": 43660 + }, + { + "epoch": 4.367, + "grad_norm": 5.294198513031006, + "learning_rate": 1.2700000000000001e-06, + "loss": 0.5203, + "step": 43670 + }, + { + "epoch": 4.368, + "grad_norm": 49.72700881958008, + "learning_rate": 1.268e-06, + "loss": 0.5669, + "step": 43680 + }, + { + "epoch": 4.369, + "grad_norm": 25.87734031677246, + "learning_rate": 1.266e-06, + "loss": 0.7345, + "step": 43690 + }, + { + "epoch": 4.37, + "grad_norm": 34.031837463378906, + "learning_rate": 1.2640000000000003e-06, + "loss": 0.6073, + "step": 43700 + }, + { + "epoch": 4.371, + "grad_norm": 8.156420707702637, + "learning_rate": 1.2620000000000002e-06, + "loss": 0.548, + "step": 43710 + }, + { + "epoch": 4.372, + "grad_norm": 28.363479614257812, + "learning_rate": 1.26e-06, + "loss": 0.699, + "step": 43720 + }, + { + "epoch": 4.373, + "grad_norm": 27.95302391052246, + "learning_rate": 1.258e-06, + "loss": 0.5499, + "step": 43730 + }, + { + "epoch": 4.374, + "grad_norm": 28.509695053100586, + "learning_rate": 1.256e-06, + "loss": 0.4613, + "step": 43740 + }, + { + "epoch": 4.375, + "grad_norm": 23.42409324645996, + "learning_rate": 1.2540000000000002e-06, + "loss": 0.5041, + "step": 43750 + }, + { + "epoch": 4.376, + "grad_norm": 75.357666015625, + "learning_rate": 1.2520000000000003e-06, + "loss": 0.8561, + "step": 43760 + }, + { + "epoch": 4.377, + "grad_norm": 19.810129165649414, + "learning_rate": 1.25e-06, + "loss": 0.5149, + "step": 43770 + }, + { + "epoch": 4.378, + "grad_norm": 49.86960983276367, + "learning_rate": 1.248e-06, + "loss": 1.0917, + "step": 43780 + }, + { + "epoch": 4.379, + "grad_norm": 48.763427734375, + "learning_rate": 1.2460000000000002e-06, + "loss": 0.5727, + "step": 43790 + }, + { + "epoch": 4.38, + "grad_norm": 40.76609420776367, + "learning_rate": 1.244e-06, + "loss": 0.6641, + "step": 43800 + }, + { + "epoch": 4.381, + "grad_norm": 24.340974807739258, + "learning_rate": 1.2420000000000001e-06, + "loss": 0.3564, + "step": 43810 + }, + { + "epoch": 4.382, + "grad_norm": 20.576501846313477, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.4448, + "step": 43820 + }, + { + "epoch": 4.383, + "grad_norm": 33.2127571105957, + "learning_rate": 1.238e-06, + "loss": 0.7988, + "step": 43830 + }, + { + "epoch": 4.384, + "grad_norm": 62.10398864746094, + "learning_rate": 1.2360000000000001e-06, + "loss": 0.4453, + "step": 43840 + }, + { + "epoch": 4.385, + "grad_norm": 33.38579177856445, + "learning_rate": 1.234e-06, + "loss": 0.5619, + "step": 43850 + }, + { + "epoch": 4.386, + "grad_norm": 17.548891067504883, + "learning_rate": 1.2320000000000002e-06, + "loss": 0.3925, + "step": 43860 + }, + { + "epoch": 4.3870000000000005, + "grad_norm": 12.536221504211426, + "learning_rate": 1.23e-06, + "loss": 0.2319, + "step": 43870 + }, + { + "epoch": 4.388, + "grad_norm": 12.191761016845703, + "learning_rate": 1.2280000000000001e-06, + "loss": 0.7507, + "step": 43880 + }, + { + "epoch": 4.389, + "grad_norm": 26.444080352783203, + "learning_rate": 1.2260000000000002e-06, + "loss": 0.5662, + "step": 43890 + }, + { + "epoch": 4.39, + "grad_norm": 52.09244918823242, + "learning_rate": 1.224e-06, + "loss": 0.5521, + "step": 43900 + }, + { + "epoch": 4.391, + "grad_norm": 42.6751594543457, + "learning_rate": 1.2220000000000001e-06, + "loss": 0.5683, + "step": 43910 + }, + { + "epoch": 4.392, + "grad_norm": 102.61528015136719, + "learning_rate": 1.2200000000000002e-06, + "loss": 0.5493, + "step": 43920 + }, + { + "epoch": 4.393, + "grad_norm": 37.700775146484375, + "learning_rate": 1.2180000000000002e-06, + "loss": 0.4303, + "step": 43930 + }, + { + "epoch": 4.394, + "grad_norm": 12.019837379455566, + "learning_rate": 1.216e-06, + "loss": 0.3392, + "step": 43940 + }, + { + "epoch": 4.395, + "grad_norm": 16.28959846496582, + "learning_rate": 1.214e-06, + "loss": 0.5402, + "step": 43950 + }, + { + "epoch": 4.396, + "grad_norm": 27.6002197265625, + "learning_rate": 1.2120000000000002e-06, + "loss": 0.7572, + "step": 43960 + }, + { + "epoch": 4.397, + "grad_norm": 74.15684509277344, + "learning_rate": 1.21e-06, + "loss": 0.4435, + "step": 43970 + }, + { + "epoch": 4.398, + "grad_norm": 7.384476661682129, + "learning_rate": 1.2080000000000001e-06, + "loss": 0.3989, + "step": 43980 + }, + { + "epoch": 4.399, + "grad_norm": 21.462980270385742, + "learning_rate": 1.2060000000000002e-06, + "loss": 0.4035, + "step": 43990 + }, + { + "epoch": 4.4, + "grad_norm": 50.33262252807617, + "learning_rate": 1.204e-06, + "loss": 0.5787, + "step": 44000 + }, + { + "epoch": 4.401, + "grad_norm": 28.74320411682129, + "learning_rate": 1.202e-06, + "loss": 0.4512, + "step": 44010 + }, + { + "epoch": 4.402, + "grad_norm": 6.188956260681152, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.5807, + "step": 44020 + }, + { + "epoch": 4.4030000000000005, + "grad_norm": 16.039443969726562, + "learning_rate": 1.1980000000000002e-06, + "loss": 0.3878, + "step": 44030 + }, + { + "epoch": 4.404, + "grad_norm": 4.788004398345947, + "learning_rate": 1.196e-06, + "loss": 0.7235, + "step": 44040 + }, + { + "epoch": 4.405, + "grad_norm": 34.19270324707031, + "learning_rate": 1.1940000000000001e-06, + "loss": 0.2703, + "step": 44050 + }, + { + "epoch": 4.406, + "grad_norm": 17.581157684326172, + "learning_rate": 1.1920000000000002e-06, + "loss": 0.6753, + "step": 44060 + }, + { + "epoch": 4.407, + "grad_norm": 37.96449661254883, + "learning_rate": 1.19e-06, + "loss": 0.4106, + "step": 44070 + }, + { + "epoch": 4.408, + "grad_norm": 35.01892852783203, + "learning_rate": 1.188e-06, + "loss": 0.4215, + "step": 44080 + }, + { + "epoch": 4.409, + "grad_norm": 7.498485565185547, + "learning_rate": 1.1860000000000002e-06, + "loss": 0.6555, + "step": 44090 + }, + { + "epoch": 4.41, + "grad_norm": 43.318233489990234, + "learning_rate": 1.1840000000000002e-06, + "loss": 0.8606, + "step": 44100 + }, + { + "epoch": 4.411, + "grad_norm": 45.79195022583008, + "learning_rate": 1.182e-06, + "loss": 0.4382, + "step": 44110 + }, + { + "epoch": 4.412, + "grad_norm": 55.97035598754883, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.6438, + "step": 44120 + }, + { + "epoch": 4.413, + "grad_norm": 11.8862943649292, + "learning_rate": 1.1780000000000002e-06, + "loss": 0.3783, + "step": 44130 + }, + { + "epoch": 4.414, + "grad_norm": 53.719871520996094, + "learning_rate": 1.176e-06, + "loss": 0.476, + "step": 44140 + }, + { + "epoch": 4.415, + "grad_norm": 47.69070053100586, + "learning_rate": 1.1740000000000001e-06, + "loss": 0.4777, + "step": 44150 + }, + { + "epoch": 4.416, + "grad_norm": 60.3067626953125, + "learning_rate": 1.1720000000000002e-06, + "loss": 0.782, + "step": 44160 + }, + { + "epoch": 4.417, + "grad_norm": 53.17386245727539, + "learning_rate": 1.1700000000000002e-06, + "loss": 0.5574, + "step": 44170 + }, + { + "epoch": 4.418, + "grad_norm": 71.64693450927734, + "learning_rate": 1.168e-06, + "loss": 0.7901, + "step": 44180 + }, + { + "epoch": 4.419, + "grad_norm": 7.084906101226807, + "learning_rate": 1.1660000000000001e-06, + "loss": 0.5609, + "step": 44190 + }, + { + "epoch": 4.42, + "grad_norm": 42.32343673706055, + "learning_rate": 1.1640000000000002e-06, + "loss": 0.6591, + "step": 44200 + }, + { + "epoch": 4.421, + "grad_norm": 1.9073995351791382, + "learning_rate": 1.162e-06, + "loss": 0.643, + "step": 44210 + }, + { + "epoch": 4.422, + "grad_norm": 3.4417290687561035, + "learning_rate": 1.1600000000000001e-06, + "loss": 0.4076, + "step": 44220 + }, + { + "epoch": 4.423, + "grad_norm": 5.756375789642334, + "learning_rate": 1.1580000000000002e-06, + "loss": 0.644, + "step": 44230 + }, + { + "epoch": 4.424, + "grad_norm": 2.408170461654663, + "learning_rate": 1.156e-06, + "loss": 0.7076, + "step": 44240 + }, + { + "epoch": 4.425, + "grad_norm": 47.58955383300781, + "learning_rate": 1.154e-06, + "loss": 0.532, + "step": 44250 + }, + { + "epoch": 4.426, + "grad_norm": 43.46930694580078, + "learning_rate": 1.1520000000000002e-06, + "loss": 0.636, + "step": 44260 + }, + { + "epoch": 4.427, + "grad_norm": 76.24018859863281, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.5307, + "step": 44270 + }, + { + "epoch": 4.428, + "grad_norm": 60.64769744873047, + "learning_rate": 1.148e-06, + "loss": 0.5436, + "step": 44280 + }, + { + "epoch": 4.429, + "grad_norm": 8.244017601013184, + "learning_rate": 1.1460000000000001e-06, + "loss": 0.5549, + "step": 44290 + }, + { + "epoch": 4.43, + "grad_norm": 5.509671211242676, + "learning_rate": 1.1440000000000002e-06, + "loss": 0.4393, + "step": 44300 + }, + { + "epoch": 4.431, + "grad_norm": 41.25537109375, + "learning_rate": 1.142e-06, + "loss": 0.5594, + "step": 44310 + }, + { + "epoch": 4.432, + "grad_norm": 64.00753784179688, + "learning_rate": 1.14e-06, + "loss": 0.8429, + "step": 44320 + }, + { + "epoch": 4.433, + "grad_norm": 13.171112060546875, + "learning_rate": 1.1380000000000002e-06, + "loss": 0.715, + "step": 44330 + }, + { + "epoch": 4.434, + "grad_norm": 21.44270896911621, + "learning_rate": 1.1360000000000002e-06, + "loss": 0.4565, + "step": 44340 + }, + { + "epoch": 4.435, + "grad_norm": 17.770322799682617, + "learning_rate": 1.134e-06, + "loss": 0.5274, + "step": 44350 + }, + { + "epoch": 4.436, + "grad_norm": 1.8393038511276245, + "learning_rate": 1.1320000000000001e-06, + "loss": 0.4659, + "step": 44360 + }, + { + "epoch": 4.437, + "grad_norm": 2.2791614532470703, + "learning_rate": 1.1300000000000002e-06, + "loss": 0.3917, + "step": 44370 + }, + { + "epoch": 4.438, + "grad_norm": 99.7815170288086, + "learning_rate": 1.128e-06, + "loss": 0.6234, + "step": 44380 + }, + { + "epoch": 4.439, + "grad_norm": 59.29553985595703, + "learning_rate": 1.126e-06, + "loss": 0.6075, + "step": 44390 + }, + { + "epoch": 4.44, + "grad_norm": 19.124174118041992, + "learning_rate": 1.1240000000000002e-06, + "loss": 0.8834, + "step": 44400 + }, + { + "epoch": 4.441, + "grad_norm": 17.70714569091797, + "learning_rate": 1.122e-06, + "loss": 0.3932, + "step": 44410 + }, + { + "epoch": 4.442, + "grad_norm": 59.30381774902344, + "learning_rate": 1.12e-06, + "loss": 0.8754, + "step": 44420 + }, + { + "epoch": 4.443, + "grad_norm": 17.972822189331055, + "learning_rate": 1.1180000000000001e-06, + "loss": 0.4991, + "step": 44430 + }, + { + "epoch": 4.444, + "grad_norm": 5.084202289581299, + "learning_rate": 1.1160000000000002e-06, + "loss": 0.9272, + "step": 44440 + }, + { + "epoch": 4.445, + "grad_norm": 21.268983840942383, + "learning_rate": 1.114e-06, + "loss": 0.5014, + "step": 44450 + }, + { + "epoch": 4.446, + "grad_norm": 46.41943359375, + "learning_rate": 1.1120000000000001e-06, + "loss": 0.7518, + "step": 44460 + }, + { + "epoch": 4.447, + "grad_norm": 25.36971664428711, + "learning_rate": 1.1100000000000002e-06, + "loss": 0.776, + "step": 44470 + }, + { + "epoch": 4.448, + "grad_norm": 57.427738189697266, + "learning_rate": 1.108e-06, + "loss": 0.469, + "step": 44480 + }, + { + "epoch": 4.449, + "grad_norm": 17.008895874023438, + "learning_rate": 1.106e-06, + "loss": 0.6733, + "step": 44490 + }, + { + "epoch": 4.45, + "grad_norm": 13.199875831604004, + "learning_rate": 1.1040000000000001e-06, + "loss": 0.7869, + "step": 44500 + }, + { + "epoch": 4.451, + "grad_norm": 24.979137420654297, + "learning_rate": 1.1020000000000002e-06, + "loss": 0.5413, + "step": 44510 + }, + { + "epoch": 4.452, + "grad_norm": 34.9896354675293, + "learning_rate": 1.1e-06, + "loss": 0.6921, + "step": 44520 + }, + { + "epoch": 4.453, + "grad_norm": 34.93574523925781, + "learning_rate": 1.0980000000000001e-06, + "loss": 0.418, + "step": 44530 + }, + { + "epoch": 4.454, + "grad_norm": 2.021084785461426, + "learning_rate": 1.0960000000000002e-06, + "loss": 0.4103, + "step": 44540 + }, + { + "epoch": 4.455, + "grad_norm": 5.255341529846191, + "learning_rate": 1.094e-06, + "loss": 0.4357, + "step": 44550 + }, + { + "epoch": 4.456, + "grad_norm": 60.97416687011719, + "learning_rate": 1.092e-06, + "loss": 0.4744, + "step": 44560 + }, + { + "epoch": 4.457, + "grad_norm": 3.359767198562622, + "learning_rate": 1.0900000000000002e-06, + "loss": 0.5682, + "step": 44570 + }, + { + "epoch": 4.458, + "grad_norm": 6.942392826080322, + "learning_rate": 1.088e-06, + "loss": 0.7497, + "step": 44580 + }, + { + "epoch": 4.459, + "grad_norm": 35.56052780151367, + "learning_rate": 1.086e-06, + "loss": 0.903, + "step": 44590 + }, + { + "epoch": 4.46, + "grad_norm": 53.783538818359375, + "learning_rate": 1.0840000000000001e-06, + "loss": 0.7616, + "step": 44600 + }, + { + "epoch": 4.461, + "grad_norm": 26.523746490478516, + "learning_rate": 1.0820000000000002e-06, + "loss": 0.5566, + "step": 44610 + }, + { + "epoch": 4.462, + "grad_norm": 41.27775573730469, + "learning_rate": 1.08e-06, + "loss": 0.8386, + "step": 44620 + }, + { + "epoch": 4.463, + "grad_norm": 15.443517684936523, + "learning_rate": 1.078e-06, + "loss": 0.4062, + "step": 44630 + }, + { + "epoch": 4.464, + "grad_norm": 39.41666793823242, + "learning_rate": 1.0760000000000002e-06, + "loss": 0.5806, + "step": 44640 + }, + { + "epoch": 4.465, + "grad_norm": 19.193965911865234, + "learning_rate": 1.074e-06, + "loss": 0.5718, + "step": 44650 + }, + { + "epoch": 4.466, + "grad_norm": 3.6427814960479736, + "learning_rate": 1.072e-06, + "loss": 0.4269, + "step": 44660 + }, + { + "epoch": 4.467, + "grad_norm": 50.77950668334961, + "learning_rate": 1.0700000000000001e-06, + "loss": 0.6791, + "step": 44670 + }, + { + "epoch": 4.468, + "grad_norm": 29.322980880737305, + "learning_rate": 1.0680000000000002e-06, + "loss": 0.4933, + "step": 44680 + }, + { + "epoch": 4.469, + "grad_norm": 86.63690185546875, + "learning_rate": 1.066e-06, + "loss": 0.666, + "step": 44690 + }, + { + "epoch": 4.47, + "grad_norm": 11.375699996948242, + "learning_rate": 1.064e-06, + "loss": 0.4718, + "step": 44700 + }, + { + "epoch": 4.471, + "grad_norm": 12.194612503051758, + "learning_rate": 1.0620000000000002e-06, + "loss": 0.5804, + "step": 44710 + }, + { + "epoch": 4.4719999999999995, + "grad_norm": 33.018497467041016, + "learning_rate": 1.06e-06, + "loss": 0.6052, + "step": 44720 + }, + { + "epoch": 4.473, + "grad_norm": 62.320247650146484, + "learning_rate": 1.058e-06, + "loss": 0.7786, + "step": 44730 + }, + { + "epoch": 4.474, + "grad_norm": 2.1804471015930176, + "learning_rate": 1.0560000000000001e-06, + "loss": 0.3413, + "step": 44740 + }, + { + "epoch": 4.475, + "grad_norm": 78.71873474121094, + "learning_rate": 1.054e-06, + "loss": 0.5368, + "step": 44750 + }, + { + "epoch": 4.476, + "grad_norm": 64.87627410888672, + "learning_rate": 1.052e-06, + "loss": 0.5431, + "step": 44760 + }, + { + "epoch": 4.477, + "grad_norm": 47.796199798583984, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.6959, + "step": 44770 + }, + { + "epoch": 4.478, + "grad_norm": 44.98781204223633, + "learning_rate": 1.0480000000000002e-06, + "loss": 0.5795, + "step": 44780 + }, + { + "epoch": 4.479, + "grad_norm": 22.857219696044922, + "learning_rate": 1.046e-06, + "loss": 0.9602, + "step": 44790 + }, + { + "epoch": 4.48, + "grad_norm": 32.4987678527832, + "learning_rate": 1.044e-06, + "loss": 0.941, + "step": 44800 + }, + { + "epoch": 4.481, + "grad_norm": 42.770912170410156, + "learning_rate": 1.0420000000000001e-06, + "loss": 0.4813, + "step": 44810 + }, + { + "epoch": 4.482, + "grad_norm": 2.932715654373169, + "learning_rate": 1.04e-06, + "loss": 0.4139, + "step": 44820 + }, + { + "epoch": 4.483, + "grad_norm": 20.752546310424805, + "learning_rate": 1.038e-06, + "loss": 0.6361, + "step": 44830 + }, + { + "epoch": 4.484, + "grad_norm": 56.61863708496094, + "learning_rate": 1.0360000000000001e-06, + "loss": 0.8215, + "step": 44840 + }, + { + "epoch": 4.485, + "grad_norm": 59.40135192871094, + "learning_rate": 1.0340000000000002e-06, + "loss": 0.297, + "step": 44850 + }, + { + "epoch": 4.486, + "grad_norm": 14.24195384979248, + "learning_rate": 1.032e-06, + "loss": 0.577, + "step": 44860 + }, + { + "epoch": 4.487, + "grad_norm": 75.15922546386719, + "learning_rate": 1.03e-06, + "loss": 0.6376, + "step": 44870 + }, + { + "epoch": 4.4879999999999995, + "grad_norm": 71.25492095947266, + "learning_rate": 1.0280000000000002e-06, + "loss": 0.6669, + "step": 44880 + }, + { + "epoch": 4.489, + "grad_norm": 6.253817081451416, + "learning_rate": 1.026e-06, + "loss": 0.4921, + "step": 44890 + }, + { + "epoch": 4.49, + "grad_norm": 44.84489822387695, + "learning_rate": 1.024e-06, + "loss": 0.6678, + "step": 44900 + }, + { + "epoch": 4.491, + "grad_norm": 54.61061477661133, + "learning_rate": 1.0220000000000001e-06, + "loss": 0.6607, + "step": 44910 + }, + { + "epoch": 4.492, + "grad_norm": 49.33956527709961, + "learning_rate": 1.02e-06, + "loss": 0.7338, + "step": 44920 + }, + { + "epoch": 4.493, + "grad_norm": 11.369890213012695, + "learning_rate": 1.018e-06, + "loss": 0.5798, + "step": 44930 + }, + { + "epoch": 4.494, + "grad_norm": 1.8291288614273071, + "learning_rate": 1.016e-06, + "loss": 0.5389, + "step": 44940 + }, + { + "epoch": 4.495, + "grad_norm": 42.062808990478516, + "learning_rate": 1.0140000000000002e-06, + "loss": 0.6113, + "step": 44950 + }, + { + "epoch": 4.496, + "grad_norm": 21.281036376953125, + "learning_rate": 1.012e-06, + "loss": 0.4842, + "step": 44960 + }, + { + "epoch": 4.497, + "grad_norm": 19.57158660888672, + "learning_rate": 1.01e-06, + "loss": 0.4564, + "step": 44970 + }, + { + "epoch": 4.498, + "grad_norm": 16.572309494018555, + "learning_rate": 1.0080000000000001e-06, + "loss": 0.4437, + "step": 44980 + }, + { + "epoch": 4.499, + "grad_norm": 52.86716079711914, + "learning_rate": 1.006e-06, + "loss": 0.6462, + "step": 44990 + }, + { + "epoch": 4.5, + "grad_norm": 13.026033401489258, + "learning_rate": 1.004e-06, + "loss": 0.59, + "step": 45000 + }, + { + "epoch": 4.501, + "grad_norm": 22.181806564331055, + "learning_rate": 1.002e-06, + "loss": 0.5236, + "step": 45010 + }, + { + "epoch": 4.502, + "grad_norm": 32.47010803222656, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.7592, + "step": 45020 + }, + { + "epoch": 4.503, + "grad_norm": 4.673216819763184, + "learning_rate": 9.98e-07, + "loss": 0.3559, + "step": 45030 + }, + { + "epoch": 4.504, + "grad_norm": 29.3690128326416, + "learning_rate": 9.96e-07, + "loss": 0.5105, + "step": 45040 + }, + { + "epoch": 4.505, + "grad_norm": 13.767579078674316, + "learning_rate": 9.940000000000001e-07, + "loss": 0.5032, + "step": 45050 + }, + { + "epoch": 4.506, + "grad_norm": 42.104862213134766, + "learning_rate": 9.92e-07, + "loss": 0.7156, + "step": 45060 + }, + { + "epoch": 4.507, + "grad_norm": 42.20527648925781, + "learning_rate": 9.9e-07, + "loss": 0.5525, + "step": 45070 + }, + { + "epoch": 4.508, + "grad_norm": 34.467933654785156, + "learning_rate": 9.880000000000001e-07, + "loss": 0.5533, + "step": 45080 + }, + { + "epoch": 4.509, + "grad_norm": 3.9194490909576416, + "learning_rate": 9.86e-07, + "loss": 0.3542, + "step": 45090 + }, + { + "epoch": 4.51, + "grad_norm": 71.32640838623047, + "learning_rate": 9.84e-07, + "loss": 0.4193, + "step": 45100 + }, + { + "epoch": 4.511, + "grad_norm": 56.496826171875, + "learning_rate": 9.82e-07, + "loss": 0.591, + "step": 45110 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 17.558574676513672, + "learning_rate": 9.800000000000001e-07, + "loss": 0.4785, + "step": 45120 + }, + { + "epoch": 4.513, + "grad_norm": 32.05054473876953, + "learning_rate": 9.78e-07, + "loss": 0.3853, + "step": 45130 + }, + { + "epoch": 4.514, + "grad_norm": 22.20078468322754, + "learning_rate": 9.76e-07, + "loss": 0.6342, + "step": 45140 + }, + { + "epoch": 4.515, + "grad_norm": 44.59579086303711, + "learning_rate": 9.740000000000001e-07, + "loss": 0.4882, + "step": 45150 + }, + { + "epoch": 4.516, + "grad_norm": 45.14775466918945, + "learning_rate": 9.72e-07, + "loss": 0.5903, + "step": 45160 + }, + { + "epoch": 4.517, + "grad_norm": 117.20407104492188, + "learning_rate": 9.7e-07, + "loss": 0.7166, + "step": 45170 + }, + { + "epoch": 4.518, + "grad_norm": 29.686355590820312, + "learning_rate": 9.68e-07, + "loss": 0.6031, + "step": 45180 + }, + { + "epoch": 4.519, + "grad_norm": 44.74457550048828, + "learning_rate": 9.660000000000002e-07, + "loss": 0.5089, + "step": 45190 + }, + { + "epoch": 4.52, + "grad_norm": 22.07326889038086, + "learning_rate": 9.64e-07, + "loss": 0.6083, + "step": 45200 + }, + { + "epoch": 4.521, + "grad_norm": 49.83152389526367, + "learning_rate": 9.62e-07, + "loss": 0.562, + "step": 45210 + }, + { + "epoch": 4.522, + "grad_norm": 50.51664733886719, + "learning_rate": 9.600000000000001e-07, + "loss": 0.6047, + "step": 45220 + }, + { + "epoch": 4.523, + "grad_norm": 56.542938232421875, + "learning_rate": 9.58e-07, + "loss": 0.5488, + "step": 45230 + }, + { + "epoch": 4.524, + "grad_norm": 46.843406677246094, + "learning_rate": 9.56e-07, + "loss": 0.7209, + "step": 45240 + }, + { + "epoch": 4.525, + "grad_norm": 6.081303119659424, + "learning_rate": 9.54e-07, + "loss": 0.4724, + "step": 45250 + }, + { + "epoch": 4.526, + "grad_norm": 28.085397720336914, + "learning_rate": 9.520000000000002e-07, + "loss": 0.7269, + "step": 45260 + }, + { + "epoch": 4.527, + "grad_norm": 52.40395736694336, + "learning_rate": 9.500000000000001e-07, + "loss": 0.6334, + "step": 45270 + }, + { + "epoch": 4.5280000000000005, + "grad_norm": 49.768287658691406, + "learning_rate": 9.480000000000001e-07, + "loss": 0.3782, + "step": 45280 + }, + { + "epoch": 4.529, + "grad_norm": 10.42574405670166, + "learning_rate": 9.460000000000001e-07, + "loss": 0.8404, + "step": 45290 + }, + { + "epoch": 4.53, + "grad_norm": 13.148209571838379, + "learning_rate": 9.440000000000001e-07, + "loss": 0.4263, + "step": 45300 + }, + { + "epoch": 4.531, + "grad_norm": 56.37141799926758, + "learning_rate": 9.420000000000002e-07, + "loss": 0.6954, + "step": 45310 + }, + { + "epoch": 4.532, + "grad_norm": 41.969112396240234, + "learning_rate": 9.400000000000001e-07, + "loss": 0.4243, + "step": 45320 + }, + { + "epoch": 4.533, + "grad_norm": 61.941429138183594, + "learning_rate": 9.380000000000001e-07, + "loss": 0.8364, + "step": 45330 + }, + { + "epoch": 4.534, + "grad_norm": 7.106180667877197, + "learning_rate": 9.360000000000001e-07, + "loss": 0.6544, + "step": 45340 + }, + { + "epoch": 4.535, + "grad_norm": 31.053434371948242, + "learning_rate": 9.340000000000001e-07, + "loss": 0.4281, + "step": 45350 + }, + { + "epoch": 4.536, + "grad_norm": 38.16424560546875, + "learning_rate": 9.320000000000001e-07, + "loss": 0.6195, + "step": 45360 + }, + { + "epoch": 4.537, + "grad_norm": 57.80653381347656, + "learning_rate": 9.300000000000001e-07, + "loss": 0.6443, + "step": 45370 + }, + { + "epoch": 4.538, + "grad_norm": 7.248365879058838, + "learning_rate": 9.28e-07, + "loss": 0.6943, + "step": 45380 + }, + { + "epoch": 4.539, + "grad_norm": 2.168619394302368, + "learning_rate": 9.260000000000001e-07, + "loss": 0.4539, + "step": 45390 + }, + { + "epoch": 4.54, + "grad_norm": 9.784174919128418, + "learning_rate": 9.240000000000001e-07, + "loss": 0.4954, + "step": 45400 + }, + { + "epoch": 4.541, + "grad_norm": 27.470226287841797, + "learning_rate": 9.220000000000001e-07, + "loss": 0.9503, + "step": 45410 + }, + { + "epoch": 4.542, + "grad_norm": 24.790925979614258, + "learning_rate": 9.200000000000001e-07, + "loss": 0.3663, + "step": 45420 + }, + { + "epoch": 4.543, + "grad_norm": 37.76428985595703, + "learning_rate": 9.180000000000001e-07, + "loss": 0.7152, + "step": 45430 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 29.315391540527344, + "learning_rate": 9.160000000000001e-07, + "loss": 0.3819, + "step": 45440 + }, + { + "epoch": 4.545, + "grad_norm": 19.65518569946289, + "learning_rate": 9.140000000000001e-07, + "loss": 0.3005, + "step": 45450 + }, + { + "epoch": 4.546, + "grad_norm": 4.487301349639893, + "learning_rate": 9.120000000000001e-07, + "loss": 0.4622, + "step": 45460 + }, + { + "epoch": 4.547, + "grad_norm": 41.73360824584961, + "learning_rate": 9.100000000000001e-07, + "loss": 0.5676, + "step": 45470 + }, + { + "epoch": 4.548, + "grad_norm": 48.73320770263672, + "learning_rate": 9.080000000000001e-07, + "loss": 0.8195, + "step": 45480 + }, + { + "epoch": 4.549, + "grad_norm": 12.341236114501953, + "learning_rate": 9.062000000000001e-07, + "loss": 0.3925, + "step": 45490 + }, + { + "epoch": 4.55, + "grad_norm": 4.098908424377441, + "learning_rate": 9.042000000000001e-07, + "loss": 0.4159, + "step": 45500 + }, + { + "epoch": 4.551, + "grad_norm": 22.557924270629883, + "learning_rate": 9.022e-07, + "loss": 0.2809, + "step": 45510 + }, + { + "epoch": 4.552, + "grad_norm": 19.807369232177734, + "learning_rate": 9.002000000000001e-07, + "loss": 0.6193, + "step": 45520 + }, + { + "epoch": 4.553, + "grad_norm": 30.91956901550293, + "learning_rate": 8.982e-07, + "loss": 0.9155, + "step": 45530 + }, + { + "epoch": 4.554, + "grad_norm": 30.81283187866211, + "learning_rate": 8.962000000000001e-07, + "loss": 0.5091, + "step": 45540 + }, + { + "epoch": 4.555, + "grad_norm": 39.03916549682617, + "learning_rate": 8.942e-07, + "loss": 0.4792, + "step": 45550 + }, + { + "epoch": 4.556, + "grad_norm": 30.683202743530273, + "learning_rate": 8.922e-07, + "loss": 0.5482, + "step": 45560 + }, + { + "epoch": 4.557, + "grad_norm": 36.451107025146484, + "learning_rate": 8.902000000000001e-07, + "loss": 0.5982, + "step": 45570 + }, + { + "epoch": 4.558, + "grad_norm": 19.348159790039062, + "learning_rate": 8.882e-07, + "loss": 0.4162, + "step": 45580 + }, + { + "epoch": 4.559, + "grad_norm": 69.45014953613281, + "learning_rate": 8.862000000000001e-07, + "loss": 0.6971, + "step": 45590 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 21.53099250793457, + "learning_rate": 8.842e-07, + "loss": 0.5507, + "step": 45600 + }, + { + "epoch": 4.561, + "grad_norm": 25.485855102539062, + "learning_rate": 8.822000000000001e-07, + "loss": 0.6958, + "step": 45610 + }, + { + "epoch": 4.562, + "grad_norm": 43.30933380126953, + "learning_rate": 8.802000000000001e-07, + "loss": 0.5381, + "step": 45620 + }, + { + "epoch": 4.563, + "grad_norm": 29.24176597595215, + "learning_rate": 8.782e-07, + "loss": 0.5499, + "step": 45630 + }, + { + "epoch": 4.564, + "grad_norm": 34.37154006958008, + "learning_rate": 8.762000000000001e-07, + "loss": 0.2587, + "step": 45640 + }, + { + "epoch": 4.5649999999999995, + "grad_norm": 31.237136840820312, + "learning_rate": 8.742e-07, + "loss": 0.6016, + "step": 45650 + }, + { + "epoch": 4.566, + "grad_norm": 35.17666244506836, + "learning_rate": 8.722000000000001e-07, + "loss": 0.4353, + "step": 45660 + }, + { + "epoch": 4.567, + "grad_norm": 24.52681541442871, + "learning_rate": 8.702e-07, + "loss": 0.8129, + "step": 45670 + }, + { + "epoch": 4.568, + "grad_norm": 73.54619598388672, + "learning_rate": 8.682e-07, + "loss": 0.4345, + "step": 45680 + }, + { + "epoch": 4.569, + "grad_norm": 51.41396713256836, + "learning_rate": 8.662000000000001e-07, + "loss": 0.5862, + "step": 45690 + }, + { + "epoch": 4.57, + "grad_norm": 22.1870059967041, + "learning_rate": 8.642e-07, + "loss": 0.8158, + "step": 45700 + }, + { + "epoch": 4.571, + "grad_norm": 53.16621398925781, + "learning_rate": 8.622000000000001e-07, + "loss": 0.5901, + "step": 45710 + }, + { + "epoch": 4.572, + "grad_norm": 52.27548599243164, + "learning_rate": 8.602e-07, + "loss": 0.5686, + "step": 45720 + }, + { + "epoch": 4.573, + "grad_norm": 30.82957649230957, + "learning_rate": 8.582e-07, + "loss": 0.4284, + "step": 45730 + }, + { + "epoch": 4.574, + "grad_norm": 4.987204551696777, + "learning_rate": 8.562000000000001e-07, + "loss": 0.5962, + "step": 45740 + }, + { + "epoch": 4.575, + "grad_norm": 44.10116195678711, + "learning_rate": 8.542e-07, + "loss": 0.8412, + "step": 45750 + }, + { + "epoch": 4.576, + "grad_norm": 18.577295303344727, + "learning_rate": 8.522000000000001e-07, + "loss": 0.7083, + "step": 45760 + }, + { + "epoch": 4.577, + "grad_norm": 30.12908363342285, + "learning_rate": 8.502e-07, + "loss": 0.7062, + "step": 45770 + }, + { + "epoch": 4.578, + "grad_norm": 5.816198825836182, + "learning_rate": 8.482000000000001e-07, + "loss": 0.8032, + "step": 45780 + }, + { + "epoch": 4.579, + "grad_norm": 5.501286506652832, + "learning_rate": 8.462e-07, + "loss": 0.3029, + "step": 45790 + }, + { + "epoch": 4.58, + "grad_norm": 41.75872039794922, + "learning_rate": 8.442e-07, + "loss": 0.975, + "step": 45800 + }, + { + "epoch": 4.5809999999999995, + "grad_norm": 28.56104278564453, + "learning_rate": 8.422000000000001e-07, + "loss": 0.6924, + "step": 45810 + }, + { + "epoch": 4.582, + "grad_norm": 64.78409576416016, + "learning_rate": 8.402e-07, + "loss": 0.6091, + "step": 45820 + }, + { + "epoch": 4.583, + "grad_norm": 25.520673751831055, + "learning_rate": 8.382000000000001e-07, + "loss": 0.6849, + "step": 45830 + }, + { + "epoch": 4.584, + "grad_norm": 15.34619140625, + "learning_rate": 8.362e-07, + "loss": 0.5021, + "step": 45840 + }, + { + "epoch": 4.585, + "grad_norm": 42.24744415283203, + "learning_rate": 8.342e-07, + "loss": 0.4582, + "step": 45850 + }, + { + "epoch": 4.586, + "grad_norm": 24.966739654541016, + "learning_rate": 8.322000000000001e-07, + "loss": 0.4246, + "step": 45860 + }, + { + "epoch": 4.587, + "grad_norm": 22.995031356811523, + "learning_rate": 8.302e-07, + "loss": 0.7483, + "step": 45870 + }, + { + "epoch": 4.588, + "grad_norm": 36.11094284057617, + "learning_rate": 8.282000000000001e-07, + "loss": 0.4904, + "step": 45880 + }, + { + "epoch": 4.589, + "grad_norm": 40.967899322509766, + "learning_rate": 8.262e-07, + "loss": 0.4971, + "step": 45890 + }, + { + "epoch": 4.59, + "grad_norm": 40.29697799682617, + "learning_rate": 8.242e-07, + "loss": 0.8347, + "step": 45900 + }, + { + "epoch": 4.591, + "grad_norm": 28.283031463623047, + "learning_rate": 8.222e-07, + "loss": 0.658, + "step": 45910 + }, + { + "epoch": 4.592, + "grad_norm": 25.174957275390625, + "learning_rate": 8.202e-07, + "loss": 0.6491, + "step": 45920 + }, + { + "epoch": 4.593, + "grad_norm": 62.129547119140625, + "learning_rate": 8.184000000000001e-07, + "loss": 0.6112, + "step": 45930 + }, + { + "epoch": 4.594, + "grad_norm": 26.676013946533203, + "learning_rate": 8.164000000000001e-07, + "loss": 0.4537, + "step": 45940 + }, + { + "epoch": 4.595, + "grad_norm": 66.71521759033203, + "learning_rate": 8.144000000000001e-07, + "loss": 0.6426, + "step": 45950 + }, + { + "epoch": 4.596, + "grad_norm": 52.26560974121094, + "learning_rate": 8.124000000000002e-07, + "loss": 0.5763, + "step": 45960 + }, + { + "epoch": 4.5969999999999995, + "grad_norm": 29.50721549987793, + "learning_rate": 8.104000000000001e-07, + "loss": 0.6662, + "step": 45970 + }, + { + "epoch": 4.598, + "grad_norm": 28.411151885986328, + "learning_rate": 8.084000000000001e-07, + "loss": 0.4994, + "step": 45980 + }, + { + "epoch": 4.599, + "grad_norm": 19.696985244750977, + "learning_rate": 8.064000000000001e-07, + "loss": 0.6327, + "step": 45990 + }, + { + "epoch": 4.6, + "grad_norm": 34.80906295776367, + "learning_rate": 8.044000000000001e-07, + "loss": 0.3584, + "step": 46000 + }, + { + "epoch": 4.601, + "grad_norm": 40.034202575683594, + "learning_rate": 8.024000000000001e-07, + "loss": 0.4287, + "step": 46010 + }, + { + "epoch": 4.602, + "grad_norm": 65.08027648925781, + "learning_rate": 8.004000000000001e-07, + "loss": 0.531, + "step": 46020 + }, + { + "epoch": 4.603, + "grad_norm": 46.98908615112305, + "learning_rate": 7.984000000000001e-07, + "loss": 0.469, + "step": 46030 + }, + { + "epoch": 4.604, + "grad_norm": 35.33990478515625, + "learning_rate": 7.964000000000001e-07, + "loss": 0.532, + "step": 46040 + }, + { + "epoch": 4.605, + "grad_norm": 25.324203491210938, + "learning_rate": 7.944000000000001e-07, + "loss": 0.571, + "step": 46050 + }, + { + "epoch": 4.606, + "grad_norm": 7.645961284637451, + "learning_rate": 7.924000000000001e-07, + "loss": 0.4075, + "step": 46060 + }, + { + "epoch": 4.607, + "grad_norm": 12.345396041870117, + "learning_rate": 7.904000000000001e-07, + "loss": 0.5891, + "step": 46070 + }, + { + "epoch": 4.608, + "grad_norm": 8.199934959411621, + "learning_rate": 7.884e-07, + "loss": 0.6039, + "step": 46080 + }, + { + "epoch": 4.609, + "grad_norm": 3.992199420928955, + "learning_rate": 7.864000000000001e-07, + "loss": 0.6168, + "step": 46090 + }, + { + "epoch": 4.61, + "grad_norm": 82.65238189697266, + "learning_rate": 7.844000000000001e-07, + "loss": 0.9055, + "step": 46100 + }, + { + "epoch": 4.611, + "grad_norm": 3.730809450149536, + "learning_rate": 7.824000000000001e-07, + "loss": 0.5675, + "step": 46110 + }, + { + "epoch": 4.612, + "grad_norm": 48.9563102722168, + "learning_rate": 7.804000000000001e-07, + "loss": 0.6902, + "step": 46120 + }, + { + "epoch": 4.6129999999999995, + "grad_norm": 7.581906795501709, + "learning_rate": 7.784000000000001e-07, + "loss": 0.8163, + "step": 46130 + }, + { + "epoch": 4.614, + "grad_norm": 2.6554839611053467, + "learning_rate": 7.764000000000001e-07, + "loss": 0.4846, + "step": 46140 + }, + { + "epoch": 4.615, + "grad_norm": 40.605262756347656, + "learning_rate": 7.744e-07, + "loss": 1.097, + "step": 46150 + }, + { + "epoch": 4.616, + "grad_norm": 30.826297760009766, + "learning_rate": 7.724000000000001e-07, + "loss": 0.5738, + "step": 46160 + }, + { + "epoch": 4.617, + "grad_norm": 8.789215087890625, + "learning_rate": 7.704000000000001e-07, + "loss": 0.3848, + "step": 46170 + }, + { + "epoch": 4.618, + "grad_norm": 7.239010810852051, + "learning_rate": 7.684000000000001e-07, + "loss": 0.5266, + "step": 46180 + }, + { + "epoch": 4.619, + "grad_norm": 62.47308349609375, + "learning_rate": 7.664000000000001e-07, + "loss": 0.8456, + "step": 46190 + }, + { + "epoch": 4.62, + "grad_norm": 38.9660758972168, + "learning_rate": 7.644e-07, + "loss": 0.7502, + "step": 46200 + }, + { + "epoch": 4.621, + "grad_norm": 59.97346878051758, + "learning_rate": 7.624000000000001e-07, + "loss": 0.4927, + "step": 46210 + }, + { + "epoch": 4.622, + "grad_norm": 6.728061676025391, + "learning_rate": 7.604000000000001e-07, + "loss": 0.4951, + "step": 46220 + }, + { + "epoch": 4.623, + "grad_norm": 25.506572723388672, + "learning_rate": 7.584000000000001e-07, + "loss": 0.4714, + "step": 46230 + }, + { + "epoch": 4.624, + "grad_norm": 85.05602264404297, + "learning_rate": 7.564000000000001e-07, + "loss": 0.5414, + "step": 46240 + }, + { + "epoch": 4.625, + "grad_norm": 31.7155818939209, + "learning_rate": 7.544e-07, + "loss": 0.5379, + "step": 46250 + }, + { + "epoch": 4.626, + "grad_norm": 6.111644268035889, + "learning_rate": 7.524000000000001e-07, + "loss": 0.4328, + "step": 46260 + }, + { + "epoch": 4.627, + "grad_norm": 41.613582611083984, + "learning_rate": 7.504e-07, + "loss": 0.7449, + "step": 46270 + }, + { + "epoch": 4.628, + "grad_norm": 62.398040771484375, + "learning_rate": 7.484000000000001e-07, + "loss": 0.4983, + "step": 46280 + }, + { + "epoch": 4.629, + "grad_norm": 71.82337951660156, + "learning_rate": 7.464000000000001e-07, + "loss": 0.6349, + "step": 46290 + }, + { + "epoch": 4.63, + "grad_norm": 3.016054153442383, + "learning_rate": 7.444000000000001e-07, + "loss": 0.6469, + "step": 46300 + }, + { + "epoch": 4.631, + "grad_norm": 64.6484603881836, + "learning_rate": 7.424000000000001e-07, + "loss": 0.504, + "step": 46310 + }, + { + "epoch": 4.632, + "grad_norm": 66.25682067871094, + "learning_rate": 7.404e-07, + "loss": 0.6037, + "step": 46320 + }, + { + "epoch": 4.633, + "grad_norm": 16.990617752075195, + "learning_rate": 7.384000000000001e-07, + "loss": 0.4708, + "step": 46330 + }, + { + "epoch": 4.634, + "grad_norm": 42.48113250732422, + "learning_rate": 7.364000000000001e-07, + "loss": 0.8633, + "step": 46340 + }, + { + "epoch": 4.635, + "grad_norm": 20.38407325744629, + "learning_rate": 7.344000000000001e-07, + "loss": 0.6119, + "step": 46350 + }, + { + "epoch": 4.636, + "grad_norm": 60.075557708740234, + "learning_rate": 7.324000000000001e-07, + "loss": 0.5929, + "step": 46360 + }, + { + "epoch": 4.6370000000000005, + "grad_norm": 5.794075012207031, + "learning_rate": 7.304e-07, + "loss": 0.5081, + "step": 46370 + }, + { + "epoch": 4.638, + "grad_norm": 15.373174667358398, + "learning_rate": 7.284000000000001e-07, + "loss": 0.4686, + "step": 46380 + }, + { + "epoch": 4.639, + "grad_norm": 16.02683448791504, + "learning_rate": 7.264e-07, + "loss": 0.3858, + "step": 46390 + }, + { + "epoch": 4.64, + "grad_norm": 35.6048583984375, + "learning_rate": 7.244000000000001e-07, + "loss": 0.4706, + "step": 46400 + }, + { + "epoch": 4.641, + "grad_norm": 16.356292724609375, + "learning_rate": 7.224000000000001e-07, + "loss": 0.3739, + "step": 46410 + }, + { + "epoch": 4.642, + "grad_norm": 43.27982711791992, + "learning_rate": 7.204000000000001e-07, + "loss": 0.529, + "step": 46420 + }, + { + "epoch": 4.643, + "grad_norm": 38.51179122924805, + "learning_rate": 7.184000000000001e-07, + "loss": 0.6962, + "step": 46430 + }, + { + "epoch": 4.644, + "grad_norm": 20.879756927490234, + "learning_rate": 7.164e-07, + "loss": 0.6176, + "step": 46440 + }, + { + "epoch": 4.645, + "grad_norm": 45.5706672668457, + "learning_rate": 7.144000000000001e-07, + "loss": 0.8066, + "step": 46450 + }, + { + "epoch": 4.646, + "grad_norm": 54.51177215576172, + "learning_rate": 7.124e-07, + "loss": 0.4615, + "step": 46460 + }, + { + "epoch": 4.647, + "grad_norm": 87.40930938720703, + "learning_rate": 7.104000000000001e-07, + "loss": 0.4578, + "step": 46470 + }, + { + "epoch": 4.648, + "grad_norm": 7.970445156097412, + "learning_rate": 7.084000000000001e-07, + "loss": 0.3683, + "step": 46480 + }, + { + "epoch": 4.649, + "grad_norm": 45.14849853515625, + "learning_rate": 7.064e-07, + "loss": 0.8526, + "step": 46490 + }, + { + "epoch": 4.65, + "grad_norm": 7.996401786804199, + "learning_rate": 7.044000000000001e-07, + "loss": 0.2735, + "step": 46500 + }, + { + "epoch": 4.651, + "grad_norm": 27.318073272705078, + "learning_rate": 7.024e-07, + "loss": 0.6858, + "step": 46510 + }, + { + "epoch": 4.652, + "grad_norm": 28.923954010009766, + "learning_rate": 7.004000000000001e-07, + "loss": 0.2842, + "step": 46520 + }, + { + "epoch": 4.6530000000000005, + "grad_norm": 33.51847457885742, + "learning_rate": 6.984000000000001e-07, + "loss": 0.5165, + "step": 46530 + }, + { + "epoch": 4.654, + "grad_norm": 52.75628662109375, + "learning_rate": 6.964e-07, + "loss": 0.6128, + "step": 46540 + }, + { + "epoch": 4.655, + "grad_norm": 42.89748764038086, + "learning_rate": 6.944000000000001e-07, + "loss": 0.4989, + "step": 46550 + }, + { + "epoch": 4.656, + "grad_norm": 46.054161071777344, + "learning_rate": 6.924e-07, + "loss": 0.6215, + "step": 46560 + }, + { + "epoch": 4.657, + "grad_norm": 12.866451263427734, + "learning_rate": 6.904000000000001e-07, + "loss": 0.4183, + "step": 46570 + }, + { + "epoch": 4.658, + "grad_norm": 13.437576293945312, + "learning_rate": 6.884e-07, + "loss": 0.5324, + "step": 46580 + }, + { + "epoch": 4.659, + "grad_norm": 7.054804801940918, + "learning_rate": 6.864000000000001e-07, + "loss": 0.697, + "step": 46590 + }, + { + "epoch": 4.66, + "grad_norm": 6.7603373527526855, + "learning_rate": 6.844000000000001e-07, + "loss": 0.3793, + "step": 46600 + }, + { + "epoch": 4.661, + "grad_norm": 5.329585075378418, + "learning_rate": 6.824e-07, + "loss": 0.7276, + "step": 46610 + }, + { + "epoch": 4.662, + "grad_norm": 24.512773513793945, + "learning_rate": 6.804000000000001e-07, + "loss": 0.4644, + "step": 46620 + }, + { + "epoch": 4.663, + "grad_norm": 5.408405780792236, + "learning_rate": 6.784e-07, + "loss": 0.3011, + "step": 46630 + }, + { + "epoch": 4.664, + "grad_norm": 56.339935302734375, + "learning_rate": 6.764000000000001e-07, + "loss": 0.6349, + "step": 46640 + }, + { + "epoch": 4.665, + "grad_norm": 1.9910184144973755, + "learning_rate": 6.744000000000001e-07, + "loss": 0.7266, + "step": 46650 + }, + { + "epoch": 4.666, + "grad_norm": 47.08732986450195, + "learning_rate": 6.724e-07, + "loss": 0.5365, + "step": 46660 + }, + { + "epoch": 4.667, + "grad_norm": 77.28034973144531, + "learning_rate": 6.704000000000001e-07, + "loss": 0.6119, + "step": 46670 + }, + { + "epoch": 4.668, + "grad_norm": 61.18629837036133, + "learning_rate": 6.684e-07, + "loss": 0.6465, + "step": 46680 + }, + { + "epoch": 4.6690000000000005, + "grad_norm": 37.235626220703125, + "learning_rate": 6.664000000000001e-07, + "loss": 0.504, + "step": 46690 + }, + { + "epoch": 4.67, + "grad_norm": 11.716021537780762, + "learning_rate": 6.644e-07, + "loss": 0.5464, + "step": 46700 + }, + { + "epoch": 4.671, + "grad_norm": 40.31194305419922, + "learning_rate": 6.624e-07, + "loss": 0.2915, + "step": 46710 + }, + { + "epoch": 4.672, + "grad_norm": 41.208595275878906, + "learning_rate": 6.604000000000001e-07, + "loss": 0.759, + "step": 46720 + }, + { + "epoch": 4.673, + "grad_norm": 37.54507827758789, + "learning_rate": 6.584e-07, + "loss": 0.6278, + "step": 46730 + }, + { + "epoch": 4.674, + "grad_norm": 52.48876953125, + "learning_rate": 6.564000000000001e-07, + "loss": 0.6083, + "step": 46740 + }, + { + "epoch": 4.675, + "grad_norm": 20.885011672973633, + "learning_rate": 6.544e-07, + "loss": 0.6174, + "step": 46750 + }, + { + "epoch": 4.676, + "grad_norm": 41.223793029785156, + "learning_rate": 6.524000000000001e-07, + "loss": 0.6118, + "step": 46760 + }, + { + "epoch": 4.677, + "grad_norm": 28.714216232299805, + "learning_rate": 6.504000000000001e-07, + "loss": 0.54, + "step": 46770 + }, + { + "epoch": 4.678, + "grad_norm": 23.326507568359375, + "learning_rate": 6.484e-07, + "loss": 0.6858, + "step": 46780 + }, + { + "epoch": 4.679, + "grad_norm": 5.641720771789551, + "learning_rate": 6.464000000000001e-07, + "loss": 0.5215, + "step": 46790 + }, + { + "epoch": 4.68, + "grad_norm": 12.176937103271484, + "learning_rate": 6.444e-07, + "loss": 0.8599, + "step": 46800 + }, + { + "epoch": 4.681, + "grad_norm": 15.131599426269531, + "learning_rate": 6.424000000000001e-07, + "loss": 0.5364, + "step": 46810 + }, + { + "epoch": 4.682, + "grad_norm": 17.37408447265625, + "learning_rate": 6.404e-07, + "loss": 0.6698, + "step": 46820 + }, + { + "epoch": 4.683, + "grad_norm": 49.73809814453125, + "learning_rate": 6.384e-07, + "loss": 0.7712, + "step": 46830 + }, + { + "epoch": 4.684, + "grad_norm": 36.00857925415039, + "learning_rate": 6.364000000000001e-07, + "loss": 0.6566, + "step": 46840 + }, + { + "epoch": 4.6850000000000005, + "grad_norm": 41.920127868652344, + "learning_rate": 6.344e-07, + "loss": 0.5428, + "step": 46850 + }, + { + "epoch": 4.686, + "grad_norm": 45.1785888671875, + "learning_rate": 6.324000000000001e-07, + "loss": 0.4806, + "step": 46860 + }, + { + "epoch": 4.687, + "grad_norm": 44.14488220214844, + "learning_rate": 6.304e-07, + "loss": 0.6205, + "step": 46870 + }, + { + "epoch": 4.688, + "grad_norm": 1.7023221254348755, + "learning_rate": 6.284e-07, + "loss": 0.4456, + "step": 46880 + }, + { + "epoch": 4.689, + "grad_norm": 13.15688419342041, + "learning_rate": 6.266e-07, + "loss": 0.6845, + "step": 46890 + }, + { + "epoch": 4.6899999999999995, + "grad_norm": 15.498842239379883, + "learning_rate": 6.246e-07, + "loss": 0.5764, + "step": 46900 + }, + { + "epoch": 4.691, + "grad_norm": 23.742435455322266, + "learning_rate": 6.226000000000001e-07, + "loss": 0.5667, + "step": 46910 + }, + { + "epoch": 4.692, + "grad_norm": 16.880615234375, + "learning_rate": 6.206e-07, + "loss": 0.3689, + "step": 46920 + }, + { + "epoch": 4.693, + "grad_norm": 34.34412384033203, + "learning_rate": 6.186e-07, + "loss": 0.5903, + "step": 46930 + }, + { + "epoch": 4.694, + "grad_norm": 41.17317199707031, + "learning_rate": 6.166000000000001e-07, + "loss": 0.4317, + "step": 46940 + }, + { + "epoch": 4.695, + "grad_norm": 40.64748001098633, + "learning_rate": 6.146e-07, + "loss": 0.4934, + "step": 46950 + }, + { + "epoch": 4.696, + "grad_norm": 49.303955078125, + "learning_rate": 6.126000000000001e-07, + "loss": 0.5757, + "step": 46960 + }, + { + "epoch": 4.697, + "grad_norm": 43.190826416015625, + "learning_rate": 6.106000000000001e-07, + "loss": 0.6163, + "step": 46970 + }, + { + "epoch": 4.698, + "grad_norm": 9.210566520690918, + "learning_rate": 6.086e-07, + "loss": 0.3262, + "step": 46980 + }, + { + "epoch": 4.699, + "grad_norm": 41.45174026489258, + "learning_rate": 6.066e-07, + "loss": 0.6636, + "step": 46990 + }, + { + "epoch": 4.7, + "grad_norm": 42.5493049621582, + "learning_rate": 6.046e-07, + "loss": 0.7651, + "step": 47000 + }, + { + "epoch": 4.701, + "grad_norm": 14.347480773925781, + "learning_rate": 6.026000000000001e-07, + "loss": 0.7254, + "step": 47010 + }, + { + "epoch": 4.702, + "grad_norm": 55.891910552978516, + "learning_rate": 6.006000000000001e-07, + "loss": 0.6508, + "step": 47020 + }, + { + "epoch": 4.703, + "grad_norm": 66.14480590820312, + "learning_rate": 5.986e-07, + "loss": 0.5825, + "step": 47030 + }, + { + "epoch": 4.704, + "grad_norm": 29.104063034057617, + "learning_rate": 5.966e-07, + "loss": 0.6674, + "step": 47040 + }, + { + "epoch": 4.705, + "grad_norm": 12.830475807189941, + "learning_rate": 5.946e-07, + "loss": 0.3653, + "step": 47050 + }, + { + "epoch": 4.7059999999999995, + "grad_norm": 30.010398864746094, + "learning_rate": 5.926e-07, + "loss": 0.8521, + "step": 47060 + }, + { + "epoch": 4.707, + "grad_norm": 31.613298416137695, + "learning_rate": 5.906000000000001e-07, + "loss": 0.7231, + "step": 47070 + }, + { + "epoch": 4.708, + "grad_norm": 17.527433395385742, + "learning_rate": 5.886000000000001e-07, + "loss": 0.4612, + "step": 47080 + }, + { + "epoch": 4.709, + "grad_norm": 55.67793655395508, + "learning_rate": 5.866e-07, + "loss": 0.373, + "step": 47090 + }, + { + "epoch": 4.71, + "grad_norm": 40.79391098022461, + "learning_rate": 5.846e-07, + "loss": 0.5253, + "step": 47100 + }, + { + "epoch": 4.711, + "grad_norm": 18.90387725830078, + "learning_rate": 5.826e-07, + "loss": 0.6227, + "step": 47110 + }, + { + "epoch": 4.712, + "grad_norm": 83.02630615234375, + "learning_rate": 5.806000000000001e-07, + "loss": 0.3808, + "step": 47120 + }, + { + "epoch": 4.713, + "grad_norm": 11.049756050109863, + "learning_rate": 5.786000000000001e-07, + "loss": 0.6286, + "step": 47130 + }, + { + "epoch": 4.714, + "grad_norm": 46.445465087890625, + "learning_rate": 5.766000000000001e-07, + "loss": 0.5005, + "step": 47140 + }, + { + "epoch": 4.715, + "grad_norm": 58.55226135253906, + "learning_rate": 5.746e-07, + "loss": 0.708, + "step": 47150 + }, + { + "epoch": 4.716, + "grad_norm": 37.50386047363281, + "learning_rate": 5.726e-07, + "loss": 0.5185, + "step": 47160 + }, + { + "epoch": 4.717, + "grad_norm": 18.741193771362305, + "learning_rate": 5.706000000000001e-07, + "loss": 0.7934, + "step": 47170 + }, + { + "epoch": 4.718, + "grad_norm": 31.67978858947754, + "learning_rate": 5.686e-07, + "loss": 0.5104, + "step": 47180 + }, + { + "epoch": 4.719, + "grad_norm": 58.85725021362305, + "learning_rate": 5.666000000000001e-07, + "loss": 0.5489, + "step": 47190 + }, + { + "epoch": 4.72, + "grad_norm": 42.56383514404297, + "learning_rate": 5.646000000000001e-07, + "loss": 0.3513, + "step": 47200 + }, + { + "epoch": 4.721, + "grad_norm": 21.690288543701172, + "learning_rate": 5.626e-07, + "loss": 0.4654, + "step": 47210 + }, + { + "epoch": 4.7219999999999995, + "grad_norm": 42.254913330078125, + "learning_rate": 5.606000000000001e-07, + "loss": 0.6836, + "step": 47220 + }, + { + "epoch": 4.723, + "grad_norm": 52.095916748046875, + "learning_rate": 5.586e-07, + "loss": 0.4583, + "step": 47230 + }, + { + "epoch": 4.724, + "grad_norm": 74.2481918334961, + "learning_rate": 5.566000000000001e-07, + "loss": 0.4986, + "step": 47240 + }, + { + "epoch": 4.725, + "grad_norm": 27.45410919189453, + "learning_rate": 5.546000000000001e-07, + "loss": 0.7195, + "step": 47250 + }, + { + "epoch": 4.726, + "grad_norm": 14.02073860168457, + "learning_rate": 5.526e-07, + "loss": 0.5896, + "step": 47260 + }, + { + "epoch": 4.727, + "grad_norm": 27.584619522094727, + "learning_rate": 5.506000000000001e-07, + "loss": 0.5507, + "step": 47270 + }, + { + "epoch": 4.728, + "grad_norm": 23.282672882080078, + "learning_rate": 5.486e-07, + "loss": 0.4576, + "step": 47280 + }, + { + "epoch": 4.729, + "grad_norm": 40.43513107299805, + "learning_rate": 5.466000000000001e-07, + "loss": 0.7413, + "step": 47290 + }, + { + "epoch": 4.73, + "grad_norm": 43.14529800415039, + "learning_rate": 5.446e-07, + "loss": 0.5918, + "step": 47300 + }, + { + "epoch": 4.731, + "grad_norm": 41.957130432128906, + "learning_rate": 5.426000000000001e-07, + "loss": 0.365, + "step": 47310 + }, + { + "epoch": 4.732, + "grad_norm": 41.200340270996094, + "learning_rate": 5.406000000000001e-07, + "loss": 0.5841, + "step": 47320 + }, + { + "epoch": 4.733, + "grad_norm": 2.041828155517578, + "learning_rate": 5.386e-07, + "loss": 0.5923, + "step": 47330 + }, + { + "epoch": 4.734, + "grad_norm": 44.379859924316406, + "learning_rate": 5.366000000000001e-07, + "loss": 0.4744, + "step": 47340 + }, + { + "epoch": 4.735, + "grad_norm": 16.02285385131836, + "learning_rate": 5.346e-07, + "loss": 0.5911, + "step": 47350 + }, + { + "epoch": 4.736, + "grad_norm": 47.35900115966797, + "learning_rate": 5.326000000000001e-07, + "loss": 0.4788, + "step": 47360 + }, + { + "epoch": 4.737, + "grad_norm": 8.870173454284668, + "learning_rate": 5.306e-07, + "loss": 0.3585, + "step": 47370 + }, + { + "epoch": 4.7379999999999995, + "grad_norm": 27.033184051513672, + "learning_rate": 5.286e-07, + "loss": 0.3327, + "step": 47380 + }, + { + "epoch": 4.739, + "grad_norm": 11.23887825012207, + "learning_rate": 5.266000000000001e-07, + "loss": 0.7588, + "step": 47390 + }, + { + "epoch": 4.74, + "grad_norm": 25.569849014282227, + "learning_rate": 5.246e-07, + "loss": 0.3565, + "step": 47400 + }, + { + "epoch": 4.741, + "grad_norm": 59.0633430480957, + "learning_rate": 5.226000000000001e-07, + "loss": 1.0683, + "step": 47410 + }, + { + "epoch": 4.742, + "grad_norm": 7.970805644989014, + "learning_rate": 5.206e-07, + "loss": 0.7565, + "step": 47420 + }, + { + "epoch": 4.743, + "grad_norm": 24.26109504699707, + "learning_rate": 5.186000000000001e-07, + "loss": 0.7171, + "step": 47430 + }, + { + "epoch": 4.744, + "grad_norm": 63.284576416015625, + "learning_rate": 5.166000000000001e-07, + "loss": 0.6909, + "step": 47440 + }, + { + "epoch": 4.745, + "grad_norm": 46.69160842895508, + "learning_rate": 5.146e-07, + "loss": 0.777, + "step": 47450 + }, + { + "epoch": 4.746, + "grad_norm": 33.398040771484375, + "learning_rate": 5.126000000000001e-07, + "loss": 0.5602, + "step": 47460 + }, + { + "epoch": 4.747, + "grad_norm": 1.8984227180480957, + "learning_rate": 5.106e-07, + "loss": 0.4943, + "step": 47470 + }, + { + "epoch": 4.748, + "grad_norm": 35.92313003540039, + "learning_rate": 5.086000000000001e-07, + "loss": 0.4842, + "step": 47480 + }, + { + "epoch": 4.749, + "grad_norm": 33.2078742980957, + "learning_rate": 5.066e-07, + "loss": 0.4065, + "step": 47490 + }, + { + "epoch": 4.75, + "grad_norm": 3.545969247817993, + "learning_rate": 5.046e-07, + "loss": 0.4536, + "step": 47500 + }, + { + "epoch": 4.751, + "grad_norm": 27.299518585205078, + "learning_rate": 5.026000000000001e-07, + "loss": 0.5854, + "step": 47510 + }, + { + "epoch": 4.752, + "grad_norm": 57.00627517700195, + "learning_rate": 5.006e-07, + "loss": 0.6149, + "step": 47520 + }, + { + "epoch": 4.753, + "grad_norm": 11.110939979553223, + "learning_rate": 4.986000000000001e-07, + "loss": 0.5755, + "step": 47530 + }, + { + "epoch": 4.754, + "grad_norm": 31.530773162841797, + "learning_rate": 4.966e-07, + "loss": 0.5945, + "step": 47540 + }, + { + "epoch": 4.755, + "grad_norm": 44.44669723510742, + "learning_rate": 4.946e-07, + "loss": 0.5209, + "step": 47550 + }, + { + "epoch": 4.756, + "grad_norm": 2.758739948272705, + "learning_rate": 4.926000000000001e-07, + "loss": 0.3809, + "step": 47560 + }, + { + "epoch": 4.757, + "grad_norm": 49.22467041015625, + "learning_rate": 4.906e-07, + "loss": 0.8543, + "step": 47570 + }, + { + "epoch": 4.758, + "grad_norm": 7.413259983062744, + "learning_rate": 4.886000000000001e-07, + "loss": 0.5036, + "step": 47580 + }, + { + "epoch": 4.759, + "grad_norm": 16.42161750793457, + "learning_rate": 4.866e-07, + "loss": 0.5253, + "step": 47590 + }, + { + "epoch": 4.76, + "grad_norm": 55.729469299316406, + "learning_rate": 4.846000000000001e-07, + "loss": 0.5325, + "step": 47600 + }, + { + "epoch": 4.761, + "grad_norm": 13.986032485961914, + "learning_rate": 4.826e-07, + "loss": 0.7039, + "step": 47610 + }, + { + "epoch": 4.7620000000000005, + "grad_norm": 41.571712493896484, + "learning_rate": 4.806e-07, + "loss": 0.6829, + "step": 47620 + }, + { + "epoch": 4.763, + "grad_norm": 29.633747100830078, + "learning_rate": 4.786000000000001e-07, + "loss": 0.8681, + "step": 47630 + }, + { + "epoch": 4.764, + "grad_norm": 16.89261817932129, + "learning_rate": 4.7660000000000007e-07, + "loss": 0.4153, + "step": 47640 + }, + { + "epoch": 4.765, + "grad_norm": 21.218664169311523, + "learning_rate": 4.746000000000001e-07, + "loss": 0.467, + "step": 47650 + }, + { + "epoch": 4.766, + "grad_norm": 14.788098335266113, + "learning_rate": 4.726000000000001e-07, + "loss": 0.5418, + "step": 47660 + }, + { + "epoch": 4.767, + "grad_norm": 8.380966186523438, + "learning_rate": 4.706e-07, + "loss": 0.7172, + "step": 47670 + }, + { + "epoch": 4.768, + "grad_norm": 27.72803497314453, + "learning_rate": 4.6860000000000005e-07, + "loss": 0.6403, + "step": 47680 + }, + { + "epoch": 4.769, + "grad_norm": 48.69464874267578, + "learning_rate": 4.6660000000000006e-07, + "loss": 0.628, + "step": 47690 + }, + { + "epoch": 4.77, + "grad_norm": 16.649084091186523, + "learning_rate": 4.6460000000000007e-07, + "loss": 0.5426, + "step": 47700 + }, + { + "epoch": 4.771, + "grad_norm": 47.99180603027344, + "learning_rate": 4.626000000000001e-07, + "loss": 0.5476, + "step": 47710 + }, + { + "epoch": 4.772, + "grad_norm": 9.165487289428711, + "learning_rate": 4.606e-07, + "loss": 0.6779, + "step": 47720 + }, + { + "epoch": 4.773, + "grad_norm": 36.032318115234375, + "learning_rate": 4.5860000000000004e-07, + "loss": 0.5837, + "step": 47730 + }, + { + "epoch": 4.774, + "grad_norm": 76.74220275878906, + "learning_rate": 4.5660000000000005e-07, + "loss": 0.7682, + "step": 47740 + }, + { + "epoch": 4.775, + "grad_norm": 76.99014282226562, + "learning_rate": 4.5460000000000006e-07, + "loss": 0.4876, + "step": 47750 + }, + { + "epoch": 4.776, + "grad_norm": 65.31263732910156, + "learning_rate": 4.5260000000000007e-07, + "loss": 0.7052, + "step": 47760 + }, + { + "epoch": 4.777, + "grad_norm": 86.4690170288086, + "learning_rate": 4.506000000000001e-07, + "loss": 0.654, + "step": 47770 + }, + { + "epoch": 4.7780000000000005, + "grad_norm": 46.257225036621094, + "learning_rate": 4.4860000000000003e-07, + "loss": 0.9597, + "step": 47780 + }, + { + "epoch": 4.779, + "grad_norm": 18.706884384155273, + "learning_rate": 4.4660000000000004e-07, + "loss": 0.3825, + "step": 47790 + }, + { + "epoch": 4.78, + "grad_norm": 35.63112258911133, + "learning_rate": 4.4460000000000005e-07, + "loss": 0.7898, + "step": 47800 + }, + { + "epoch": 4.781, + "grad_norm": 91.87278747558594, + "learning_rate": 4.4260000000000006e-07, + "loss": 0.9079, + "step": 47810 + }, + { + "epoch": 4.782, + "grad_norm": 3.9658284187316895, + "learning_rate": 4.4060000000000007e-07, + "loss": 0.5102, + "step": 47820 + }, + { + "epoch": 4.783, + "grad_norm": 50.287960052490234, + "learning_rate": 4.386000000000001e-07, + "loss": 0.5358, + "step": 47830 + }, + { + "epoch": 4.784, + "grad_norm": 50.575645446777344, + "learning_rate": 4.3660000000000003e-07, + "loss": 0.559, + "step": 47840 + }, + { + "epoch": 4.785, + "grad_norm": 73.50508880615234, + "learning_rate": 4.3460000000000004e-07, + "loss": 0.5853, + "step": 47850 + }, + { + "epoch": 4.786, + "grad_norm": 40.10487747192383, + "learning_rate": 4.3260000000000005e-07, + "loss": 0.6971, + "step": 47860 + }, + { + "epoch": 4.787, + "grad_norm": 48.18107223510742, + "learning_rate": 4.3060000000000006e-07, + "loss": 0.5523, + "step": 47870 + }, + { + "epoch": 4.788, + "grad_norm": 44.405696868896484, + "learning_rate": 4.2860000000000007e-07, + "loss": 0.6391, + "step": 47880 + }, + { + "epoch": 4.789, + "grad_norm": 42.72483444213867, + "learning_rate": 4.266e-07, + "loss": 0.7918, + "step": 47890 + }, + { + "epoch": 4.79, + "grad_norm": 70.26227569580078, + "learning_rate": 4.2460000000000003e-07, + "loss": 0.8501, + "step": 47900 + }, + { + "epoch": 4.791, + "grad_norm": 31.3819522857666, + "learning_rate": 4.2260000000000004e-07, + "loss": 0.4523, + "step": 47910 + }, + { + "epoch": 4.792, + "grad_norm": 4.014766216278076, + "learning_rate": 4.2060000000000005e-07, + "loss": 0.7206, + "step": 47920 + }, + { + "epoch": 4.793, + "grad_norm": 43.470123291015625, + "learning_rate": 4.1860000000000006e-07, + "loss": 0.6754, + "step": 47930 + }, + { + "epoch": 4.7940000000000005, + "grad_norm": 29.402721405029297, + "learning_rate": 4.1660000000000006e-07, + "loss": 0.4421, + "step": 47940 + }, + { + "epoch": 4.795, + "grad_norm": 15.759544372558594, + "learning_rate": 4.146e-07, + "loss": 0.6538, + "step": 47950 + }, + { + "epoch": 4.796, + "grad_norm": 59.475364685058594, + "learning_rate": 4.1260000000000003e-07, + "loss": 0.4141, + "step": 47960 + }, + { + "epoch": 4.797, + "grad_norm": 15.005043983459473, + "learning_rate": 4.1060000000000004e-07, + "loss": 0.4547, + "step": 47970 + }, + { + "epoch": 4.798, + "grad_norm": 38.049049377441406, + "learning_rate": 4.0860000000000005e-07, + "loss": 0.5253, + "step": 47980 + }, + { + "epoch": 4.799, + "grad_norm": 59.97571563720703, + "learning_rate": 4.0660000000000005e-07, + "loss": 0.4392, + "step": 47990 + }, + { + "epoch": 4.8, + "grad_norm": 17.64630699157715, + "learning_rate": 4.0460000000000006e-07, + "loss": 0.9641, + "step": 48000 + }, + { + "epoch": 4.801, + "grad_norm": 1.9898121356964111, + "learning_rate": 4.026e-07, + "loss": 0.4899, + "step": 48010 + }, + { + "epoch": 4.802, + "grad_norm": 11.052675247192383, + "learning_rate": 4.0060000000000003e-07, + "loss": 0.4204, + "step": 48020 + }, + { + "epoch": 4.803, + "grad_norm": 17.712017059326172, + "learning_rate": 3.9860000000000004e-07, + "loss": 0.3256, + "step": 48030 + }, + { + "epoch": 4.804, + "grad_norm": 51.75345230102539, + "learning_rate": 3.9660000000000004e-07, + "loss": 0.59, + "step": 48040 + }, + { + "epoch": 4.805, + "grad_norm": 10.139244079589844, + "learning_rate": 3.9460000000000005e-07, + "loss": 0.4246, + "step": 48050 + }, + { + "epoch": 4.806, + "grad_norm": 4.2397565841674805, + "learning_rate": 3.9260000000000006e-07, + "loss": 0.6121, + "step": 48060 + }, + { + "epoch": 4.807, + "grad_norm": 54.398658752441406, + "learning_rate": 3.906e-07, + "loss": 0.5002, + "step": 48070 + }, + { + "epoch": 4.808, + "grad_norm": 7.418992519378662, + "learning_rate": 3.886e-07, + "loss": 0.6649, + "step": 48080 + }, + { + "epoch": 4.809, + "grad_norm": 47.92473220825195, + "learning_rate": 3.8660000000000003e-07, + "loss": 0.8182, + "step": 48090 + }, + { + "epoch": 4.8100000000000005, + "grad_norm": 4.537411212921143, + "learning_rate": 3.8460000000000004e-07, + "loss": 0.7614, + "step": 48100 + }, + { + "epoch": 4.811, + "grad_norm": 48.04644012451172, + "learning_rate": 3.8260000000000005e-07, + "loss": 0.3634, + "step": 48110 + }, + { + "epoch": 4.812, + "grad_norm": 59.322444915771484, + "learning_rate": 3.806e-07, + "loss": 0.5423, + "step": 48120 + }, + { + "epoch": 4.813, + "grad_norm": 33.344459533691406, + "learning_rate": 3.786e-07, + "loss": 0.5785, + "step": 48130 + }, + { + "epoch": 4.814, + "grad_norm": 12.657391548156738, + "learning_rate": 3.766e-07, + "loss": 0.2472, + "step": 48140 + }, + { + "epoch": 4.8149999999999995, + "grad_norm": 47.85975646972656, + "learning_rate": 3.7460000000000003e-07, + "loss": 0.7532, + "step": 48150 + }, + { + "epoch": 4.816, + "grad_norm": 24.278907775878906, + "learning_rate": 3.7260000000000004e-07, + "loss": 0.3976, + "step": 48160 + }, + { + "epoch": 4.817, + "grad_norm": 54.600223541259766, + "learning_rate": 3.7060000000000005e-07, + "loss": 0.6928, + "step": 48170 + }, + { + "epoch": 4.818, + "grad_norm": 8.999923706054688, + "learning_rate": 3.686e-07, + "loss": 0.3901, + "step": 48180 + }, + { + "epoch": 4.819, + "grad_norm": 37.98063278198242, + "learning_rate": 3.666e-07, + "loss": 0.7222, + "step": 48190 + }, + { + "epoch": 4.82, + "grad_norm": 1.4624940156936646, + "learning_rate": 3.646e-07, + "loss": 0.3561, + "step": 48200 + }, + { + "epoch": 4.821, + "grad_norm": 60.897274017333984, + "learning_rate": 3.6260000000000003e-07, + "loss": 0.6086, + "step": 48210 + }, + { + "epoch": 4.822, + "grad_norm": 7.063051223754883, + "learning_rate": 3.6060000000000004e-07, + "loss": 0.7798, + "step": 48220 + }, + { + "epoch": 4.823, + "grad_norm": 9.376193046569824, + "learning_rate": 3.5860000000000005e-07, + "loss": 0.4789, + "step": 48230 + }, + { + "epoch": 4.824, + "grad_norm": 19.10816192626953, + "learning_rate": 3.566e-07, + "loss": 0.5622, + "step": 48240 + }, + { + "epoch": 4.825, + "grad_norm": 11.21962833404541, + "learning_rate": 3.546e-07, + "loss": 0.635, + "step": 48250 + }, + { + "epoch": 4.826, + "grad_norm": 37.95383071899414, + "learning_rate": 3.526e-07, + "loss": 0.4713, + "step": 48260 + }, + { + "epoch": 4.827, + "grad_norm": 22.839855194091797, + "learning_rate": 3.5060000000000003e-07, + "loss": 0.4713, + "step": 48270 + }, + { + "epoch": 4.828, + "grad_norm": 11.70213794708252, + "learning_rate": 3.4860000000000004e-07, + "loss": 0.5201, + "step": 48280 + }, + { + "epoch": 4.829, + "grad_norm": 45.10593795776367, + "learning_rate": 3.4660000000000005e-07, + "loss": 0.3367, + "step": 48290 + }, + { + "epoch": 4.83, + "grad_norm": 36.21401596069336, + "learning_rate": 3.446e-07, + "loss": 0.8444, + "step": 48300 + }, + { + "epoch": 4.8309999999999995, + "grad_norm": 75.2431640625, + "learning_rate": 3.426e-07, + "loss": 0.5507, + "step": 48310 + }, + { + "epoch": 4.832, + "grad_norm": 29.88875961303711, + "learning_rate": 3.406e-07, + "loss": 0.5728, + "step": 48320 + }, + { + "epoch": 4.833, + "grad_norm": 11.833975791931152, + "learning_rate": 3.3860000000000003e-07, + "loss": 0.6646, + "step": 48330 + }, + { + "epoch": 4.834, + "grad_norm": 4.960333347320557, + "learning_rate": 3.3660000000000004e-07, + "loss": 0.427, + "step": 48340 + }, + { + "epoch": 4.835, + "grad_norm": 49.04979705810547, + "learning_rate": 3.346e-07, + "loss": 0.4861, + "step": 48350 + }, + { + "epoch": 4.836, + "grad_norm": 56.9226188659668, + "learning_rate": 3.326e-07, + "loss": 0.4323, + "step": 48360 + }, + { + "epoch": 4.837, + "grad_norm": 21.75094223022461, + "learning_rate": 3.306e-07, + "loss": 0.333, + "step": 48370 + }, + { + "epoch": 4.838, + "grad_norm": 10.19582748413086, + "learning_rate": 3.286e-07, + "loss": 0.4192, + "step": 48380 + }, + { + "epoch": 4.839, + "grad_norm": 32.832767486572266, + "learning_rate": 3.266e-07, + "loss": 0.3247, + "step": 48390 + }, + { + "epoch": 4.84, + "grad_norm": 47.89490509033203, + "learning_rate": 3.2460000000000003e-07, + "loss": 0.6204, + "step": 48400 + }, + { + "epoch": 4.841, + "grad_norm": 64.20399475097656, + "learning_rate": 3.226e-07, + "loss": 0.4072, + "step": 48410 + }, + { + "epoch": 4.842, + "grad_norm": 42.400634765625, + "learning_rate": 3.206e-07, + "loss": 0.5873, + "step": 48420 + }, + { + "epoch": 4.843, + "grad_norm": 7.792730331420898, + "learning_rate": 3.186e-07, + "loss": 0.5349, + "step": 48430 + }, + { + "epoch": 4.844, + "grad_norm": 40.12997817993164, + "learning_rate": 3.166e-07, + "loss": 0.7333, + "step": 48440 + }, + { + "epoch": 4.845, + "grad_norm": 14.662808418273926, + "learning_rate": 3.146e-07, + "loss": 0.8452, + "step": 48450 + }, + { + "epoch": 4.846, + "grad_norm": 11.02692985534668, + "learning_rate": 3.1260000000000003e-07, + "loss": 0.6037, + "step": 48460 + }, + { + "epoch": 4.8469999999999995, + "grad_norm": 107.82157897949219, + "learning_rate": 3.1060000000000004e-07, + "loss": 0.8563, + "step": 48470 + }, + { + "epoch": 4.848, + "grad_norm": 6.1017961502075195, + "learning_rate": 3.086e-07, + "loss": 0.4407, + "step": 48480 + }, + { + "epoch": 4.849, + "grad_norm": 32.714542388916016, + "learning_rate": 3.066e-07, + "loss": 0.4017, + "step": 48490 + }, + { + "epoch": 4.85, + "grad_norm": 3.4416143894195557, + "learning_rate": 3.046e-07, + "loss": 0.4439, + "step": 48500 + }, + { + "epoch": 4.851, + "grad_norm": 45.20467758178711, + "learning_rate": 3.026e-07, + "loss": 0.4796, + "step": 48510 + }, + { + "epoch": 4.852, + "grad_norm": 44.18875503540039, + "learning_rate": 3.0060000000000003e-07, + "loss": 0.5077, + "step": 48520 + }, + { + "epoch": 4.853, + "grad_norm": 44.58082962036133, + "learning_rate": 2.9860000000000004e-07, + "loss": 0.5945, + "step": 48530 + }, + { + "epoch": 4.854, + "grad_norm": 18.132604598999023, + "learning_rate": 2.966e-07, + "loss": 0.8124, + "step": 48540 + }, + { + "epoch": 4.855, + "grad_norm": 35.660037994384766, + "learning_rate": 2.946e-07, + "loss": 0.3541, + "step": 48550 + }, + { + "epoch": 4.856, + "grad_norm": 37.676273345947266, + "learning_rate": 2.926e-07, + "loss": 0.4191, + "step": 48560 + }, + { + "epoch": 4.857, + "grad_norm": 18.786176681518555, + "learning_rate": 2.906e-07, + "loss": 0.3393, + "step": 48570 + }, + { + "epoch": 4.858, + "grad_norm": 35.841712951660156, + "learning_rate": 2.8860000000000003e-07, + "loss": 0.7147, + "step": 48580 + }, + { + "epoch": 4.859, + "grad_norm": 4.490417003631592, + "learning_rate": 2.8660000000000004e-07, + "loss": 0.441, + "step": 48590 + }, + { + "epoch": 4.86, + "grad_norm": 27.341760635375977, + "learning_rate": 2.846e-07, + "loss": 0.7622, + "step": 48600 + }, + { + "epoch": 4.861, + "grad_norm": 43.09422302246094, + "learning_rate": 2.826e-07, + "loss": 0.6165, + "step": 48610 + }, + { + "epoch": 4.862, + "grad_norm": 38.81334686279297, + "learning_rate": 2.8060000000000006e-07, + "loss": 0.5359, + "step": 48620 + }, + { + "epoch": 4.8629999999999995, + "grad_norm": 5.982778549194336, + "learning_rate": 2.786e-07, + "loss": 0.5695, + "step": 48630 + }, + { + "epoch": 4.864, + "grad_norm": 33.21123504638672, + "learning_rate": 2.7660000000000003e-07, + "loss": 0.4943, + "step": 48640 + }, + { + "epoch": 4.865, + "grad_norm": 61.49201965332031, + "learning_rate": 2.746e-07, + "loss": 0.6732, + "step": 48650 + }, + { + "epoch": 4.866, + "grad_norm": 34.918212890625, + "learning_rate": 2.726e-07, + "loss": 0.7058, + "step": 48660 + }, + { + "epoch": 4.867, + "grad_norm": 42.28673553466797, + "learning_rate": 2.7060000000000005e-07, + "loss": 0.4972, + "step": 48670 + }, + { + "epoch": 4.868, + "grad_norm": 12.002147674560547, + "learning_rate": 2.686e-07, + "loss": 0.8624, + "step": 48680 + }, + { + "epoch": 4.869, + "grad_norm": 62.372684478759766, + "learning_rate": 2.666e-07, + "loss": 0.6257, + "step": 48690 + }, + { + "epoch": 4.87, + "grad_norm": 39.709686279296875, + "learning_rate": 2.6460000000000003e-07, + "loss": 0.6162, + "step": 48700 + }, + { + "epoch": 4.871, + "grad_norm": 16.147838592529297, + "learning_rate": 2.626e-07, + "loss": 0.8548, + "step": 48710 + }, + { + "epoch": 4.872, + "grad_norm": 4.075834274291992, + "learning_rate": 2.6060000000000004e-07, + "loss": 0.382, + "step": 48720 + }, + { + "epoch": 4.873, + "grad_norm": 19.27889060974121, + "learning_rate": 2.5860000000000005e-07, + "loss": 0.641, + "step": 48730 + }, + { + "epoch": 4.874, + "grad_norm": 48.61284255981445, + "learning_rate": 2.566e-07, + "loss": 0.3655, + "step": 48740 + }, + { + "epoch": 4.875, + "grad_norm": 56.545310974121094, + "learning_rate": 2.546e-07, + "loss": 0.6569, + "step": 48750 + }, + { + "epoch": 4.876, + "grad_norm": 36.132381439208984, + "learning_rate": 2.526e-07, + "loss": 0.4681, + "step": 48760 + }, + { + "epoch": 4.877, + "grad_norm": 51.53099822998047, + "learning_rate": 2.5060000000000003e-07, + "loss": 0.6395, + "step": 48770 + }, + { + "epoch": 4.878, + "grad_norm": 46.316402435302734, + "learning_rate": 2.4860000000000004e-07, + "loss": 0.4726, + "step": 48780 + }, + { + "epoch": 4.879, + "grad_norm": 6.3659210205078125, + "learning_rate": 2.4660000000000005e-07, + "loss": 0.8035, + "step": 48790 + }, + { + "epoch": 4.88, + "grad_norm": 16.26994514465332, + "learning_rate": 2.446e-07, + "loss": 0.4792, + "step": 48800 + }, + { + "epoch": 4.881, + "grad_norm": 46.80910110473633, + "learning_rate": 2.426e-07, + "loss": 0.9064, + "step": 48810 + }, + { + "epoch": 4.882, + "grad_norm": 19.964271545410156, + "learning_rate": 2.406e-07, + "loss": 0.8111, + "step": 48820 + }, + { + "epoch": 4.883, + "grad_norm": 72.45056915283203, + "learning_rate": 2.3860000000000003e-07, + "loss": 0.6046, + "step": 48830 + }, + { + "epoch": 4.884, + "grad_norm": 33.62139892578125, + "learning_rate": 2.3660000000000001e-07, + "loss": 0.2831, + "step": 48840 + }, + { + "epoch": 4.885, + "grad_norm": 6.268035888671875, + "learning_rate": 2.3460000000000002e-07, + "loss": 0.2903, + "step": 48850 + }, + { + "epoch": 4.886, + "grad_norm": 31.210264205932617, + "learning_rate": 2.326e-07, + "loss": 0.5656, + "step": 48860 + }, + { + "epoch": 4.8870000000000005, + "grad_norm": 89.84651947021484, + "learning_rate": 2.3060000000000001e-07, + "loss": 0.6617, + "step": 48870 + }, + { + "epoch": 4.888, + "grad_norm": 78.52317810058594, + "learning_rate": 2.286e-07, + "loss": 0.733, + "step": 48880 + }, + { + "epoch": 4.889, + "grad_norm": 16.838603973388672, + "learning_rate": 2.266e-07, + "loss": 0.5299, + "step": 48890 + }, + { + "epoch": 4.89, + "grad_norm": 0.49068504571914673, + "learning_rate": 2.2460000000000004e-07, + "loss": 0.3942, + "step": 48900 + }, + { + "epoch": 4.891, + "grad_norm": 28.578716278076172, + "learning_rate": 2.226e-07, + "loss": 0.6955, + "step": 48910 + }, + { + "epoch": 4.892, + "grad_norm": 1.8998624086380005, + "learning_rate": 2.206e-07, + "loss": 0.5737, + "step": 48920 + }, + { + "epoch": 4.893, + "grad_norm": 50.37765884399414, + "learning_rate": 2.1860000000000004e-07, + "loss": 0.6957, + "step": 48930 + }, + { + "epoch": 4.894, + "grad_norm": 21.070785522460938, + "learning_rate": 2.166e-07, + "loss": 0.6923, + "step": 48940 + }, + { + "epoch": 4.895, + "grad_norm": 44.579280853271484, + "learning_rate": 2.1460000000000003e-07, + "loss": 0.5985, + "step": 48950 + }, + { + "epoch": 4.896, + "grad_norm": 15.574660301208496, + "learning_rate": 2.1260000000000004e-07, + "loss": 0.6324, + "step": 48960 + }, + { + "epoch": 4.897, + "grad_norm": 54.42012405395508, + "learning_rate": 2.106e-07, + "loss": 0.746, + "step": 48970 + }, + { + "epoch": 4.898, + "grad_norm": 15.344828605651855, + "learning_rate": 2.0860000000000003e-07, + "loss": 0.5326, + "step": 48980 + }, + { + "epoch": 4.899, + "grad_norm": 37.128597259521484, + "learning_rate": 2.0660000000000004e-07, + "loss": 0.5567, + "step": 48990 + }, + { + "epoch": 4.9, + "grad_norm": 22.833782196044922, + "learning_rate": 2.0460000000000002e-07, + "loss": 0.6224, + "step": 49000 + }, + { + "epoch": 4.901, + "grad_norm": 43.60334396362305, + "learning_rate": 2.0260000000000003e-07, + "loss": 0.8438, + "step": 49010 + }, + { + "epoch": 4.902, + "grad_norm": 17.917469024658203, + "learning_rate": 2.0060000000000004e-07, + "loss": 0.2035, + "step": 49020 + }, + { + "epoch": 4.9030000000000005, + "grad_norm": 22.8342227935791, + "learning_rate": 1.9860000000000002e-07, + "loss": 0.5491, + "step": 49030 + }, + { + "epoch": 4.904, + "grad_norm": 19.834346771240234, + "learning_rate": 1.9660000000000003e-07, + "loss": 0.411, + "step": 49040 + }, + { + "epoch": 4.905, + "grad_norm": 23.630340576171875, + "learning_rate": 1.9460000000000004e-07, + "loss": 0.6375, + "step": 49050 + }, + { + "epoch": 4.906, + "grad_norm": 54.59148025512695, + "learning_rate": 1.9260000000000002e-07, + "loss": 0.8771, + "step": 49060 + }, + { + "epoch": 4.907, + "grad_norm": 30.65956687927246, + "learning_rate": 1.9060000000000003e-07, + "loss": 0.6741, + "step": 49070 + }, + { + "epoch": 4.908, + "grad_norm": 61.472137451171875, + "learning_rate": 1.886e-07, + "loss": 0.6089, + "step": 49080 + }, + { + "epoch": 4.909, + "grad_norm": 38.48383331298828, + "learning_rate": 1.8660000000000002e-07, + "loss": 0.3453, + "step": 49090 + }, + { + "epoch": 4.91, + "grad_norm": 25.503416061401367, + "learning_rate": 1.8460000000000003e-07, + "loss": 0.3371, + "step": 49100 + }, + { + "epoch": 4.911, + "grad_norm": 26.52053451538086, + "learning_rate": 1.826e-07, + "loss": 1.0525, + "step": 49110 + }, + { + "epoch": 4.912, + "grad_norm": 53.508087158203125, + "learning_rate": 1.8060000000000002e-07, + "loss": 0.5461, + "step": 49120 + }, + { + "epoch": 4.913, + "grad_norm": 36.59590530395508, + "learning_rate": 1.7860000000000002e-07, + "loss": 0.4117, + "step": 49130 + }, + { + "epoch": 4.914, + "grad_norm": 31.170352935791016, + "learning_rate": 1.766e-07, + "loss": 0.4214, + "step": 49140 + }, + { + "epoch": 4.915, + "grad_norm": 72.6932601928711, + "learning_rate": 1.7460000000000001e-07, + "loss": 0.719, + "step": 49150 + }, + { + "epoch": 4.916, + "grad_norm": 34.64444351196289, + "learning_rate": 1.7260000000000002e-07, + "loss": 0.4933, + "step": 49160 + }, + { + "epoch": 4.917, + "grad_norm": 54.596031188964844, + "learning_rate": 1.706e-07, + "loss": 0.6415, + "step": 49170 + }, + { + "epoch": 4.918, + "grad_norm": 34.222679138183594, + "learning_rate": 1.6860000000000001e-07, + "loss": 0.4704, + "step": 49180 + }, + { + "epoch": 4.9190000000000005, + "grad_norm": 3.0823545455932617, + "learning_rate": 1.6660000000000002e-07, + "loss": 0.6312, + "step": 49190 + }, + { + "epoch": 4.92, + "grad_norm": 61.71739196777344, + "learning_rate": 1.646e-07, + "loss": 0.6899, + "step": 49200 + }, + { + "epoch": 4.921, + "grad_norm": 62.166526794433594, + "learning_rate": 1.6260000000000001e-07, + "loss": 0.5632, + "step": 49210 + }, + { + "epoch": 4.922, + "grad_norm": 75.72866821289062, + "learning_rate": 1.6060000000000002e-07, + "loss": 0.6717, + "step": 49220 + }, + { + "epoch": 4.923, + "grad_norm": 19.29648208618164, + "learning_rate": 1.586e-07, + "loss": 0.412, + "step": 49230 + }, + { + "epoch": 4.924, + "grad_norm": 57.19310760498047, + "learning_rate": 1.566e-07, + "loss": 0.7854, + "step": 49240 + }, + { + "epoch": 4.925, + "grad_norm": 16.37799835205078, + "learning_rate": 1.5460000000000002e-07, + "loss": 0.4818, + "step": 49250 + }, + { + "epoch": 4.926, + "grad_norm": 3.643502712249756, + "learning_rate": 1.526e-07, + "loss": 0.3695, + "step": 49260 + }, + { + "epoch": 4.927, + "grad_norm": 57.638790130615234, + "learning_rate": 1.506e-07, + "loss": 0.5841, + "step": 49270 + }, + { + "epoch": 4.928, + "grad_norm": 38.98382568359375, + "learning_rate": 1.4860000000000002e-07, + "loss": 0.311, + "step": 49280 + }, + { + "epoch": 4.929, + "grad_norm": 31.647018432617188, + "learning_rate": 1.466e-07, + "loss": 0.5787, + "step": 49290 + }, + { + "epoch": 4.93, + "grad_norm": 5.648628234863281, + "learning_rate": 1.446e-07, + "loss": 0.5783, + "step": 49300 + }, + { + "epoch": 4.931, + "grad_norm": 9.384773254394531, + "learning_rate": 1.4260000000000002e-07, + "loss": 0.5762, + "step": 49310 + }, + { + "epoch": 4.932, + "grad_norm": 55.00576400756836, + "learning_rate": 1.406e-07, + "loss": 0.4893, + "step": 49320 + }, + { + "epoch": 4.933, + "grad_norm": 23.570556640625, + "learning_rate": 1.386e-07, + "loss": 0.5582, + "step": 49330 + }, + { + "epoch": 4.934, + "grad_norm": 43.51008224487305, + "learning_rate": 1.3660000000000002e-07, + "loss": 0.5739, + "step": 49340 + }, + { + "epoch": 4.9350000000000005, + "grad_norm": 58.253780364990234, + "learning_rate": 1.346e-07, + "loss": 0.5964, + "step": 49350 + }, + { + "epoch": 4.936, + "grad_norm": 46.08802795410156, + "learning_rate": 1.326e-07, + "loss": 0.6438, + "step": 49360 + }, + { + "epoch": 4.937, + "grad_norm": 8.2721586227417, + "learning_rate": 1.3060000000000002e-07, + "loss": 0.372, + "step": 49370 + }, + { + "epoch": 4.938, + "grad_norm": 24.139907836914062, + "learning_rate": 1.286e-07, + "loss": 0.5713, + "step": 49380 + }, + { + "epoch": 4.939, + "grad_norm": 16.5051212310791, + "learning_rate": 1.266e-07, + "loss": 0.7615, + "step": 49390 + }, + { + "epoch": 4.9399999999999995, + "grad_norm": 3.0267720222473145, + "learning_rate": 1.2460000000000002e-07, + "loss": 0.3022, + "step": 49400 + }, + { + "epoch": 4.941, + "grad_norm": 53.91283416748047, + "learning_rate": 1.226e-07, + "loss": 0.7291, + "step": 49410 + }, + { + "epoch": 4.942, + "grad_norm": 23.457191467285156, + "learning_rate": 1.206e-07, + "loss": 0.5664, + "step": 49420 + }, + { + "epoch": 4.943, + "grad_norm": 14.853793144226074, + "learning_rate": 1.1860000000000002e-07, + "loss": 0.4499, + "step": 49430 + }, + { + "epoch": 4.944, + "grad_norm": 27.185049057006836, + "learning_rate": 1.1660000000000001e-07, + "loss": 0.4925, + "step": 49440 + }, + { + "epoch": 4.945, + "grad_norm": 69.38607788085938, + "learning_rate": 1.1460000000000001e-07, + "loss": 0.393, + "step": 49450 + }, + { + "epoch": 4.946, + "grad_norm": 35.48990249633789, + "learning_rate": 1.126e-07, + "loss": 0.4629, + "step": 49460 + }, + { + "epoch": 4.947, + "grad_norm": 35.915348052978516, + "learning_rate": 1.1060000000000001e-07, + "loss": 0.4844, + "step": 49470 + }, + { + "epoch": 4.948, + "grad_norm": 47.767757415771484, + "learning_rate": 1.086e-07, + "loss": 0.6391, + "step": 49480 + }, + { + "epoch": 4.949, + "grad_norm": 46.730716705322266, + "learning_rate": 1.066e-07, + "loss": 0.5344, + "step": 49490 + }, + { + "epoch": 4.95, + "grad_norm": 9.486944198608398, + "learning_rate": 1.0460000000000001e-07, + "loss": 0.5001, + "step": 49500 + }, + { + "epoch": 4.951, + "grad_norm": 49.90391540527344, + "learning_rate": 1.026e-07, + "loss": 0.4198, + "step": 49510 + }, + { + "epoch": 4.952, + "grad_norm": 31.162935256958008, + "learning_rate": 1.006e-07, + "loss": 0.6015, + "step": 49520 + }, + { + "epoch": 4.953, + "grad_norm": 14.631115913391113, + "learning_rate": 9.860000000000001e-08, + "loss": 0.5281, + "step": 49530 + }, + { + "epoch": 4.954, + "grad_norm": 89.07907104492188, + "learning_rate": 9.66e-08, + "loss": 0.7108, + "step": 49540 + }, + { + "epoch": 4.955, + "grad_norm": 29.647022247314453, + "learning_rate": 9.46e-08, + "loss": 0.4537, + "step": 49550 + }, + { + "epoch": 4.9559999999999995, + "grad_norm": 57.06145095825195, + "learning_rate": 9.26e-08, + "loss": 0.5961, + "step": 49560 + }, + { + "epoch": 4.957, + "grad_norm": 28.40804672241211, + "learning_rate": 9.060000000000002e-08, + "loss": 0.5376, + "step": 49570 + }, + { + "epoch": 4.958, + "grad_norm": 69.8585205078125, + "learning_rate": 8.86e-08, + "loss": 0.5019, + "step": 49580 + }, + { + "epoch": 4.959, + "grad_norm": 42.624027252197266, + "learning_rate": 8.66e-08, + "loss": 0.3875, + "step": 49590 + }, + { + "epoch": 4.96, + "grad_norm": 2.737194538116455, + "learning_rate": 8.460000000000002e-08, + "loss": 0.563, + "step": 49600 + }, + { + "epoch": 4.961, + "grad_norm": 48.68153381347656, + "learning_rate": 8.260000000000001e-08, + "loss": 0.6256, + "step": 49610 + }, + { + "epoch": 4.962, + "grad_norm": 10.589071273803711, + "learning_rate": 8.060000000000001e-08, + "loss": 0.4583, + "step": 49620 + }, + { + "epoch": 4.963, + "grad_norm": 49.633148193359375, + "learning_rate": 7.860000000000002e-08, + "loss": 0.8238, + "step": 49630 + }, + { + "epoch": 4.964, + "grad_norm": 47.60674285888672, + "learning_rate": 7.660000000000001e-08, + "loss": 0.4204, + "step": 49640 + }, + { + "epoch": 4.965, + "grad_norm": 68.43152618408203, + "learning_rate": 7.46e-08, + "loss": 0.5481, + "step": 49650 + }, + { + "epoch": 4.966, + "grad_norm": 62.700435638427734, + "learning_rate": 7.26e-08, + "loss": 0.5413, + "step": 49660 + }, + { + "epoch": 4.967, + "grad_norm": 43.648651123046875, + "learning_rate": 7.060000000000001e-08, + "loss": 0.4658, + "step": 49670 + }, + { + "epoch": 4.968, + "grad_norm": 53.35328674316406, + "learning_rate": 6.86e-08, + "loss": 0.4889, + "step": 49680 + }, + { + "epoch": 4.969, + "grad_norm": 69.61092376708984, + "learning_rate": 6.66e-08, + "loss": 0.5914, + "step": 49690 + }, + { + "epoch": 4.97, + "grad_norm": 34.3068962097168, + "learning_rate": 6.46e-08, + "loss": 0.4936, + "step": 49700 + }, + { + "epoch": 4.971, + "grad_norm": 70.36450958251953, + "learning_rate": 6.26e-08, + "loss": 0.4934, + "step": 49710 + }, + { + "epoch": 4.9719999999999995, + "grad_norm": 29.908767700195312, + "learning_rate": 6.060000000000001e-08, + "loss": 0.6727, + "step": 49720 + }, + { + "epoch": 4.973, + "grad_norm": 74.44947814941406, + "learning_rate": 5.86e-08, + "loss": 0.5676, + "step": 49730 + }, + { + "epoch": 4.974, + "grad_norm": 36.034976959228516, + "learning_rate": 5.6600000000000004e-08, + "loss": 0.63, + "step": 49740 + }, + { + "epoch": 4.975, + "grad_norm": 2.7080929279327393, + "learning_rate": 5.46e-08, + "loss": 0.6098, + "step": 49750 + }, + { + "epoch": 4.976, + "grad_norm": 56.25693893432617, + "learning_rate": 5.26e-08, + "loss": 0.3895, + "step": 49760 + }, + { + "epoch": 4.977, + "grad_norm": 18.81259536743164, + "learning_rate": 5.060000000000001e-08, + "loss": 0.5584, + "step": 49770 + }, + { + "epoch": 4.978, + "grad_norm": 4.564599990844727, + "learning_rate": 4.86e-08, + "loss": 0.583, + "step": 49780 + }, + { + "epoch": 4.979, + "grad_norm": 35.461177825927734, + "learning_rate": 4.660000000000001e-08, + "loss": 0.7756, + "step": 49790 + }, + { + "epoch": 4.98, + "grad_norm": 43.247154235839844, + "learning_rate": 4.460000000000001e-08, + "loss": 0.5009, + "step": 49800 + }, + { + "epoch": 4.981, + "grad_norm": 61.074039459228516, + "learning_rate": 4.2600000000000004e-08, + "loss": 0.6001, + "step": 49810 + }, + { + "epoch": 4.982, + "grad_norm": 78.60579681396484, + "learning_rate": 4.0600000000000006e-08, + "loss": 0.6014, + "step": 49820 + }, + { + "epoch": 4.983, + "grad_norm": 32.76850891113281, + "learning_rate": 3.86e-08, + "loss": 0.4978, + "step": 49830 + }, + { + "epoch": 4.984, + "grad_norm": 1.1829404830932617, + "learning_rate": 3.6600000000000003e-08, + "loss": 0.2477, + "step": 49840 + }, + { + "epoch": 4.985, + "grad_norm": 36.923988342285156, + "learning_rate": 3.4600000000000005e-08, + "loss": 0.5418, + "step": 49850 + }, + { + "epoch": 4.986, + "grad_norm": 27.69318199157715, + "learning_rate": 3.26e-08, + "loss": 0.5853, + "step": 49860 + }, + { + "epoch": 4.987, + "grad_norm": 42.778106689453125, + "learning_rate": 3.06e-08, + "loss": 0.6131, + "step": 49870 + }, + { + "epoch": 4.9879999999999995, + "grad_norm": 11.142000198364258, + "learning_rate": 2.8600000000000005e-08, + "loss": 0.3882, + "step": 49880 + }, + { + "epoch": 4.989, + "grad_norm": 37.72666931152344, + "learning_rate": 2.6600000000000003e-08, + "loss": 0.6759, + "step": 49890 + }, + { + "epoch": 4.99, + "grad_norm": 36.423744201660156, + "learning_rate": 2.4600000000000002e-08, + "loss": 0.4503, + "step": 49900 + }, + { + "epoch": 4.991, + "grad_norm": 37.07157897949219, + "learning_rate": 2.26e-08, + "loss": 0.7043, + "step": 49910 + }, + { + "epoch": 4.992, + "grad_norm": 8.626225471496582, + "learning_rate": 2.0600000000000002e-08, + "loss": 0.4042, + "step": 49920 + }, + { + "epoch": 4.993, + "grad_norm": 66.21869659423828, + "learning_rate": 1.86e-08, + "loss": 0.9308, + "step": 49930 + }, + { + "epoch": 4.994, + "grad_norm": 42.68752670288086, + "learning_rate": 1.6600000000000003e-08, + "loss": 0.2277, + "step": 49940 + }, + { + "epoch": 4.995, + "grad_norm": 23.218891143798828, + "learning_rate": 1.46e-08, + "loss": 0.6241, + "step": 49950 + }, + { + "epoch": 4.996, + "grad_norm": 44.54765319824219, + "learning_rate": 1.2600000000000002e-08, + "loss": 0.6953, + "step": 49960 + }, + { + "epoch": 4.997, + "grad_norm": 19.68933868408203, + "learning_rate": 1.06e-08, + "loss": 0.4644, + "step": 49970 + }, + { + "epoch": 4.998, + "grad_norm": 6.50738525390625, + "learning_rate": 8.600000000000001e-09, + "loss": 0.8382, + "step": 49980 + }, + { + "epoch": 4.999, + "grad_norm": 44.634403228759766, + "learning_rate": 6.6e-09, + "loss": 0.561, + "step": 49990 + }, + { + "epoch": 5.0, + "grad_norm": 44.17949676513672, + "learning_rate": 4.600000000000001e-09, + "loss": 0.4599, + "step": 50000 + } + ], + "logging_steps": 10, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}