diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5330 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999172801720573, + "eval_steps": 2300, + "global_step": 7555, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 9.696115493774414, + "learning_rate": 4.347826086956522e-08, + "loss": 1.6976, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 9.331488609313965, + "learning_rate": 8.695652173913044e-08, + "loss": 1.711, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 9.760047912597656, + "learning_rate": 1.3043478260869566e-07, + "loss": 1.7084, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 8.302080154418945, + "learning_rate": 1.7391304347826088e-07, + "loss": 1.6865, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 7.753058433532715, + "learning_rate": 2.173913043478261e-07, + "loss": 1.6633, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 7.2695817947387695, + "learning_rate": 2.608695652173913e-07, + "loss": 1.654, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 6.353436470031738, + "learning_rate": 3.0434782608695656e-07, + "loss": 1.6029, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 5.489682674407959, + "learning_rate": 3.4782608695652175e-07, + "loss": 1.5619, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 5.3502631187438965, + "learning_rate": 3.91304347826087e-07, + "loss": 1.522, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 5.116358280181885, + "learning_rate": 4.347826086956522e-07, + "loss": 1.4462, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 4.4354634284973145, + "learning_rate": 4.782608695652174e-07, + "loss": 1.4002, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 5.6585693359375, + "learning_rate": 5.217391304347826e-07, + "loss": 1.3227, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 4.757506847381592, + "learning_rate": 5.652173913043478e-07, + "loss": 1.2458, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 4.914801120758057, + "learning_rate": 6.086956521739131e-07, + "loss": 1.2131, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 9.453673362731934, + "learning_rate": 6.521739130434783e-07, + "loss": 1.15, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 6.355442047119141, + "learning_rate": 6.956521739130435e-07, + "loss": 1.0977, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 6.3270087242126465, + "learning_rate": 7.391304347826088e-07, + "loss": 1.0666, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 6.119747638702393, + "learning_rate": 7.82608695652174e-07, + "loss": 1.044, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 8.558585166931152, + "learning_rate": 8.260869565217392e-07, + "loss": 1.0215, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 5.8033766746521, + "learning_rate": 8.695652173913044e-07, + "loss": 0.996, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 6.511552810668945, + "learning_rate": 9.130434782608697e-07, + "loss": 0.9866, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 7.597463607788086, + "learning_rate": 9.565217391304349e-07, + "loss": 0.9724, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 6.662442207336426, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9711, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 5.849515914916992, + "learning_rate": 1.0434782608695653e-06, + "loss": 0.9607, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 6.360912322998047, + "learning_rate": 1.0869565217391306e-06, + "loss": 0.946, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 4.921197891235352, + "learning_rate": 1.1304347826086956e-06, + "loss": 0.9355, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 4.507955074310303, + "learning_rate": 1.173913043478261e-06, + "loss": 0.9248, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 4.760683059692383, + "learning_rate": 1.2173913043478262e-06, + "loss": 0.9315, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 5.035203456878662, + "learning_rate": 1.2608695652173913e-06, + "loss": 0.9209, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 4.834679126739502, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.9104, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 3.9745352268218994, + "learning_rate": 1.347826086956522e-06, + "loss": 0.9025, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 3.602553606033325, + "learning_rate": 1.391304347826087e-06, + "loss": 0.8954, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 3.429502487182617, + "learning_rate": 1.4347826086956523e-06, + "loss": 0.889, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 4.117110252380371, + "learning_rate": 1.4782608695652176e-06, + "loss": 0.8732, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 3.424821615219116, + "learning_rate": 1.521739130434783e-06, + "loss": 0.8819, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 4.870906829833984, + "learning_rate": 1.565217391304348e-06, + "loss": 0.8829, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 6.471639156341553, + "learning_rate": 1.608695652173913e-06, + "loss": 0.8632, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 4.399796962738037, + "learning_rate": 1.6521739130434784e-06, + "loss": 0.8543, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 3.4351227283477783, + "learning_rate": 1.6956521739130435e-06, + "loss": 0.847, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 3.1236326694488525, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.8445, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 3.5134594440460205, + "learning_rate": 1.782608695652174e-06, + "loss": 0.8377, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 3.1803650856018066, + "learning_rate": 1.8260869565217394e-06, + "loss": 0.835, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 5.259217739105225, + "learning_rate": 1.8695652173913044e-06, + "loss": 0.8368, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 4.378271579742432, + "learning_rate": 1.9130434782608697e-06, + "loss": 0.8377, + "step": 440 + }, + { + "epoch": 0.06, + "grad_norm": 3.6259055137634277, + "learning_rate": 1.956521739130435e-06, + "loss": 0.8383, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 4.089799404144287, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8309, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 3.041405200958252, + "learning_rate": 2.0434782608695656e-06, + "loss": 0.8178, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 3.9156289100646973, + "learning_rate": 2.0869565217391305e-06, + "loss": 0.8091, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 3.1488077640533447, + "learning_rate": 2.130434782608696e-06, + "loss": 0.8287, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 3.1481711864471436, + "learning_rate": 2.173913043478261e-06, + "loss": 0.8109, + "step": 500 + }, + { + "epoch": 0.07, + "grad_norm": 3.6406595706939697, + "learning_rate": 2.2173913043478264e-06, + "loss": 0.8071, + "step": 510 + }, + { + "epoch": 0.07, + "grad_norm": 3.674004554748535, + "learning_rate": 2.2608695652173913e-06, + "loss": 0.8033, + "step": 520 + }, + { + "epoch": 0.07, + "grad_norm": 2.566720485687256, + "learning_rate": 2.3043478260869566e-06, + "loss": 0.8138, + "step": 530 + }, + { + "epoch": 0.07, + "grad_norm": 3.0154435634613037, + "learning_rate": 2.347826086956522e-06, + "loss": 0.8147, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 3.2751858234405518, + "learning_rate": 2.391304347826087e-06, + "loss": 0.7845, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 5.122045993804932, + "learning_rate": 2.4347826086956525e-06, + "loss": 0.7984, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 4.417994499206543, + "learning_rate": 2.4782608695652178e-06, + "loss": 0.7981, + "step": 570 + }, + { + "epoch": 0.08, + "grad_norm": 2.893526315689087, + "learning_rate": 2.5217391304347826e-06, + "loss": 0.799, + "step": 580 + }, + { + "epoch": 0.08, + "grad_norm": 3.6591286659240723, + "learning_rate": 2.5652173913043484e-06, + "loss": 0.7965, + "step": 590 + }, + { + "epoch": 0.08, + "grad_norm": 2.970930337905884, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.8057, + "step": 600 + }, + { + "epoch": 0.08, + "grad_norm": 2.9321494102478027, + "learning_rate": 2.6521739130434785e-06, + "loss": 0.7862, + "step": 610 + }, + { + "epoch": 0.08, + "grad_norm": 3.5950493812561035, + "learning_rate": 2.695652173913044e-06, + "loss": 0.7844, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 2.530301332473755, + "learning_rate": 2.7391304347826087e-06, + "loss": 0.7863, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 2.930530548095703, + "learning_rate": 2.782608695652174e-06, + "loss": 0.7914, + "step": 640 + }, + { + "epoch": 0.09, + "grad_norm": 2.9592342376708984, + "learning_rate": 2.8260869565217393e-06, + "loss": 0.7738, + "step": 650 + }, + { + "epoch": 0.09, + "grad_norm": 2.425995349884033, + "learning_rate": 2.8695652173913046e-06, + "loss": 0.7941, + "step": 660 + }, + { + "epoch": 0.09, + "grad_norm": 2.796645402908325, + "learning_rate": 2.9130434782608695e-06, + "loss": 0.7969, + "step": 670 + }, + { + "epoch": 0.09, + "grad_norm": 2.917015314102173, + "learning_rate": 2.956521739130435e-06, + "loss": 0.7803, + "step": 680 + }, + { + "epoch": 0.09, + "grad_norm": 2.644934892654419, + "learning_rate": 3e-06, + "loss": 0.7925, + "step": 690 + }, + { + "epoch": 0.09, + "grad_norm": 3.2409515380859375, + "learning_rate": 3.043478260869566e-06, + "loss": 0.7863, + "step": 700 + }, + { + "epoch": 0.09, + "grad_norm": 2.5315630435943604, + "learning_rate": 3.0869565217391307e-06, + "loss": 0.7798, + "step": 710 + }, + { + "epoch": 0.1, + "grad_norm": 2.755002498626709, + "learning_rate": 3.130434782608696e-06, + "loss": 0.7736, + "step": 720 + }, + { + "epoch": 0.1, + "grad_norm": 2.5441417694091797, + "learning_rate": 3.1739130434782613e-06, + "loss": 0.7653, + "step": 730 + }, + { + "epoch": 0.1, + "grad_norm": 2.50203537940979, + "learning_rate": 3.217391304347826e-06, + "loss": 0.7751, + "step": 740 + }, + { + "epoch": 0.1, + "grad_norm": 2.8558547496795654, + "learning_rate": 3.2608695652173914e-06, + "loss": 0.7832, + "step": 750 + }, + { + "epoch": 0.1, + "grad_norm": 2.7601280212402344, + "learning_rate": 3.3043478260869567e-06, + "loss": 0.7865, + "step": 760 + }, + { + "epoch": 0.1, + "grad_norm": 4.165918827056885, + "learning_rate": 3.347826086956522e-06, + "loss": 0.7804, + "step": 770 + }, + { + "epoch": 0.1, + "grad_norm": 3.20556378364563, + "learning_rate": 3.391304347826087e-06, + "loss": 0.7787, + "step": 780 + }, + { + "epoch": 0.1, + "grad_norm": 2.259490489959717, + "learning_rate": 3.4347826086956526e-06, + "loss": 0.7776, + "step": 790 + }, + { + "epoch": 0.11, + "grad_norm": 2.482300281524658, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.7909, + "step": 800 + }, + { + "epoch": 0.11, + "grad_norm": 2.6870365142822266, + "learning_rate": 3.5217391304347832e-06, + "loss": 0.7747, + "step": 810 + }, + { + "epoch": 0.11, + "grad_norm": 2.509525775909424, + "learning_rate": 3.565217391304348e-06, + "loss": 0.7675, + "step": 820 + }, + { + "epoch": 0.11, + "grad_norm": 2.8357813358306885, + "learning_rate": 3.6086956521739134e-06, + "loss": 0.7525, + "step": 830 + }, + { + "epoch": 0.11, + "grad_norm": 3.0093586444854736, + "learning_rate": 3.6521739130434787e-06, + "loss": 0.7871, + "step": 840 + }, + { + "epoch": 0.11, + "grad_norm": 2.5198683738708496, + "learning_rate": 3.6956521739130436e-06, + "loss": 0.7602, + "step": 850 + }, + { + "epoch": 0.11, + "grad_norm": 2.4067280292510986, + "learning_rate": 3.739130434782609e-06, + "loss": 0.7709, + "step": 860 + }, + { + "epoch": 0.12, + "grad_norm": 2.968722343444824, + "learning_rate": 3.782608695652174e-06, + "loss": 0.7638, + "step": 870 + }, + { + "epoch": 0.12, + "grad_norm": 2.5267333984375, + "learning_rate": 3.8260869565217395e-06, + "loss": 0.7618, + "step": 880 + }, + { + "epoch": 0.12, + "grad_norm": 2.519435405731201, + "learning_rate": 3.869565217391304e-06, + "loss": 0.7862, + "step": 890 + }, + { + "epoch": 0.12, + "grad_norm": 2.373142957687378, + "learning_rate": 3.91304347826087e-06, + "loss": 0.7785, + "step": 900 + }, + { + "epoch": 0.12, + "grad_norm": 2.939995288848877, + "learning_rate": 3.956521739130435e-06, + "loss": 0.7752, + "step": 910 + }, + { + "epoch": 0.12, + "grad_norm": 2.4909372329711914, + "learning_rate": 4.000000000000001e-06, + "loss": 0.779, + "step": 920 + }, + { + "epoch": 0.12, + "grad_norm": 2.6996891498565674, + "learning_rate": 4.0434782608695655e-06, + "loss": 0.7735, + "step": 930 + }, + { + "epoch": 0.12, + "grad_norm": 2.628506660461426, + "learning_rate": 4.086956521739131e-06, + "loss": 0.7551, + "step": 940 + }, + { + "epoch": 0.13, + "grad_norm": 2.350477457046509, + "learning_rate": 4.130434782608696e-06, + "loss": 0.7685, + "step": 950 + }, + { + "epoch": 0.13, + "grad_norm": 2.2088937759399414, + "learning_rate": 4.173913043478261e-06, + "loss": 0.7714, + "step": 960 + }, + { + "epoch": 0.13, + "grad_norm": 3.055957555770874, + "learning_rate": 4.217391304347827e-06, + "loss": 0.7633, + "step": 970 + }, + { + "epoch": 0.13, + "grad_norm": 2.987377882003784, + "learning_rate": 4.260869565217392e-06, + "loss": 0.7639, + "step": 980 + }, + { + "epoch": 0.13, + "grad_norm": 2.8698835372924805, + "learning_rate": 4.304347826086957e-06, + "loss": 0.7748, + "step": 990 + }, + { + "epoch": 0.13, + "grad_norm": 2.728653907775879, + "learning_rate": 4.347826086956522e-06, + "loss": 0.7531, + "step": 1000 + }, + { + "epoch": 0.13, + "grad_norm": 3.0617196559906006, + "learning_rate": 4.391304347826087e-06, + "loss": 0.7637, + "step": 1010 + }, + { + "epoch": 0.13, + "grad_norm": 9.645702362060547, + "learning_rate": 4.434782608695653e-06, + "loss": 0.7734, + "step": 1020 + }, + { + "epoch": 0.14, + "grad_norm": 3.174217462539673, + "learning_rate": 4.478260869565218e-06, + "loss": 0.7637, + "step": 1030 + }, + { + "epoch": 0.14, + "grad_norm": 2.565565586090088, + "learning_rate": 4.5217391304347826e-06, + "loss": 0.7616, + "step": 1040 + }, + { + "epoch": 0.14, + "grad_norm": 2.3000173568725586, + "learning_rate": 4.565217391304348e-06, + "loss": 0.7605, + "step": 1050 + }, + { + "epoch": 0.14, + "grad_norm": 2.2204582691192627, + "learning_rate": 4.608695652173913e-06, + "loss": 0.7613, + "step": 1060 + }, + { + "epoch": 0.14, + "grad_norm": 2.5566813945770264, + "learning_rate": 4.652173913043478e-06, + "loss": 0.745, + "step": 1070 + }, + { + "epoch": 0.14, + "grad_norm": 2.7924296855926514, + "learning_rate": 4.695652173913044e-06, + "loss": 0.7643, + "step": 1080 + }, + { + "epoch": 0.14, + "grad_norm": 2.78627610206604, + "learning_rate": 4.739130434782609e-06, + "loss": 0.765, + "step": 1090 + }, + { + "epoch": 0.15, + "grad_norm": 2.9410696029663086, + "learning_rate": 4.782608695652174e-06, + "loss": 0.7534, + "step": 1100 + }, + { + "epoch": 0.15, + "grad_norm": 2.6935067176818848, + "learning_rate": 4.826086956521739e-06, + "loss": 0.7501, + "step": 1110 + }, + { + "epoch": 0.15, + "grad_norm": 2.8043696880340576, + "learning_rate": 4.869565217391305e-06, + "loss": 0.7576, + "step": 1120 + }, + { + "epoch": 0.15, + "grad_norm": 2.8394672870635986, + "learning_rate": 4.91304347826087e-06, + "loss": 0.7536, + "step": 1130 + }, + { + "epoch": 0.15, + "grad_norm": 2.5857579708099365, + "learning_rate": 4.9565217391304355e-06, + "loss": 0.7545, + "step": 1140 + }, + { + "epoch": 0.15, + "grad_norm": 2.707064151763916, + "learning_rate": 5e-06, + "loss": 0.7686, + "step": 1150 + }, + { + "epoch": 0.15, + "grad_norm": 2.7348179817199707, + "learning_rate": 5.043478260869565e-06, + "loss": 0.7594, + "step": 1160 + }, + { + "epoch": 0.15, + "grad_norm": 2.8637123107910156, + "learning_rate": 5.08695652173913e-06, + "loss": 0.7719, + "step": 1170 + }, + { + "epoch": 0.16, + "grad_norm": 2.605658769607544, + "learning_rate": 5.130434782608697e-06, + "loss": 0.7609, + "step": 1180 + }, + { + "epoch": 0.16, + "grad_norm": 2.538159132003784, + "learning_rate": 5.173913043478262e-06, + "loss": 0.7447, + "step": 1190 + }, + { + "epoch": 0.16, + "grad_norm": 2.6438486576080322, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.7474, + "step": 1200 + }, + { + "epoch": 0.16, + "grad_norm": 2.5470008850097656, + "learning_rate": 5.260869565217391e-06, + "loss": 0.755, + "step": 1210 + }, + { + "epoch": 0.16, + "grad_norm": 2.9301180839538574, + "learning_rate": 5.304347826086957e-06, + "loss": 0.7507, + "step": 1220 + }, + { + "epoch": 0.16, + "grad_norm": 2.509558916091919, + "learning_rate": 5.347826086956523e-06, + "loss": 0.7568, + "step": 1230 + }, + { + "epoch": 0.16, + "grad_norm": 2.386697292327881, + "learning_rate": 5.391304347826088e-06, + "loss": 0.7615, + "step": 1240 + }, + { + "epoch": 0.17, + "grad_norm": 2.8356692790985107, + "learning_rate": 5.4347826086956525e-06, + "loss": 0.7498, + "step": 1250 + }, + { + "epoch": 0.17, + "grad_norm": 2.812669038772583, + "learning_rate": 5.478260869565217e-06, + "loss": 0.7467, + "step": 1260 + }, + { + "epoch": 0.17, + "grad_norm": 2.591529369354248, + "learning_rate": 5.521739130434783e-06, + "loss": 0.7537, + "step": 1270 + }, + { + "epoch": 0.17, + "grad_norm": 2.7662971019744873, + "learning_rate": 5.565217391304348e-06, + "loss": 0.7508, + "step": 1280 + }, + { + "epoch": 0.17, + "grad_norm": 2.4197189807891846, + "learning_rate": 5.608695652173914e-06, + "loss": 0.7415, + "step": 1290 + }, + { + "epoch": 0.17, + "grad_norm": 2.972205877304077, + "learning_rate": 5.652173913043479e-06, + "loss": 0.7587, + "step": 1300 + }, + { + "epoch": 0.17, + "grad_norm": 3.1979939937591553, + "learning_rate": 5.695652173913044e-06, + "loss": 0.7369, + "step": 1310 + }, + { + "epoch": 0.17, + "grad_norm": 2.4516711235046387, + "learning_rate": 5.739130434782609e-06, + "loss": 0.7438, + "step": 1320 + }, + { + "epoch": 0.18, + "grad_norm": 2.620466470718384, + "learning_rate": 5.782608695652174e-06, + "loss": 0.7465, + "step": 1330 + }, + { + "epoch": 0.18, + "grad_norm": 2.7041525840759277, + "learning_rate": 5.826086956521739e-06, + "loss": 0.7522, + "step": 1340 + }, + { + "epoch": 0.18, + "grad_norm": 2.6662778854370117, + "learning_rate": 5.8695652173913055e-06, + "loss": 0.7567, + "step": 1350 + }, + { + "epoch": 0.18, + "grad_norm": 5.283855438232422, + "learning_rate": 5.91304347826087e-06, + "loss": 0.7307, + "step": 1360 + }, + { + "epoch": 0.18, + "grad_norm": 2.551743745803833, + "learning_rate": 5.956521739130435e-06, + "loss": 0.7466, + "step": 1370 + }, + { + "epoch": 0.18, + "grad_norm": 3.212951898574829, + "learning_rate": 6e-06, + "loss": 0.752, + "step": 1380 + }, + { + "epoch": 0.18, + "grad_norm": 2.417921304702759, + "learning_rate": 6.043478260869565e-06, + "loss": 0.7531, + "step": 1390 + }, + { + "epoch": 0.19, + "grad_norm": 2.751988649368286, + "learning_rate": 6.086956521739132e-06, + "loss": 0.7393, + "step": 1400 + }, + { + "epoch": 0.19, + "grad_norm": 2.8188741207122803, + "learning_rate": 6.1304347826086965e-06, + "loss": 0.748, + "step": 1410 + }, + { + "epoch": 0.19, + "grad_norm": 2.727118730545044, + "learning_rate": 6.173913043478261e-06, + "loss": 0.7404, + "step": 1420 + }, + { + "epoch": 0.19, + "grad_norm": 2.9528021812438965, + "learning_rate": 6.217391304347826e-06, + "loss": 0.7476, + "step": 1430 + }, + { + "epoch": 0.19, + "grad_norm": 3.184258460998535, + "learning_rate": 6.260869565217392e-06, + "loss": 0.7439, + "step": 1440 + }, + { + "epoch": 0.19, + "grad_norm": 3.2081708908081055, + "learning_rate": 6.304347826086958e-06, + "loss": 0.7328, + "step": 1450 + }, + { + "epoch": 0.19, + "grad_norm": 2.437472343444824, + "learning_rate": 6.3478260869565225e-06, + "loss": 0.7447, + "step": 1460 + }, + { + "epoch": 0.19, + "grad_norm": 2.4201667308807373, + "learning_rate": 6.391304347826087e-06, + "loss": 0.7333, + "step": 1470 + }, + { + "epoch": 0.2, + "grad_norm": 3.096134901046753, + "learning_rate": 6.434782608695652e-06, + "loss": 0.7592, + "step": 1480 + }, + { + "epoch": 0.2, + "grad_norm": 2.744535446166992, + "learning_rate": 6.478260869565218e-06, + "loss": 0.7469, + "step": 1490 + }, + { + "epoch": 0.2, + "grad_norm": 2.768773317337036, + "learning_rate": 6.521739130434783e-06, + "loss": 0.7434, + "step": 1500 + }, + { + "epoch": 0.2, + "grad_norm": 3.7912373542785645, + "learning_rate": 6.565217391304349e-06, + "loss": 0.7597, + "step": 1510 + }, + { + "epoch": 0.2, + "grad_norm": 3.1697614192962646, + "learning_rate": 6.6086956521739135e-06, + "loss": 0.7484, + "step": 1520 + }, + { + "epoch": 0.2, + "grad_norm": 3.172487735748291, + "learning_rate": 6.652173913043479e-06, + "loss": 0.7327, + "step": 1530 + }, + { + "epoch": 0.2, + "grad_norm": 2.5283539295196533, + "learning_rate": 6.695652173913044e-06, + "loss": 0.743, + "step": 1540 + }, + { + "epoch": 0.21, + "grad_norm": 3.1751644611358643, + "learning_rate": 6.739130434782609e-06, + "loss": 0.723, + "step": 1550 + }, + { + "epoch": 0.21, + "grad_norm": 2.524111747741699, + "learning_rate": 6.782608695652174e-06, + "loss": 0.7248, + "step": 1560 + }, + { + "epoch": 0.21, + "grad_norm": 5.5174455642700195, + "learning_rate": 6.8260869565217395e-06, + "loss": 0.7399, + "step": 1570 + }, + { + "epoch": 0.21, + "grad_norm": 2.582502841949463, + "learning_rate": 6.869565217391305e-06, + "loss": 0.7428, + "step": 1580 + }, + { + "epoch": 0.21, + "grad_norm": 2.751222848892212, + "learning_rate": 6.91304347826087e-06, + "loss": 0.7353, + "step": 1590 + }, + { + "epoch": 0.21, + "grad_norm": 2.983644485473633, + "learning_rate": 6.956521739130435e-06, + "loss": 0.753, + "step": 1600 + }, + { + "epoch": 0.21, + "grad_norm": 2.416503667831421, + "learning_rate": 7e-06, + "loss": 0.7323, + "step": 1610 + }, + { + "epoch": 0.21, + "grad_norm": 2.5844953060150146, + "learning_rate": 7.0434782608695665e-06, + "loss": 0.7514, + "step": 1620 + }, + { + "epoch": 0.22, + "grad_norm": 2.449826717376709, + "learning_rate": 7.086956521739131e-06, + "loss": 0.7514, + "step": 1630 + }, + { + "epoch": 0.22, + "grad_norm": 2.574061393737793, + "learning_rate": 7.130434782608696e-06, + "loss": 0.7418, + "step": 1640 + }, + { + "epoch": 0.22, + "grad_norm": 2.707425355911255, + "learning_rate": 7.173913043478261e-06, + "loss": 0.7402, + "step": 1650 + }, + { + "epoch": 0.22, + "grad_norm": 2.7220213413238525, + "learning_rate": 7.217391304347827e-06, + "loss": 0.7501, + "step": 1660 + }, + { + "epoch": 0.22, + "grad_norm": 2.730178117752075, + "learning_rate": 7.2608695652173925e-06, + "loss": 0.7351, + "step": 1670 + }, + { + "epoch": 0.22, + "grad_norm": 2.536191940307617, + "learning_rate": 7.304347826086957e-06, + "loss": 0.7362, + "step": 1680 + }, + { + "epoch": 0.22, + "grad_norm": 2.93157958984375, + "learning_rate": 7.347826086956522e-06, + "loss": 0.7369, + "step": 1690 + }, + { + "epoch": 0.22, + "grad_norm": 2.8212029933929443, + "learning_rate": 7.391304347826087e-06, + "loss": 0.7325, + "step": 1700 + }, + { + "epoch": 0.23, + "grad_norm": 3.0014121532440186, + "learning_rate": 7.434782608695653e-06, + "loss": 0.7383, + "step": 1710 + }, + { + "epoch": 0.23, + "grad_norm": 3.5077619552612305, + "learning_rate": 7.478260869565218e-06, + "loss": 0.7317, + "step": 1720 + }, + { + "epoch": 0.23, + "grad_norm": 2.7584381103515625, + "learning_rate": 7.5217391304347835e-06, + "loss": 0.7214, + "step": 1730 + }, + { + "epoch": 0.23, + "grad_norm": 3.4156510829925537, + "learning_rate": 7.565217391304348e-06, + "loss": 0.7367, + "step": 1740 + }, + { + "epoch": 0.23, + "grad_norm": 3.4717941284179688, + "learning_rate": 7.608695652173914e-06, + "loss": 0.7234, + "step": 1750 + }, + { + "epoch": 0.23, + "grad_norm": 2.6128644943237305, + "learning_rate": 7.652173913043479e-06, + "loss": 0.7486, + "step": 1760 + }, + { + "epoch": 0.23, + "grad_norm": 2.3647897243499756, + "learning_rate": 7.695652173913044e-06, + "loss": 0.7186, + "step": 1770 + }, + { + "epoch": 0.24, + "grad_norm": 2.6185524463653564, + "learning_rate": 7.739130434782609e-06, + "loss": 0.7405, + "step": 1780 + }, + { + "epoch": 0.24, + "grad_norm": 3.2258949279785156, + "learning_rate": 7.782608695652174e-06, + "loss": 0.7399, + "step": 1790 + }, + { + "epoch": 0.24, + "grad_norm": 3.954819679260254, + "learning_rate": 7.82608695652174e-06, + "loss": 0.7277, + "step": 1800 + }, + { + "epoch": 0.24, + "grad_norm": 3.0589473247528076, + "learning_rate": 7.869565217391305e-06, + "loss": 0.7282, + "step": 1810 + }, + { + "epoch": 0.24, + "grad_norm": 2.6480607986450195, + "learning_rate": 7.91304347826087e-06, + "loss": 0.7262, + "step": 1820 + }, + { + "epoch": 0.24, + "grad_norm": 2.735381603240967, + "learning_rate": 7.956521739130435e-06, + "loss": 0.7216, + "step": 1830 + }, + { + "epoch": 0.24, + "grad_norm": 5.60382080078125, + "learning_rate": 8.000000000000001e-06, + "loss": 0.7275, + "step": 1840 + }, + { + "epoch": 0.24, + "grad_norm": 2.710845947265625, + "learning_rate": 8.043478260869566e-06, + "loss": 0.739, + "step": 1850 + }, + { + "epoch": 0.25, + "grad_norm": 2.4441914558410645, + "learning_rate": 8.086956521739131e-06, + "loss": 0.7145, + "step": 1860 + }, + { + "epoch": 0.25, + "grad_norm": 2.7932469844818115, + "learning_rate": 8.130434782608696e-06, + "loss": 0.7226, + "step": 1870 + }, + { + "epoch": 0.25, + "grad_norm": 2.782019853591919, + "learning_rate": 8.173913043478263e-06, + "loss": 0.7139, + "step": 1880 + }, + { + "epoch": 0.25, + "grad_norm": 3.049837350845337, + "learning_rate": 8.217391304347827e-06, + "loss": 0.7393, + "step": 1890 + }, + { + "epoch": 0.25, + "grad_norm": 2.894196033477783, + "learning_rate": 8.260869565217392e-06, + "loss": 0.7163, + "step": 1900 + }, + { + "epoch": 0.25, + "grad_norm": 2.4531071186065674, + "learning_rate": 8.304347826086957e-06, + "loss": 0.7204, + "step": 1910 + }, + { + "epoch": 0.25, + "grad_norm": 3.320891857147217, + "learning_rate": 8.347826086956522e-06, + "loss": 0.7248, + "step": 1920 + }, + { + "epoch": 0.26, + "grad_norm": 3.3719470500946045, + "learning_rate": 8.391304347826089e-06, + "loss": 0.7315, + "step": 1930 + }, + { + "epoch": 0.26, + "grad_norm": 2.7417898178100586, + "learning_rate": 8.434782608695653e-06, + "loss": 0.7256, + "step": 1940 + }, + { + "epoch": 0.26, + "grad_norm": 2.6355440616607666, + "learning_rate": 8.478260869565218e-06, + "loss": 0.7305, + "step": 1950 + }, + { + "epoch": 0.26, + "grad_norm": 2.6812551021575928, + "learning_rate": 8.521739130434783e-06, + "loss": 0.7157, + "step": 1960 + }, + { + "epoch": 0.26, + "grad_norm": 3.1449575424194336, + "learning_rate": 8.56521739130435e-06, + "loss": 0.7186, + "step": 1970 + }, + { + "epoch": 0.26, + "grad_norm": 4.587336540222168, + "learning_rate": 8.608695652173915e-06, + "loss": 0.7241, + "step": 1980 + }, + { + "epoch": 0.26, + "grad_norm": 3.474202871322632, + "learning_rate": 8.65217391304348e-06, + "loss": 0.7318, + "step": 1990 + }, + { + "epoch": 0.26, + "grad_norm": 4.36326789855957, + "learning_rate": 8.695652173913044e-06, + "loss": 0.7336, + "step": 2000 + }, + { + "epoch": 0.27, + "grad_norm": 2.8643243312835693, + "learning_rate": 8.73913043478261e-06, + "loss": 0.7282, + "step": 2010 + }, + { + "epoch": 0.27, + "grad_norm": 2.8812708854675293, + "learning_rate": 8.782608695652174e-06, + "loss": 0.7158, + "step": 2020 + }, + { + "epoch": 0.27, + "grad_norm": 2.9906275272369385, + "learning_rate": 8.82608695652174e-06, + "loss": 0.7163, + "step": 2030 + }, + { + "epoch": 0.27, + "grad_norm": 2.6248161792755127, + "learning_rate": 8.869565217391306e-06, + "loss": 0.736, + "step": 2040 + }, + { + "epoch": 0.27, + "grad_norm": 2.564918041229248, + "learning_rate": 8.91304347826087e-06, + "loss": 0.733, + "step": 2050 + }, + { + "epoch": 0.27, + "grad_norm": 2.4388699531555176, + "learning_rate": 8.956521739130435e-06, + "loss": 0.7192, + "step": 2060 + }, + { + "epoch": 0.27, + "grad_norm": 2.6738839149475098, + "learning_rate": 9e-06, + "loss": 0.7107, + "step": 2070 + }, + { + "epoch": 0.28, + "grad_norm": 4.980138778686523, + "learning_rate": 9.043478260869565e-06, + "loss": 0.716, + "step": 2080 + }, + { + "epoch": 0.28, + "grad_norm": 2.9591591358184814, + "learning_rate": 9.086956521739132e-06, + "loss": 0.7197, + "step": 2090 + }, + { + "epoch": 0.28, + "grad_norm": 2.6318516731262207, + "learning_rate": 9.130434782608697e-06, + "loss": 0.7179, + "step": 2100 + }, + { + "epoch": 0.28, + "grad_norm": 2.6253883838653564, + "learning_rate": 9.173913043478261e-06, + "loss": 0.7184, + "step": 2110 + }, + { + "epoch": 0.28, + "grad_norm": 2.5605149269104004, + "learning_rate": 9.217391304347826e-06, + "loss": 0.7305, + "step": 2120 + }, + { + "epoch": 0.28, + "grad_norm": 4.129536151885986, + "learning_rate": 9.260869565217391e-06, + "loss": 0.7088, + "step": 2130 + }, + { + "epoch": 0.28, + "grad_norm": 2.9836220741271973, + "learning_rate": 9.304347826086956e-06, + "loss": 0.7239, + "step": 2140 + }, + { + "epoch": 0.28, + "grad_norm": 2.778383731842041, + "learning_rate": 9.347826086956523e-06, + "loss": 0.7067, + "step": 2150 + }, + { + "epoch": 0.29, + "grad_norm": 2.8585681915283203, + "learning_rate": 9.391304347826087e-06, + "loss": 0.7085, + "step": 2160 + }, + { + "epoch": 0.29, + "grad_norm": 2.595531940460205, + "learning_rate": 9.434782608695652e-06, + "loss": 0.7101, + "step": 2170 + }, + { + "epoch": 0.29, + "grad_norm": 3.7232606410980225, + "learning_rate": 9.478260869565217e-06, + "loss": 0.7333, + "step": 2180 + }, + { + "epoch": 0.29, + "grad_norm": 2.381574869155884, + "learning_rate": 9.521739130434784e-06, + "loss": 0.7236, + "step": 2190 + }, + { + "epoch": 0.29, + "grad_norm": 3.042024612426758, + "learning_rate": 9.565217391304349e-06, + "loss": 0.7261, + "step": 2200 + }, + { + "epoch": 0.29, + "grad_norm": 2.2856943607330322, + "learning_rate": 9.608695652173914e-06, + "loss": 0.7021, + "step": 2210 + }, + { + "epoch": 0.29, + "grad_norm": 3.454638719558716, + "learning_rate": 9.652173913043478e-06, + "loss": 0.711, + "step": 2220 + }, + { + "epoch": 0.3, + "grad_norm": 2.605741500854492, + "learning_rate": 9.695652173913043e-06, + "loss": 0.7078, + "step": 2230 + }, + { + "epoch": 0.3, + "grad_norm": 3.4367051124572754, + "learning_rate": 9.73913043478261e-06, + "loss": 0.7208, + "step": 2240 + }, + { + "epoch": 0.3, + "grad_norm": 2.4078874588012695, + "learning_rate": 9.782608695652175e-06, + "loss": 0.7148, + "step": 2250 + }, + { + "epoch": 0.3, + "grad_norm": 2.6727590560913086, + "learning_rate": 9.82608695652174e-06, + "loss": 0.7276, + "step": 2260 + }, + { + "epoch": 0.3, + "grad_norm": 3.0248544216156006, + "learning_rate": 9.869565217391304e-06, + "loss": 0.7133, + "step": 2270 + }, + { + "epoch": 0.3, + "grad_norm": 2.7144577503204346, + "learning_rate": 9.913043478260871e-06, + "loss": 0.7101, + "step": 2280 + }, + { + "epoch": 0.3, + "grad_norm": 2.7497832775115967, + "learning_rate": 9.956521739130436e-06, + "loss": 0.692, + "step": 2290 + }, + { + "epoch": 0.3, + "grad_norm": 2.990417957305908, + "learning_rate": 1e-05, + "loss": 0.6961, + "step": 2300 + }, + { + "epoch": 0.3, + "eval_loss": 0.7328751683235168, + "eval_runtime": 199.3608, + "eval_samples_per_second": 55.176, + "eval_steps_per_second": 6.897, + "step": 2300 + }, + { + "epoch": 0.31, + "grad_norm": 2.8692731857299805, + "learning_rate": 9.999994241637783e-06, + "loss": 0.715, + "step": 2310 + }, + { + "epoch": 0.31, + "grad_norm": 2.5786540508270264, + "learning_rate": 9.999976966564394e-06, + "loss": 0.7163, + "step": 2320 + }, + { + "epoch": 0.31, + "grad_norm": 3.327714443206787, + "learning_rate": 9.999948174819623e-06, + "loss": 0.7135, + "step": 2330 + }, + { + "epoch": 0.31, + "grad_norm": 2.9145894050598145, + "learning_rate": 9.999907866469787e-06, + "loss": 0.7054, + "step": 2340 + }, + { + "epoch": 0.31, + "grad_norm": 2.5164294242858887, + "learning_rate": 9.999856041607732e-06, + "loss": 0.7149, + "step": 2350 + }, + { + "epoch": 0.31, + "grad_norm": 3.212944984436035, + "learning_rate": 9.999792700352826e-06, + "loss": 0.7022, + "step": 2360 + }, + { + "epoch": 0.31, + "grad_norm": 2.477055072784424, + "learning_rate": 9.99971784285097e-06, + "loss": 0.7057, + "step": 2370 + }, + { + "epoch": 0.31, + "grad_norm": 2.563532590866089, + "learning_rate": 9.99963146927458e-06, + "loss": 0.7117, + "step": 2380 + }, + { + "epoch": 0.32, + "grad_norm": 3.0468506813049316, + "learning_rate": 9.999533579822611e-06, + "loss": 0.7152, + "step": 2390 + }, + { + "epoch": 0.32, + "grad_norm": 2.904538154602051, + "learning_rate": 9.99942417472053e-06, + "loss": 0.72, + "step": 2400 + }, + { + "epoch": 0.32, + "grad_norm": 2.625180244445801, + "learning_rate": 9.999303254220342e-06, + "loss": 0.7097, + "step": 2410 + }, + { + "epoch": 0.32, + "grad_norm": 2.9058775901794434, + "learning_rate": 9.999170818600562e-06, + "loss": 0.7254, + "step": 2420 + }, + { + "epoch": 0.32, + "grad_norm": 3.0165164470672607, + "learning_rate": 9.999026868166238e-06, + "loss": 0.7132, + "step": 2430 + }, + { + "epoch": 0.32, + "grad_norm": 4.671907424926758, + "learning_rate": 9.998871403248936e-06, + "loss": 0.7191, + "step": 2440 + }, + { + "epoch": 0.32, + "grad_norm": 2.6668975353240967, + "learning_rate": 9.998704424206747e-06, + "loss": 0.7066, + "step": 2450 + }, + { + "epoch": 0.33, + "grad_norm": 3.66217041015625, + "learning_rate": 9.998525931424279e-06, + "loss": 0.6917, + "step": 2460 + }, + { + "epoch": 0.33, + "grad_norm": 2.8258302211761475, + "learning_rate": 9.998335925312666e-06, + "loss": 0.6889, + "step": 2470 + }, + { + "epoch": 0.33, + "grad_norm": 2.8596441745758057, + "learning_rate": 9.998134406309555e-06, + "loss": 0.6997, + "step": 2480 + }, + { + "epoch": 0.33, + "grad_norm": 2.9443325996398926, + "learning_rate": 9.997921374879112e-06, + "loss": 0.7082, + "step": 2490 + }, + { + "epoch": 0.33, + "grad_norm": 4.481228828430176, + "learning_rate": 9.997696831512027e-06, + "loss": 0.7007, + "step": 2500 + }, + { + "epoch": 0.33, + "grad_norm": 3.7334978580474854, + "learning_rate": 9.997460776725497e-06, + "loss": 0.708, + "step": 2510 + }, + { + "epoch": 0.33, + "grad_norm": 2.843071937561035, + "learning_rate": 9.997213211063236e-06, + "loss": 0.7201, + "step": 2520 + }, + { + "epoch": 0.33, + "grad_norm": 3.9357028007507324, + "learning_rate": 9.99695413509548e-06, + "loss": 0.7152, + "step": 2530 + }, + { + "epoch": 0.34, + "grad_norm": 2.709559440612793, + "learning_rate": 9.996683549418964e-06, + "loss": 0.7071, + "step": 2540 + }, + { + "epoch": 0.34, + "grad_norm": 3.1510403156280518, + "learning_rate": 9.996401454656941e-06, + "loss": 0.6963, + "step": 2550 + }, + { + "epoch": 0.34, + "grad_norm": 2.642859697341919, + "learning_rate": 9.996107851459175e-06, + "loss": 0.7107, + "step": 2560 + }, + { + "epoch": 0.34, + "grad_norm": 3.1064438819885254, + "learning_rate": 9.995802740501933e-06, + "loss": 0.7045, + "step": 2570 + }, + { + "epoch": 0.34, + "grad_norm": 2.6383016109466553, + "learning_rate": 9.995486122487992e-06, + "loss": 0.6912, + "step": 2580 + }, + { + "epoch": 0.34, + "grad_norm": 3.1221187114715576, + "learning_rate": 9.995157998146633e-06, + "loss": 0.7, + "step": 2590 + }, + { + "epoch": 0.34, + "grad_norm": 2.3788633346557617, + "learning_rate": 9.994818368233639e-06, + "loss": 0.7152, + "step": 2600 + }, + { + "epoch": 0.35, + "grad_norm": 2.5052289962768555, + "learning_rate": 9.994467233531294e-06, + "loss": 0.7041, + "step": 2610 + }, + { + "epoch": 0.35, + "grad_norm": 2.9792628288269043, + "learning_rate": 9.994104594848383e-06, + "loss": 0.707, + "step": 2620 + }, + { + "epoch": 0.35, + "grad_norm": 2.6907389163970947, + "learning_rate": 9.993730453020187e-06, + "loss": 0.6965, + "step": 2630 + }, + { + "epoch": 0.35, + "grad_norm": 2.660902738571167, + "learning_rate": 9.993344808908486e-06, + "loss": 0.6978, + "step": 2640 + }, + { + "epoch": 0.35, + "grad_norm": 2.5266551971435547, + "learning_rate": 9.992947663401548e-06, + "loss": 0.6938, + "step": 2650 + }, + { + "epoch": 0.35, + "grad_norm": 2.86554217338562, + "learning_rate": 9.99253901741414e-06, + "loss": 0.7002, + "step": 2660 + }, + { + "epoch": 0.35, + "grad_norm": 3.3354568481445312, + "learning_rate": 9.992118871887513e-06, + "loss": 0.7191, + "step": 2670 + }, + { + "epoch": 0.35, + "grad_norm": 2.7831523418426514, + "learning_rate": 9.991687227789407e-06, + "loss": 0.7031, + "step": 2680 + }, + { + "epoch": 0.36, + "grad_norm": 2.882188558578491, + "learning_rate": 9.991244086114046e-06, + "loss": 0.6944, + "step": 2690 + }, + { + "epoch": 0.36, + "grad_norm": 2.857550859451294, + "learning_rate": 9.990789447882136e-06, + "loss": 0.694, + "step": 2700 + }, + { + "epoch": 0.36, + "grad_norm": 2.44887638092041, + "learning_rate": 9.990323314140872e-06, + "loss": 0.7152, + "step": 2710 + }, + { + "epoch": 0.36, + "grad_norm": 4.176514148712158, + "learning_rate": 9.989845685963917e-06, + "loss": 0.7048, + "step": 2720 + }, + { + "epoch": 0.36, + "grad_norm": 2.6085798740386963, + "learning_rate": 9.989356564451415e-06, + "loss": 0.6918, + "step": 2730 + }, + { + "epoch": 0.36, + "grad_norm": 2.8457624912261963, + "learning_rate": 9.988855950729979e-06, + "loss": 0.6992, + "step": 2740 + }, + { + "epoch": 0.36, + "grad_norm": 2.9820759296417236, + "learning_rate": 9.988343845952697e-06, + "loss": 0.708, + "step": 2750 + }, + { + "epoch": 0.37, + "grad_norm": 3.1150028705596924, + "learning_rate": 9.987820251299121e-06, + "loss": 0.6925, + "step": 2760 + }, + { + "epoch": 0.37, + "grad_norm": 2.9244368076324463, + "learning_rate": 9.987285167975274e-06, + "loss": 0.6865, + "step": 2770 + }, + { + "epoch": 0.37, + "grad_norm": 2.4057462215423584, + "learning_rate": 9.986738597213633e-06, + "loss": 0.7015, + "step": 2780 + }, + { + "epoch": 0.37, + "grad_norm": 3.322909116744995, + "learning_rate": 9.986180540273143e-06, + "loss": 0.6832, + "step": 2790 + }, + { + "epoch": 0.37, + "grad_norm": 3.0608110427856445, + "learning_rate": 9.985610998439198e-06, + "loss": 0.6943, + "step": 2800 + }, + { + "epoch": 0.37, + "grad_norm": 4.037482261657715, + "learning_rate": 9.98502997302365e-06, + "loss": 0.6892, + "step": 2810 + }, + { + "epoch": 0.37, + "grad_norm": 3.36313796043396, + "learning_rate": 9.984437465364802e-06, + "loss": 0.6965, + "step": 2820 + }, + { + "epoch": 0.37, + "grad_norm": 2.5192835330963135, + "learning_rate": 9.983833476827404e-06, + "loss": 0.7066, + "step": 2830 + }, + { + "epoch": 0.38, + "grad_norm": 2.7486279010772705, + "learning_rate": 9.983218008802648e-06, + "loss": 0.699, + "step": 2840 + }, + { + "epoch": 0.38, + "grad_norm": 2.654358148574829, + "learning_rate": 9.982591062708172e-06, + "loss": 0.6979, + "step": 2850 + }, + { + "epoch": 0.38, + "grad_norm": 3.2748641967773438, + "learning_rate": 9.981952639988046e-06, + "loss": 0.6991, + "step": 2860 + }, + { + "epoch": 0.38, + "grad_norm": 2.578864812850952, + "learning_rate": 9.98130274211278e-06, + "loss": 0.7049, + "step": 2870 + }, + { + "epoch": 0.38, + "grad_norm": 2.6727378368377686, + "learning_rate": 9.98064137057931e-06, + "loss": 0.7018, + "step": 2880 + }, + { + "epoch": 0.38, + "grad_norm": 3.929105043411255, + "learning_rate": 9.979968526911006e-06, + "loss": 0.7024, + "step": 2890 + }, + { + "epoch": 0.38, + "grad_norm": 3.152024030685425, + "learning_rate": 9.979284212657658e-06, + "loss": 0.6998, + "step": 2900 + }, + { + "epoch": 0.39, + "grad_norm": 2.9024596214294434, + "learning_rate": 9.978588429395475e-06, + "loss": 0.6984, + "step": 2910 + }, + { + "epoch": 0.39, + "grad_norm": 2.836294651031494, + "learning_rate": 9.97788117872709e-06, + "loss": 0.6908, + "step": 2920 + }, + { + "epoch": 0.39, + "grad_norm": 2.5680007934570312, + "learning_rate": 9.977162462281544e-06, + "loss": 0.6976, + "step": 2930 + }, + { + "epoch": 0.39, + "grad_norm": 2.9260566234588623, + "learning_rate": 9.976432281714289e-06, + "loss": 0.7054, + "step": 2940 + }, + { + "epoch": 0.39, + "grad_norm": 2.2062673568725586, + "learning_rate": 9.97569063870718e-06, + "loss": 0.6856, + "step": 2950 + }, + { + "epoch": 0.39, + "grad_norm": 4.240058422088623, + "learning_rate": 9.97493753496848e-06, + "loss": 0.7103, + "step": 2960 + }, + { + "epoch": 0.39, + "grad_norm": 2.477383852005005, + "learning_rate": 9.974172972232845e-06, + "loss": 0.6985, + "step": 2970 + }, + { + "epoch": 0.39, + "grad_norm": 3.088667392730713, + "learning_rate": 9.973396952261327e-06, + "loss": 0.6934, + "step": 2980 + }, + { + "epoch": 0.4, + "grad_norm": 3.2433605194091797, + "learning_rate": 9.972609476841368e-06, + "loss": 0.6985, + "step": 2990 + }, + { + "epoch": 0.4, + "grad_norm": 2.7958269119262695, + "learning_rate": 9.971810547786794e-06, + "loss": 0.6962, + "step": 3000 + }, + { + "epoch": 0.4, + "grad_norm": 2.777493953704834, + "learning_rate": 9.971000166937815e-06, + "loss": 0.6986, + "step": 3010 + }, + { + "epoch": 0.4, + "grad_norm": 3.0154001712799072, + "learning_rate": 9.970178336161018e-06, + "loss": 0.6812, + "step": 3020 + }, + { + "epoch": 0.4, + "grad_norm": 2.762068033218384, + "learning_rate": 9.969345057349365e-06, + "loss": 0.6936, + "step": 3030 + }, + { + "epoch": 0.4, + "grad_norm": 2.7011806964874268, + "learning_rate": 9.96850033242218e-06, + "loss": 0.6913, + "step": 3040 + }, + { + "epoch": 0.4, + "grad_norm": 2.9354515075683594, + "learning_rate": 9.967644163325157e-06, + "loss": 0.6717, + "step": 3050 + }, + { + "epoch": 0.4, + "grad_norm": 2.9377188682556152, + "learning_rate": 9.96677655203035e-06, + "loss": 0.703, + "step": 3060 + }, + { + "epoch": 0.41, + "grad_norm": 3.62003231048584, + "learning_rate": 9.965897500536167e-06, + "loss": 0.6982, + "step": 3070 + }, + { + "epoch": 0.41, + "grad_norm": 3.100145101547241, + "learning_rate": 9.965007010867366e-06, + "loss": 0.6869, + "step": 3080 + }, + { + "epoch": 0.41, + "grad_norm": 3.1306726932525635, + "learning_rate": 9.964105085075053e-06, + "loss": 0.6998, + "step": 3090 + }, + { + "epoch": 0.41, + "grad_norm": 2.943037986755371, + "learning_rate": 9.963191725236672e-06, + "loss": 0.6983, + "step": 3100 + }, + { + "epoch": 0.41, + "grad_norm": 2.641789197921753, + "learning_rate": 9.962266933456008e-06, + "loss": 0.7036, + "step": 3110 + }, + { + "epoch": 0.41, + "grad_norm": 2.6184604167938232, + "learning_rate": 9.961330711863175e-06, + "loss": 0.6847, + "step": 3120 + }, + { + "epoch": 0.41, + "grad_norm": 3.769002914428711, + "learning_rate": 9.960383062614614e-06, + "loss": 0.6908, + "step": 3130 + }, + { + "epoch": 0.42, + "grad_norm": 2.812992811203003, + "learning_rate": 9.959423987893086e-06, + "loss": 0.694, + "step": 3140 + }, + { + "epoch": 0.42, + "grad_norm": 2.8881213665008545, + "learning_rate": 9.958453489907673e-06, + "loss": 0.6891, + "step": 3150 + }, + { + "epoch": 0.42, + "grad_norm": 2.4800655841827393, + "learning_rate": 9.957471570893767e-06, + "loss": 0.6945, + "step": 3160 + }, + { + "epoch": 0.42, + "grad_norm": 2.597376585006714, + "learning_rate": 9.956478233113066e-06, + "loss": 0.6879, + "step": 3170 + }, + { + "epoch": 0.42, + "grad_norm": 2.7456142902374268, + "learning_rate": 9.955473478853567e-06, + "loss": 0.6835, + "step": 3180 + }, + { + "epoch": 0.42, + "grad_norm": 3.177309513092041, + "learning_rate": 9.954457310429569e-06, + "loss": 0.6912, + "step": 3190 + }, + { + "epoch": 0.42, + "grad_norm": 2.0441253185272217, + "learning_rate": 9.953429730181653e-06, + "loss": 0.6797, + "step": 3200 + }, + { + "epoch": 0.42, + "grad_norm": 2.5114927291870117, + "learning_rate": 9.952390740476698e-06, + "loss": 0.6952, + "step": 3210 + }, + { + "epoch": 0.43, + "grad_norm": 2.6748149394989014, + "learning_rate": 9.951340343707852e-06, + "loss": 0.6844, + "step": 3220 + }, + { + "epoch": 0.43, + "grad_norm": 3.3017261028289795, + "learning_rate": 9.95027854229454e-06, + "loss": 0.6827, + "step": 3230 + }, + { + "epoch": 0.43, + "grad_norm": 2.5286686420440674, + "learning_rate": 9.94920533868246e-06, + "loss": 0.6895, + "step": 3240 + }, + { + "epoch": 0.43, + "grad_norm": 2.7270824909210205, + "learning_rate": 9.948120735343566e-06, + "loss": 0.6841, + "step": 3250 + }, + { + "epoch": 0.43, + "grad_norm": 3.2802062034606934, + "learning_rate": 9.947024734776076e-06, + "loss": 0.6866, + "step": 3260 + }, + { + "epoch": 0.43, + "grad_norm": 2.7635345458984375, + "learning_rate": 9.945917339504457e-06, + "loss": 0.702, + "step": 3270 + }, + { + "epoch": 0.43, + "grad_norm": 2.5334112644195557, + "learning_rate": 9.944798552079422e-06, + "loss": 0.7038, + "step": 3280 + }, + { + "epoch": 0.44, + "grad_norm": 3.57198429107666, + "learning_rate": 9.943668375077926e-06, + "loss": 0.6817, + "step": 3290 + }, + { + "epoch": 0.44, + "grad_norm": 3.0424792766571045, + "learning_rate": 9.942526811103153e-06, + "loss": 0.6894, + "step": 3300 + }, + { + "epoch": 0.44, + "grad_norm": 2.894702672958374, + "learning_rate": 9.94137386278452e-06, + "loss": 0.6905, + "step": 3310 + }, + { + "epoch": 0.44, + "grad_norm": 3.1093649864196777, + "learning_rate": 9.940209532777666e-06, + "loss": 0.7126, + "step": 3320 + }, + { + "epoch": 0.44, + "grad_norm": 2.57892107963562, + "learning_rate": 9.939033823764443e-06, + "loss": 0.6727, + "step": 3330 + }, + { + "epoch": 0.44, + "grad_norm": 2.641435146331787, + "learning_rate": 9.937846738452914e-06, + "loss": 0.6899, + "step": 3340 + }, + { + "epoch": 0.44, + "grad_norm": 2.499209403991699, + "learning_rate": 9.93664827957735e-06, + "loss": 0.6804, + "step": 3350 + }, + { + "epoch": 0.44, + "grad_norm": 2.339439868927002, + "learning_rate": 9.93543844989821e-06, + "loss": 0.6684, + "step": 3360 + }, + { + "epoch": 0.45, + "grad_norm": 3.1188161373138428, + "learning_rate": 9.93421725220215e-06, + "loss": 0.6885, + "step": 3370 + }, + { + "epoch": 0.45, + "grad_norm": 2.80849552154541, + "learning_rate": 9.932984689302012e-06, + "loss": 0.6861, + "step": 3380 + }, + { + "epoch": 0.45, + "grad_norm": 2.6763992309570312, + "learning_rate": 9.93174076403681e-06, + "loss": 0.6995, + "step": 3390 + }, + { + "epoch": 0.45, + "grad_norm": 2.9029862880706787, + "learning_rate": 9.930485479271735e-06, + "loss": 0.6881, + "step": 3400 + }, + { + "epoch": 0.45, + "grad_norm": 2.9763121604919434, + "learning_rate": 9.929218837898143e-06, + "loss": 0.6877, + "step": 3410 + }, + { + "epoch": 0.45, + "grad_norm": 2.593538999557495, + "learning_rate": 9.92794084283354e-06, + "loss": 0.6901, + "step": 3420 + }, + { + "epoch": 0.45, + "grad_norm": 3.309509038925171, + "learning_rate": 9.926651497021595e-06, + "loss": 0.6841, + "step": 3430 + }, + { + "epoch": 0.46, + "grad_norm": 3.0623817443847656, + "learning_rate": 9.925350803432112e-06, + "loss": 0.664, + "step": 3440 + }, + { + "epoch": 0.46, + "grad_norm": 4.454458236694336, + "learning_rate": 9.924038765061042e-06, + "loss": 0.6818, + "step": 3450 + }, + { + "epoch": 0.46, + "grad_norm": 3.123232126235962, + "learning_rate": 9.922715384930455e-06, + "loss": 0.6685, + "step": 3460 + }, + { + "epoch": 0.46, + "grad_norm": 3.939410448074341, + "learning_rate": 9.921380666088558e-06, + "loss": 0.6869, + "step": 3470 + }, + { + "epoch": 0.46, + "grad_norm": 5.342121601104736, + "learning_rate": 9.920034611609667e-06, + "loss": 0.6801, + "step": 3480 + }, + { + "epoch": 0.46, + "grad_norm": 3.224928379058838, + "learning_rate": 9.918677224594207e-06, + "loss": 0.6746, + "step": 3490 + }, + { + "epoch": 0.46, + "grad_norm": 2.875549793243408, + "learning_rate": 9.917308508168712e-06, + "loss": 0.6964, + "step": 3500 + }, + { + "epoch": 0.46, + "grad_norm": 3.0345466136932373, + "learning_rate": 9.915928465485805e-06, + "loss": 0.6727, + "step": 3510 + }, + { + "epoch": 0.47, + "grad_norm": 2.9075253009796143, + "learning_rate": 9.914537099724204e-06, + "loss": 0.6823, + "step": 3520 + }, + { + "epoch": 0.47, + "grad_norm": 3.458336591720581, + "learning_rate": 9.913134414088698e-06, + "loss": 0.6884, + "step": 3530 + }, + { + "epoch": 0.47, + "grad_norm": 3.051724910736084, + "learning_rate": 9.911720411810163e-06, + "loss": 0.7009, + "step": 3540 + }, + { + "epoch": 0.47, + "grad_norm": 3.8520309925079346, + "learning_rate": 9.91029509614553e-06, + "loss": 0.6858, + "step": 3550 + }, + { + "epoch": 0.47, + "grad_norm": 3.14030122756958, + "learning_rate": 9.908858470377793e-06, + "loss": 0.6847, + "step": 3560 + }, + { + "epoch": 0.47, + "grad_norm": 3.072479009628296, + "learning_rate": 9.907410537815997e-06, + "loss": 0.7003, + "step": 3570 + }, + { + "epoch": 0.47, + "grad_norm": 4.00950813293457, + "learning_rate": 9.905951301795231e-06, + "loss": 0.673, + "step": 3580 + }, + { + "epoch": 0.48, + "grad_norm": 2.5979695320129395, + "learning_rate": 9.904480765676617e-06, + "loss": 0.685, + "step": 3590 + }, + { + "epoch": 0.48, + "grad_norm": 2.7463061809539795, + "learning_rate": 9.902998932847308e-06, + "loss": 0.6966, + "step": 3600 + }, + { + "epoch": 0.48, + "grad_norm": 2.2453413009643555, + "learning_rate": 9.901505806720474e-06, + "loss": 0.6906, + "step": 3610 + }, + { + "epoch": 0.48, + "grad_norm": 3.755852699279785, + "learning_rate": 9.9000013907353e-06, + "loss": 0.7008, + "step": 3620 + }, + { + "epoch": 0.48, + "grad_norm": 3.0272433757781982, + "learning_rate": 9.89848568835698e-06, + "loss": 0.6912, + "step": 3630 + }, + { + "epoch": 0.48, + "grad_norm": 2.8252182006835938, + "learning_rate": 9.896958703076693e-06, + "loss": 0.6806, + "step": 3640 + }, + { + "epoch": 0.48, + "grad_norm": 3.7654056549072266, + "learning_rate": 9.895420438411616e-06, + "loss": 0.6778, + "step": 3650 + }, + { + "epoch": 0.48, + "grad_norm": 2.8534739017486572, + "learning_rate": 9.8938708979049e-06, + "loss": 0.6842, + "step": 3660 + }, + { + "epoch": 0.49, + "grad_norm": 3.0817458629608154, + "learning_rate": 9.892310085125675e-06, + "loss": 0.686, + "step": 3670 + }, + { + "epoch": 0.49, + "grad_norm": 3.03659725189209, + "learning_rate": 9.890738003669029e-06, + "loss": 0.6858, + "step": 3680 + }, + { + "epoch": 0.49, + "grad_norm": 3.211000919342041, + "learning_rate": 9.889154657156008e-06, + "loss": 0.6809, + "step": 3690 + }, + { + "epoch": 0.49, + "grad_norm": 3.543001413345337, + "learning_rate": 9.887560049233606e-06, + "loss": 0.6956, + "step": 3700 + }, + { + "epoch": 0.49, + "grad_norm": 2.351623058319092, + "learning_rate": 9.885954183574753e-06, + "loss": 0.678, + "step": 3710 + }, + { + "epoch": 0.49, + "grad_norm": 3.029533624649048, + "learning_rate": 9.884337063878313e-06, + "loss": 0.6772, + "step": 3720 + }, + { + "epoch": 0.49, + "grad_norm": 3.0510919094085693, + "learning_rate": 9.882708693869071e-06, + "loss": 0.6707, + "step": 3730 + }, + { + "epoch": 0.49, + "grad_norm": 2.8181586265563965, + "learning_rate": 9.881069077297724e-06, + "loss": 0.6768, + "step": 3740 + }, + { + "epoch": 0.5, + "grad_norm": 3.0697293281555176, + "learning_rate": 9.879418217940872e-06, + "loss": 0.6893, + "step": 3750 + }, + { + "epoch": 0.5, + "grad_norm": 2.8345251083374023, + "learning_rate": 9.877756119601018e-06, + "loss": 0.7028, + "step": 3760 + }, + { + "epoch": 0.5, + "grad_norm": 3.112302780151367, + "learning_rate": 9.876082786106546e-06, + "loss": 0.6914, + "step": 3770 + }, + { + "epoch": 0.5, + "grad_norm": 2.735736608505249, + "learning_rate": 9.87439822131172e-06, + "loss": 0.6748, + "step": 3780 + }, + { + "epoch": 0.5, + "grad_norm": 2.8680014610290527, + "learning_rate": 9.87270242909667e-06, + "loss": 0.6785, + "step": 3790 + }, + { + "epoch": 0.5, + "grad_norm": 2.8195078372955322, + "learning_rate": 9.870995413367397e-06, + "loss": 0.675, + "step": 3800 + }, + { + "epoch": 0.5, + "grad_norm": 2.9144508838653564, + "learning_rate": 9.86927717805574e-06, + "loss": 0.685, + "step": 3810 + }, + { + "epoch": 0.51, + "grad_norm": 2.886000871658325, + "learning_rate": 9.867547727119396e-06, + "loss": 0.6904, + "step": 3820 + }, + { + "epoch": 0.51, + "grad_norm": 3.047471284866333, + "learning_rate": 9.865807064541878e-06, + "loss": 0.6943, + "step": 3830 + }, + { + "epoch": 0.51, + "grad_norm": 2.9526615142822266, + "learning_rate": 9.864055194332538e-06, + "loss": 0.6815, + "step": 3840 + }, + { + "epoch": 0.51, + "grad_norm": 3.2787699699401855, + "learning_rate": 9.862292120526536e-06, + "loss": 0.6791, + "step": 3850 + }, + { + "epoch": 0.51, + "grad_norm": 2.6856937408447266, + "learning_rate": 9.860517847184837e-06, + "loss": 0.6978, + "step": 3860 + }, + { + "epoch": 0.51, + "grad_norm": 2.927518367767334, + "learning_rate": 9.858732378394207e-06, + "loss": 0.6904, + "step": 3870 + }, + { + "epoch": 0.51, + "grad_norm": 4.556071758270264, + "learning_rate": 9.856935718267196e-06, + "loss": 0.6889, + "step": 3880 + }, + { + "epoch": 0.51, + "grad_norm": 2.559556245803833, + "learning_rate": 9.855127870942131e-06, + "loss": 0.69, + "step": 3890 + }, + { + "epoch": 0.52, + "grad_norm": 3.2897391319274902, + "learning_rate": 9.85330884058311e-06, + "loss": 0.6872, + "step": 3900 + }, + { + "epoch": 0.52, + "grad_norm": 2.722827196121216, + "learning_rate": 9.851478631379982e-06, + "loss": 0.6865, + "step": 3910 + }, + { + "epoch": 0.52, + "grad_norm": 3.322338581085205, + "learning_rate": 9.849637247548356e-06, + "loss": 0.6919, + "step": 3920 + }, + { + "epoch": 0.52, + "grad_norm": 2.7876293659210205, + "learning_rate": 9.847784693329571e-06, + "loss": 0.6665, + "step": 3930 + }, + { + "epoch": 0.52, + "grad_norm": 2.6033077239990234, + "learning_rate": 9.845920972990702e-06, + "loss": 0.6801, + "step": 3940 + }, + { + "epoch": 0.52, + "grad_norm": 2.751955032348633, + "learning_rate": 9.844046090824533e-06, + "loss": 0.667, + "step": 3950 + }, + { + "epoch": 0.52, + "grad_norm": 2.8029513359069824, + "learning_rate": 9.842160051149568e-06, + "loss": 0.6841, + "step": 3960 + }, + { + "epoch": 0.53, + "grad_norm": 3.345020294189453, + "learning_rate": 9.840262858310007e-06, + "loss": 0.684, + "step": 3970 + }, + { + "epoch": 0.53, + "grad_norm": 2.843904495239258, + "learning_rate": 9.83835451667574e-06, + "loss": 0.6878, + "step": 3980 + }, + { + "epoch": 0.53, + "grad_norm": 2.4852325916290283, + "learning_rate": 9.836435030642335e-06, + "loss": 0.7087, + "step": 3990 + }, + { + "epoch": 0.53, + "grad_norm": 3.3790409564971924, + "learning_rate": 9.834504404631032e-06, + "loss": 0.6913, + "step": 4000 + }, + { + "epoch": 0.53, + "grad_norm": 3.407959461212158, + "learning_rate": 9.832562643088724e-06, + "loss": 0.6912, + "step": 4010 + }, + { + "epoch": 0.53, + "grad_norm": 2.723505973815918, + "learning_rate": 9.830609750487963e-06, + "loss": 0.6927, + "step": 4020 + }, + { + "epoch": 0.53, + "grad_norm": 3.1616594791412354, + "learning_rate": 9.82864573132693e-06, + "loss": 0.6714, + "step": 4030 + }, + { + "epoch": 0.53, + "grad_norm": 3.0670251846313477, + "learning_rate": 9.826670590129442e-06, + "loss": 0.6685, + "step": 4040 + }, + { + "epoch": 0.54, + "grad_norm": 2.4355831146240234, + "learning_rate": 9.824684331444926e-06, + "loss": 0.6839, + "step": 4050 + }, + { + "epoch": 0.54, + "grad_norm": 2.7502357959747314, + "learning_rate": 9.822686959848425e-06, + "loss": 0.6925, + "step": 4060 + }, + { + "epoch": 0.54, + "grad_norm": 3.705983877182007, + "learning_rate": 9.820678479940573e-06, + "loss": 0.6715, + "step": 4070 + }, + { + "epoch": 0.54, + "grad_norm": 2.4137468338012695, + "learning_rate": 9.818658896347591e-06, + "loss": 0.6882, + "step": 4080 + }, + { + "epoch": 0.54, + "grad_norm": 3.260124921798706, + "learning_rate": 9.81662821372128e-06, + "loss": 0.684, + "step": 4090 + }, + { + "epoch": 0.54, + "grad_norm": 3.2827420234680176, + "learning_rate": 9.814586436738998e-06, + "loss": 0.675, + "step": 4100 + }, + { + "epoch": 0.54, + "grad_norm": 2.2685201168060303, + "learning_rate": 9.812533570103663e-06, + "loss": 0.6636, + "step": 4110 + }, + { + "epoch": 0.55, + "grad_norm": 3.1365833282470703, + "learning_rate": 9.810469618543737e-06, + "loss": 0.6911, + "step": 4120 + }, + { + "epoch": 0.55, + "grad_norm": 2.9192795753479004, + "learning_rate": 9.808394586813209e-06, + "loss": 0.6955, + "step": 4130 + }, + { + "epoch": 0.55, + "grad_norm": 2.4191370010375977, + "learning_rate": 9.806308479691595e-06, + "loss": 0.6769, + "step": 4140 + }, + { + "epoch": 0.55, + "grad_norm": 4.024157524108887, + "learning_rate": 9.804211301983919e-06, + "loss": 0.6837, + "step": 4150 + }, + { + "epoch": 0.55, + "grad_norm": 2.6173815727233887, + "learning_rate": 9.802103058520704e-06, + "loss": 0.6703, + "step": 4160 + }, + { + "epoch": 0.55, + "grad_norm": 2.637032985687256, + "learning_rate": 9.799983754157961e-06, + "loss": 0.681, + "step": 4170 + }, + { + "epoch": 0.55, + "grad_norm": 2.4752278327941895, + "learning_rate": 9.797853393777182e-06, + "loss": 0.6667, + "step": 4180 + }, + { + "epoch": 0.55, + "grad_norm": 3.132769823074341, + "learning_rate": 9.795711982285317e-06, + "loss": 0.6903, + "step": 4190 + }, + { + "epoch": 0.56, + "grad_norm": 2.7234723567962646, + "learning_rate": 9.793559524614779e-06, + "loss": 0.6763, + "step": 4200 + }, + { + "epoch": 0.56, + "grad_norm": 2.6191039085388184, + "learning_rate": 9.791396025723418e-06, + "loss": 0.6732, + "step": 4210 + }, + { + "epoch": 0.56, + "grad_norm": 2.8403711318969727, + "learning_rate": 9.78922149059452e-06, + "loss": 0.676, + "step": 4220 + }, + { + "epoch": 0.56, + "grad_norm": 3.4012792110443115, + "learning_rate": 9.787035924236789e-06, + "loss": 0.6576, + "step": 4230 + }, + { + "epoch": 0.56, + "grad_norm": 3.256134271621704, + "learning_rate": 9.784839331684338e-06, + "loss": 0.7017, + "step": 4240 + }, + { + "epoch": 0.56, + "grad_norm": 2.8171510696411133, + "learning_rate": 9.782631717996675e-06, + "loss": 0.6764, + "step": 4250 + }, + { + "epoch": 0.56, + "grad_norm": 3.248218297958374, + "learning_rate": 9.780413088258698e-06, + "loss": 0.6807, + "step": 4260 + }, + { + "epoch": 0.57, + "grad_norm": 3.497915267944336, + "learning_rate": 9.778183447580675e-06, + "loss": 0.6714, + "step": 4270 + }, + { + "epoch": 0.57, + "grad_norm": 3.140228509902954, + "learning_rate": 9.775942801098241e-06, + "loss": 0.7066, + "step": 4280 + }, + { + "epoch": 0.57, + "grad_norm": 3.035100221633911, + "learning_rate": 9.773691153972375e-06, + "loss": 0.6803, + "step": 4290 + }, + { + "epoch": 0.57, + "grad_norm": 2.7118821144104004, + "learning_rate": 9.771428511389395e-06, + "loss": 0.6755, + "step": 4300 + }, + { + "epoch": 0.57, + "grad_norm": 3.035815477371216, + "learning_rate": 9.76915487856095e-06, + "loss": 0.6707, + "step": 4310 + }, + { + "epoch": 0.57, + "grad_norm": 3.3597121238708496, + "learning_rate": 9.766870260724e-06, + "loss": 0.6781, + "step": 4320 + }, + { + "epoch": 0.57, + "grad_norm": 3.151930809020996, + "learning_rate": 9.764574663140807e-06, + "loss": 0.6644, + "step": 4330 + }, + { + "epoch": 0.57, + "grad_norm": 2.971132278442383, + "learning_rate": 9.762268091098926e-06, + "loss": 0.6747, + "step": 4340 + }, + { + "epoch": 0.58, + "grad_norm": 2.8724772930145264, + "learning_rate": 9.759950549911185e-06, + "loss": 0.6802, + "step": 4350 + }, + { + "epoch": 0.58, + "grad_norm": 3.2583391666412354, + "learning_rate": 9.757622044915682e-06, + "loss": 0.6958, + "step": 4360 + }, + { + "epoch": 0.58, + "grad_norm": 2.8724751472473145, + "learning_rate": 9.755282581475769e-06, + "loss": 0.6673, + "step": 4370 + }, + { + "epoch": 0.58, + "grad_norm": 2.53116774559021, + "learning_rate": 9.752932164980033e-06, + "loss": 0.6771, + "step": 4380 + }, + { + "epoch": 0.58, + "grad_norm": 2.5583338737487793, + "learning_rate": 9.750570800842298e-06, + "loss": 0.6835, + "step": 4390 + }, + { + "epoch": 0.58, + "grad_norm": 2.7628743648529053, + "learning_rate": 9.748198494501598e-06, + "loss": 0.6759, + "step": 4400 + }, + { + "epoch": 0.58, + "grad_norm": 2.4657537937164307, + "learning_rate": 9.74581525142217e-06, + "loss": 0.6912, + "step": 4410 + }, + { + "epoch": 0.58, + "grad_norm": 2.4423508644104004, + "learning_rate": 9.74342107709345e-06, + "loss": 0.6584, + "step": 4420 + }, + { + "epoch": 0.59, + "grad_norm": 2.9297876358032227, + "learning_rate": 9.741015977030046e-06, + "loss": 0.6898, + "step": 4430 + }, + { + "epoch": 0.59, + "grad_norm": 4.085515022277832, + "learning_rate": 9.73859995677173e-06, + "loss": 0.6586, + "step": 4440 + }, + { + "epoch": 0.59, + "grad_norm": 3.312915802001953, + "learning_rate": 9.736173021883433e-06, + "loss": 0.6819, + "step": 4450 + }, + { + "epoch": 0.59, + "grad_norm": 2.7743465900421143, + "learning_rate": 9.733735177955219e-06, + "loss": 0.6621, + "step": 4460 + }, + { + "epoch": 0.59, + "grad_norm": 3.45530366897583, + "learning_rate": 9.73128643060229e-06, + "loss": 0.6838, + "step": 4470 + }, + { + "epoch": 0.59, + "grad_norm": 2.886432409286499, + "learning_rate": 9.728826785464948e-06, + "loss": 0.6859, + "step": 4480 + }, + { + "epoch": 0.59, + "grad_norm": 4.044523239135742, + "learning_rate": 9.72635624820861e-06, + "loss": 0.6832, + "step": 4490 + }, + { + "epoch": 0.6, + "grad_norm": 2.746330499649048, + "learning_rate": 9.72387482452377e-06, + "loss": 0.6917, + "step": 4500 + }, + { + "epoch": 0.6, + "grad_norm": 3.0390594005584717, + "learning_rate": 9.72138252012601e-06, + "loss": 0.6656, + "step": 4510 + }, + { + "epoch": 0.6, + "grad_norm": 2.8167319297790527, + "learning_rate": 9.71887934075596e-06, + "loss": 0.6885, + "step": 4520 + }, + { + "epoch": 0.6, + "grad_norm": 2.6761083602905273, + "learning_rate": 9.716365292179309e-06, + "loss": 0.6942, + "step": 4530 + }, + { + "epoch": 0.6, + "grad_norm": 2.7335264682769775, + "learning_rate": 9.713840380186774e-06, + "loss": 0.684, + "step": 4540 + }, + { + "epoch": 0.6, + "grad_norm": 3.458198070526123, + "learning_rate": 9.711304610594104e-06, + "loss": 0.692, + "step": 4550 + }, + { + "epoch": 0.6, + "grad_norm": 2.902730703353882, + "learning_rate": 9.708757989242046e-06, + "loss": 0.6638, + "step": 4560 + }, + { + "epoch": 0.6, + "grad_norm": 2.5396640300750732, + "learning_rate": 9.706200521996348e-06, + "loss": 0.69, + "step": 4570 + }, + { + "epoch": 0.61, + "grad_norm": 2.7966129779815674, + "learning_rate": 9.703632214747742e-06, + "loss": 0.6832, + "step": 4580 + }, + { + "epoch": 0.61, + "grad_norm": 2.746511697769165, + "learning_rate": 9.701053073411923e-06, + "loss": 0.6749, + "step": 4590 + }, + { + "epoch": 0.61, + "grad_norm": 3.5651538372039795, + "learning_rate": 9.698463103929542e-06, + "loss": 0.6722, + "step": 4600 + }, + { + "epoch": 0.61, + "eval_loss": 0.7272596955299377, + "eval_runtime": 198.5906, + "eval_samples_per_second": 55.39, + "eval_steps_per_second": 6.924, + "step": 4600 + }, + { + "epoch": 0.61, + "grad_norm": 2.358306407928467, + "learning_rate": 9.695862312266195e-06, + "loss": 0.6808, + "step": 4610 + }, + { + "epoch": 0.61, + "grad_norm": 2.843318223953247, + "learning_rate": 9.6932507044124e-06, + "loss": 0.6852, + "step": 4620 + }, + { + "epoch": 0.61, + "grad_norm": 2.2861886024475098, + "learning_rate": 9.690628286383593e-06, + "loss": 0.6736, + "step": 4630 + }, + { + "epoch": 0.61, + "grad_norm": 2.2561638355255127, + "learning_rate": 9.687995064220102e-06, + "loss": 0.6789, + "step": 4640 + }, + { + "epoch": 0.62, + "grad_norm": 2.8437678813934326, + "learning_rate": 9.685351043987151e-06, + "loss": 0.6758, + "step": 4650 + }, + { + "epoch": 0.62, + "grad_norm": 2.547785758972168, + "learning_rate": 9.682696231774829e-06, + "loss": 0.6855, + "step": 4660 + }, + { + "epoch": 0.62, + "grad_norm": 2.6620736122131348, + "learning_rate": 9.680030633698083e-06, + "loss": 0.6711, + "step": 4670 + }, + { + "epoch": 0.62, + "grad_norm": 3.146510362625122, + "learning_rate": 9.677354255896706e-06, + "loss": 0.6641, + "step": 4680 + }, + { + "epoch": 0.62, + "grad_norm": 2.8901259899139404, + "learning_rate": 9.674667104535318e-06, + "loss": 0.6898, + "step": 4690 + }, + { + "epoch": 0.62, + "grad_norm": 2.88307523727417, + "learning_rate": 9.671969185803357e-06, + "loss": 0.6848, + "step": 4700 + }, + { + "epoch": 0.62, + "grad_norm": 2.6471335887908936, + "learning_rate": 9.669260505915057e-06, + "loss": 0.668, + "step": 4710 + }, + { + "epoch": 0.62, + "grad_norm": 2.9674232006073, + "learning_rate": 9.666541071109446e-06, + "loss": 0.6849, + "step": 4720 + }, + { + "epoch": 0.63, + "grad_norm": 2.370706081390381, + "learning_rate": 9.66381088765032e-06, + "loss": 0.6819, + "step": 4730 + }, + { + "epoch": 0.63, + "grad_norm": 2.7703073024749756, + "learning_rate": 9.661069961826228e-06, + "loss": 0.6674, + "step": 4740 + }, + { + "epoch": 0.63, + "grad_norm": 2.9082021713256836, + "learning_rate": 9.658318299950473e-06, + "loss": 0.6833, + "step": 4750 + }, + { + "epoch": 0.63, + "grad_norm": 3.0396716594696045, + "learning_rate": 9.65555590836108e-06, + "loss": 0.6657, + "step": 4760 + }, + { + "epoch": 0.63, + "grad_norm": 2.304875373840332, + "learning_rate": 9.652782793420789e-06, + "loss": 0.6964, + "step": 4770 + }, + { + "epoch": 0.63, + "grad_norm": 2.8251640796661377, + "learning_rate": 9.64999896151704e-06, + "loss": 0.6774, + "step": 4780 + }, + { + "epoch": 0.63, + "grad_norm": 3.527707815170288, + "learning_rate": 9.647204419061957e-06, + "loss": 0.6778, + "step": 4790 + }, + { + "epoch": 0.64, + "grad_norm": 2.566895008087158, + "learning_rate": 9.644399172492337e-06, + "loss": 0.681, + "step": 4800 + }, + { + "epoch": 0.64, + "grad_norm": 3.0783915519714355, + "learning_rate": 9.641583228269629e-06, + "loss": 0.6744, + "step": 4810 + }, + { + "epoch": 0.64, + "grad_norm": 2.982912302017212, + "learning_rate": 9.638756592879923e-06, + "loss": 0.6849, + "step": 4820 + }, + { + "epoch": 0.64, + "grad_norm": 2.7487356662750244, + "learning_rate": 9.635919272833938e-06, + "loss": 0.6709, + "step": 4830 + }, + { + "epoch": 0.64, + "grad_norm": 3.3017807006835938, + "learning_rate": 9.633071274666998e-06, + "loss": 0.6698, + "step": 4840 + }, + { + "epoch": 0.64, + "grad_norm": 2.7575645446777344, + "learning_rate": 9.630212604939026e-06, + "loss": 0.6823, + "step": 4850 + }, + { + "epoch": 0.64, + "grad_norm": 3.032663345336914, + "learning_rate": 9.627343270234526e-06, + "loss": 0.6754, + "step": 4860 + }, + { + "epoch": 0.64, + "grad_norm": 2.4695844650268555, + "learning_rate": 9.624463277162563e-06, + "loss": 0.6793, + "step": 4870 + }, + { + "epoch": 0.65, + "grad_norm": 2.7239301204681396, + "learning_rate": 9.621572632356754e-06, + "loss": 0.7041, + "step": 4880 + }, + { + "epoch": 0.65, + "grad_norm": 2.497579336166382, + "learning_rate": 9.618671342475252e-06, + "loss": 0.694, + "step": 4890 + }, + { + "epoch": 0.65, + "grad_norm": 2.7662250995635986, + "learning_rate": 9.615759414200729e-06, + "loss": 0.6739, + "step": 4900 + }, + { + "epoch": 0.65, + "grad_norm": 3.1290366649627686, + "learning_rate": 9.61283685424036e-06, + "loss": 0.6802, + "step": 4910 + }, + { + "epoch": 0.65, + "grad_norm": 2.9241154193878174, + "learning_rate": 9.609903669325807e-06, + "loss": 0.6859, + "step": 4920 + }, + { + "epoch": 0.65, + "grad_norm": 2.5501949787139893, + "learning_rate": 9.606959866213206e-06, + "loss": 0.6608, + "step": 4930 + }, + { + "epoch": 0.65, + "grad_norm": 3.447155475616455, + "learning_rate": 9.604005451683154e-06, + "loss": 0.6813, + "step": 4940 + }, + { + "epoch": 0.66, + "grad_norm": 2.4963529109954834, + "learning_rate": 9.601040432540684e-06, + "loss": 0.6743, + "step": 4950 + }, + { + "epoch": 0.66, + "grad_norm": 2.8612847328186035, + "learning_rate": 9.598064815615259e-06, + "loss": 0.6614, + "step": 4960 + }, + { + "epoch": 0.66, + "grad_norm": 3.20935320854187, + "learning_rate": 9.59507860776075e-06, + "loss": 0.6781, + "step": 4970 + }, + { + "epoch": 0.66, + "grad_norm": 2.885199546813965, + "learning_rate": 9.592081815855425e-06, + "loss": 0.6738, + "step": 4980 + }, + { + "epoch": 0.66, + "grad_norm": 3.2153961658477783, + "learning_rate": 9.589074446801928e-06, + "loss": 0.68, + "step": 4990 + }, + { + "epoch": 0.66, + "grad_norm": 2.5133445262908936, + "learning_rate": 9.586056507527266e-06, + "loss": 0.6822, + "step": 5000 + }, + { + "epoch": 0.66, + "grad_norm": 3.0179901123046875, + "learning_rate": 9.583028004982798e-06, + "loss": 0.675, + "step": 5010 + }, + { + "epoch": 0.66, + "grad_norm": 3.4788663387298584, + "learning_rate": 9.579988946144205e-06, + "loss": 0.6832, + "step": 5020 + }, + { + "epoch": 0.67, + "grad_norm": 2.847745418548584, + "learning_rate": 9.57693933801149e-06, + "loss": 0.6662, + "step": 5030 + }, + { + "epoch": 0.67, + "grad_norm": 2.8910322189331055, + "learning_rate": 9.573879187608954e-06, + "loss": 0.6732, + "step": 5040 + }, + { + "epoch": 0.67, + "grad_norm": 2.582803726196289, + "learning_rate": 9.570808501985176e-06, + "loss": 0.6782, + "step": 5050 + }, + { + "epoch": 0.67, + "grad_norm": 2.1921629905700684, + "learning_rate": 9.567727288213005e-06, + "loss": 0.6881, + "step": 5060 + }, + { + "epoch": 0.67, + "grad_norm": 2.068142890930176, + "learning_rate": 9.56463555338954e-06, + "loss": 0.6717, + "step": 5070 + }, + { + "epoch": 0.67, + "grad_norm": 2.0534846782684326, + "learning_rate": 9.561533304636111e-06, + "loss": 0.6575, + "step": 5080 + }, + { + "epoch": 0.67, + "grad_norm": 2.8238446712493896, + "learning_rate": 9.558420549098269e-06, + "loss": 0.6842, + "step": 5090 + }, + { + "epoch": 0.67, + "grad_norm": 3.434664249420166, + "learning_rate": 9.55529729394576e-06, + "loss": 0.6789, + "step": 5100 + }, + { + "epoch": 0.68, + "grad_norm": 2.864886522293091, + "learning_rate": 9.552163546372521e-06, + "loss": 0.6707, + "step": 5110 + }, + { + "epoch": 0.68, + "grad_norm": 2.919512987136841, + "learning_rate": 9.549019313596652e-06, + "loss": 0.675, + "step": 5120 + }, + { + "epoch": 0.68, + "grad_norm": 2.967341899871826, + "learning_rate": 9.545864602860406e-06, + "loss": 0.6915, + "step": 5130 + }, + { + "epoch": 0.68, + "grad_norm": 4.124980449676514, + "learning_rate": 9.542699421430169e-06, + "loss": 0.6707, + "step": 5140 + }, + { + "epoch": 0.68, + "grad_norm": 2.38964581489563, + "learning_rate": 9.539523776596446e-06, + "loss": 0.6779, + "step": 5150 + }, + { + "epoch": 0.68, + "grad_norm": 2.722057580947876, + "learning_rate": 9.536337675673842e-06, + "loss": 0.6912, + "step": 5160 + }, + { + "epoch": 0.68, + "grad_norm": 4.076712131500244, + "learning_rate": 9.533141126001048e-06, + "loss": 0.6835, + "step": 5170 + }, + { + "epoch": 0.69, + "grad_norm": 2.521733522415161, + "learning_rate": 9.529934134940819e-06, + "loss": 0.6741, + "step": 5180 + }, + { + "epoch": 0.69, + "grad_norm": 2.929415464401245, + "learning_rate": 9.526716709879961e-06, + "loss": 0.6681, + "step": 5190 + }, + { + "epoch": 0.69, + "grad_norm": 2.9488470554351807, + "learning_rate": 9.523488858229313e-06, + "loss": 0.6695, + "step": 5200 + }, + { + "epoch": 0.69, + "grad_norm": 2.68656849861145, + "learning_rate": 9.520250587423733e-06, + "loss": 0.6791, + "step": 5210 + }, + { + "epoch": 0.69, + "grad_norm": 2.4024336338043213, + "learning_rate": 9.517001904922074e-06, + "loss": 0.6861, + "step": 5220 + }, + { + "epoch": 0.69, + "grad_norm": 2.476743221282959, + "learning_rate": 9.513742818207173e-06, + "loss": 0.6895, + "step": 5230 + }, + { + "epoch": 0.69, + "grad_norm": 2.6581668853759766, + "learning_rate": 9.510473334785828e-06, + "loss": 0.677, + "step": 5240 + }, + { + "epoch": 0.69, + "grad_norm": 2.596719264984131, + "learning_rate": 9.507193462188791e-06, + "loss": 0.6842, + "step": 5250 + }, + { + "epoch": 0.7, + "grad_norm": 2.334949016571045, + "learning_rate": 9.503903207970735e-06, + "loss": 0.6732, + "step": 5260 + }, + { + "epoch": 0.7, + "grad_norm": 2.867070436477661, + "learning_rate": 9.500602579710256e-06, + "loss": 0.6676, + "step": 5270 + }, + { + "epoch": 0.7, + "grad_norm": 3.4152629375457764, + "learning_rate": 9.497291585009834e-06, + "loss": 0.6618, + "step": 5280 + }, + { + "epoch": 0.7, + "grad_norm": 2.69191837310791, + "learning_rate": 9.493970231495836e-06, + "loss": 0.6822, + "step": 5290 + }, + { + "epoch": 0.7, + "grad_norm": 3.0387730598449707, + "learning_rate": 9.490638526818482e-06, + "loss": 0.6809, + "step": 5300 + }, + { + "epoch": 0.7, + "grad_norm": 2.516139268875122, + "learning_rate": 9.487296478651838e-06, + "loss": 0.682, + "step": 5310 + }, + { + "epoch": 0.7, + "grad_norm": 2.8808937072753906, + "learning_rate": 9.48394409469379e-06, + "loss": 0.6829, + "step": 5320 + }, + { + "epoch": 0.71, + "grad_norm": 3.1148269176483154, + "learning_rate": 9.480581382666041e-06, + "loss": 0.666, + "step": 5330 + }, + { + "epoch": 0.71, + "grad_norm": 2.933945894241333, + "learning_rate": 9.477208350314072e-06, + "loss": 0.6554, + "step": 5340 + }, + { + "epoch": 0.71, + "grad_norm": 2.9011828899383545, + "learning_rate": 9.47382500540714e-06, + "loss": 0.6724, + "step": 5350 + }, + { + "epoch": 0.71, + "grad_norm": 3.7872607707977295, + "learning_rate": 9.470431355738257e-06, + "loss": 0.6785, + "step": 5360 + }, + { + "epoch": 0.71, + "grad_norm": 2.19966197013855, + "learning_rate": 9.467027409124167e-06, + "loss": 0.6767, + "step": 5370 + }, + { + "epoch": 0.71, + "grad_norm": 2.590169668197632, + "learning_rate": 9.463613173405335e-06, + "loss": 0.6587, + "step": 5380 + }, + { + "epoch": 0.71, + "grad_norm": 3.284235954284668, + "learning_rate": 9.460188656445921e-06, + "loss": 0.6819, + "step": 5390 + }, + { + "epoch": 0.71, + "grad_norm": 2.668703556060791, + "learning_rate": 9.45675386613377e-06, + "loss": 0.6675, + "step": 5400 + }, + { + "epoch": 0.72, + "grad_norm": 3.212360382080078, + "learning_rate": 9.453308810380388e-06, + "loss": 0.6832, + "step": 5410 + }, + { + "epoch": 0.72, + "grad_norm": 3.038121461868286, + "learning_rate": 9.449853497120928e-06, + "loss": 0.6987, + "step": 5420 + }, + { + "epoch": 0.72, + "grad_norm": 2.2262122631073, + "learning_rate": 9.446387934314167e-06, + "loss": 0.6688, + "step": 5430 + }, + { + "epoch": 0.72, + "grad_norm": 2.7394657135009766, + "learning_rate": 9.442912129942491e-06, + "loss": 0.6788, + "step": 5440 + }, + { + "epoch": 0.72, + "grad_norm": 2.70332932472229, + "learning_rate": 9.439426092011877e-06, + "loss": 0.671, + "step": 5450 + }, + { + "epoch": 0.72, + "grad_norm": 2.807642936706543, + "learning_rate": 9.435929828551872e-06, + "loss": 0.6748, + "step": 5460 + }, + { + "epoch": 0.72, + "grad_norm": 3.0139355659484863, + "learning_rate": 9.432423347615578e-06, + "loss": 0.6723, + "step": 5470 + }, + { + "epoch": 0.73, + "grad_norm": 3.2098162174224854, + "learning_rate": 9.428906657279629e-06, + "loss": 0.6717, + "step": 5480 + }, + { + "epoch": 0.73, + "grad_norm": 2.6355557441711426, + "learning_rate": 9.425379765644174e-06, + "loss": 0.6816, + "step": 5490 + }, + { + "epoch": 0.73, + "grad_norm": 2.0204873085021973, + "learning_rate": 9.421842680832862e-06, + "loss": 0.6671, + "step": 5500 + }, + { + "epoch": 0.73, + "grad_norm": 3.1098544597625732, + "learning_rate": 9.418295410992821e-06, + "loss": 0.6911, + "step": 5510 + }, + { + "epoch": 0.73, + "grad_norm": 3.0662097930908203, + "learning_rate": 9.414737964294636e-06, + "loss": 0.6846, + "step": 5520 + }, + { + "epoch": 0.73, + "grad_norm": 3.711937665939331, + "learning_rate": 9.411170348932333e-06, + "loss": 0.6731, + "step": 5530 + }, + { + "epoch": 0.73, + "grad_norm": 2.982342481613159, + "learning_rate": 9.407592573123359e-06, + "loss": 0.6747, + "step": 5540 + }, + { + "epoch": 0.73, + "grad_norm": 2.78140926361084, + "learning_rate": 9.40400464510857e-06, + "loss": 0.6787, + "step": 5550 + }, + { + "epoch": 0.74, + "grad_norm": 3.521254539489746, + "learning_rate": 9.400406573152196e-06, + "loss": 0.6891, + "step": 5560 + }, + { + "epoch": 0.74, + "grad_norm": 2.704249620437622, + "learning_rate": 9.396798365541841e-06, + "loss": 0.6823, + "step": 5570 + }, + { + "epoch": 0.74, + "grad_norm": 2.6704928874969482, + "learning_rate": 9.393180030588454e-06, + "loss": 0.6814, + "step": 5580 + }, + { + "epoch": 0.74, + "grad_norm": 2.9716169834136963, + "learning_rate": 9.389551576626303e-06, + "loss": 0.6786, + "step": 5590 + }, + { + "epoch": 0.74, + "grad_norm": 2.880004405975342, + "learning_rate": 9.385913012012972e-06, + "loss": 0.6775, + "step": 5600 + }, + { + "epoch": 0.74, + "grad_norm": 2.5522663593292236, + "learning_rate": 9.382264345129329e-06, + "loss": 0.6827, + "step": 5610 + }, + { + "epoch": 0.74, + "grad_norm": 2.8483026027679443, + "learning_rate": 9.378605584379515e-06, + "loss": 0.656, + "step": 5620 + }, + { + "epoch": 0.75, + "grad_norm": 2.7561230659484863, + "learning_rate": 9.374936738190913e-06, + "loss": 0.6694, + "step": 5630 + }, + { + "epoch": 0.75, + "grad_norm": 3.149887800216675, + "learning_rate": 9.371257815014145e-06, + "loss": 0.6782, + "step": 5640 + }, + { + "epoch": 0.75, + "grad_norm": 2.629521369934082, + "learning_rate": 9.367568823323039e-06, + "loss": 0.6758, + "step": 5650 + }, + { + "epoch": 0.75, + "grad_norm": 3.5836355686187744, + "learning_rate": 9.363869771614615e-06, + "loss": 0.6738, + "step": 5660 + }, + { + "epoch": 0.75, + "grad_norm": 2.83819317817688, + "learning_rate": 9.360160668409063e-06, + "loss": 0.6734, + "step": 5670 + }, + { + "epoch": 0.75, + "grad_norm": 2.5035979747772217, + "learning_rate": 9.35644152224973e-06, + "loss": 0.6804, + "step": 5680 + }, + { + "epoch": 0.75, + "grad_norm": 2.4892172813415527, + "learning_rate": 9.35271234170309e-06, + "loss": 0.6599, + "step": 5690 + }, + { + "epoch": 0.75, + "grad_norm": 3.170015811920166, + "learning_rate": 9.348973135358734e-06, + "loss": 0.6771, + "step": 5700 + }, + { + "epoch": 0.76, + "grad_norm": 2.2458276748657227, + "learning_rate": 9.345223911829343e-06, + "loss": 0.6785, + "step": 5710 + }, + { + "epoch": 0.76, + "grad_norm": 2.567450761795044, + "learning_rate": 9.341464679750669e-06, + "loss": 0.6732, + "step": 5720 + }, + { + "epoch": 0.76, + "grad_norm": 2.638319730758667, + "learning_rate": 9.337695447781525e-06, + "loss": 0.6753, + "step": 5730 + }, + { + "epoch": 0.76, + "grad_norm": 3.1988255977630615, + "learning_rate": 9.333916224603747e-06, + "loss": 0.6776, + "step": 5740 + }, + { + "epoch": 0.76, + "grad_norm": 2.337787389755249, + "learning_rate": 9.330127018922195e-06, + "loss": 0.6766, + "step": 5750 + }, + { + "epoch": 0.76, + "grad_norm": 2.9823319911956787, + "learning_rate": 9.326327839464711e-06, + "loss": 0.6749, + "step": 5760 + }, + { + "epoch": 0.76, + "grad_norm": 2.39164662361145, + "learning_rate": 9.322518694982119e-06, + "loss": 0.6703, + "step": 5770 + }, + { + "epoch": 0.76, + "grad_norm": 2.7940139770507812, + "learning_rate": 9.318699594248192e-06, + "loss": 0.6612, + "step": 5780 + }, + { + "epoch": 0.77, + "grad_norm": 2.8293371200561523, + "learning_rate": 9.314870546059636e-06, + "loss": 0.6598, + "step": 5790 + }, + { + "epoch": 0.77, + "grad_norm": 2.8672988414764404, + "learning_rate": 9.311031559236067e-06, + "loss": 0.6811, + "step": 5800 + }, + { + "epoch": 0.77, + "grad_norm": 2.592622995376587, + "learning_rate": 9.307182642620001e-06, + "loss": 0.6857, + "step": 5810 + }, + { + "epoch": 0.77, + "grad_norm": 3.7279140949249268, + "learning_rate": 9.303323805076816e-06, + "loss": 0.6606, + "step": 5820 + }, + { + "epoch": 0.77, + "grad_norm": 3.0750820636749268, + "learning_rate": 9.299455055494747e-06, + "loss": 0.6766, + "step": 5830 + }, + { + "epoch": 0.77, + "grad_norm": 3.0468876361846924, + "learning_rate": 9.295576402784858e-06, + "loss": 0.6675, + "step": 5840 + }, + { + "epoch": 0.77, + "grad_norm": 3.2743566036224365, + "learning_rate": 9.291687855881027e-06, + "loss": 0.6842, + "step": 5850 + }, + { + "epoch": 0.78, + "grad_norm": 3.0007123947143555, + "learning_rate": 9.287789423739915e-06, + "loss": 0.6631, + "step": 5860 + }, + { + "epoch": 0.78, + "grad_norm": 2.861750602722168, + "learning_rate": 9.283881115340957e-06, + "loss": 0.6624, + "step": 5870 + }, + { + "epoch": 0.78, + "grad_norm": 2.6511833667755127, + "learning_rate": 9.279962939686333e-06, + "loss": 0.6735, + "step": 5880 + }, + { + "epoch": 0.78, + "grad_norm": 2.709005832672119, + "learning_rate": 9.276034905800957e-06, + "loss": 0.6769, + "step": 5890 + }, + { + "epoch": 0.78, + "grad_norm": 3.223933696746826, + "learning_rate": 9.272097022732444e-06, + "loss": 0.6818, + "step": 5900 + }, + { + "epoch": 0.78, + "grad_norm": 2.787783622741699, + "learning_rate": 9.268149299551095e-06, + "loss": 0.6856, + "step": 5910 + }, + { + "epoch": 0.78, + "grad_norm": 3.5154056549072266, + "learning_rate": 9.264191745349882e-06, + "loss": 0.6682, + "step": 5920 + }, + { + "epoch": 0.78, + "grad_norm": 3.194385528564453, + "learning_rate": 9.260224369244414e-06, + "loss": 0.6659, + "step": 5930 + }, + { + "epoch": 0.79, + "grad_norm": 2.8617637157440186, + "learning_rate": 9.256247180372927e-06, + "loss": 0.6855, + "step": 5940 + }, + { + "epoch": 0.79, + "grad_norm": 2.8789777755737305, + "learning_rate": 9.252260187896257e-06, + "loss": 0.6829, + "step": 5950 + }, + { + "epoch": 0.79, + "grad_norm": 2.897092580795288, + "learning_rate": 9.248263400997826e-06, + "loss": 0.6744, + "step": 5960 + }, + { + "epoch": 0.79, + "grad_norm": 2.86773943901062, + "learning_rate": 9.244256828883611e-06, + "loss": 0.6867, + "step": 5970 + }, + { + "epoch": 0.79, + "grad_norm": 2.738723039627075, + "learning_rate": 9.24024048078213e-06, + "loss": 0.6718, + "step": 5980 + }, + { + "epoch": 0.79, + "grad_norm": 2.6888554096221924, + "learning_rate": 9.236214365944418e-06, + "loss": 0.6711, + "step": 5990 + }, + { + "epoch": 0.79, + "grad_norm": 3.0452592372894287, + "learning_rate": 9.232178493644006e-06, + "loss": 0.6816, + "step": 6000 + }, + { + "epoch": 0.8, + "grad_norm": 2.9094676971435547, + "learning_rate": 9.228132873176899e-06, + "loss": 0.6817, + "step": 6010 + }, + { + "epoch": 0.8, + "grad_norm": 2.685194492340088, + "learning_rate": 9.224077513861556e-06, + "loss": 0.6684, + "step": 6020 + }, + { + "epoch": 0.8, + "grad_norm": 2.4088940620422363, + "learning_rate": 9.22001242503887e-06, + "loss": 0.6707, + "step": 6030 + }, + { + "epoch": 0.8, + "grad_norm": 2.5745279788970947, + "learning_rate": 9.21593761607214e-06, + "loss": 0.6866, + "step": 6040 + }, + { + "epoch": 0.8, + "grad_norm": 2.8593175411224365, + "learning_rate": 9.211853096347059e-06, + "loss": 0.6713, + "step": 6050 + }, + { + "epoch": 0.8, + "grad_norm": 2.8683552742004395, + "learning_rate": 9.207758875271683e-06, + "loss": 0.6566, + "step": 6060 + }, + { + "epoch": 0.8, + "grad_norm": 2.874685287475586, + "learning_rate": 9.203654962276415e-06, + "loss": 0.6791, + "step": 6070 + }, + { + "epoch": 0.8, + "grad_norm": 2.6957099437713623, + "learning_rate": 9.199541366813984e-06, + "loss": 0.6688, + "step": 6080 + }, + { + "epoch": 0.81, + "grad_norm": 3.0089595317840576, + "learning_rate": 9.195418098359417e-06, + "loss": 0.6708, + "step": 6090 + }, + { + "epoch": 0.81, + "grad_norm": 2.352421760559082, + "learning_rate": 9.191285166410023e-06, + "loss": 0.6637, + "step": 6100 + }, + { + "epoch": 0.81, + "grad_norm": 2.6234710216522217, + "learning_rate": 9.18714258048537e-06, + "loss": 0.6764, + "step": 6110 + }, + { + "epoch": 0.81, + "grad_norm": 3.9939608573913574, + "learning_rate": 9.182990350127265e-06, + "loss": 0.6553, + "step": 6120 + }, + { + "epoch": 0.81, + "grad_norm": 3.0175065994262695, + "learning_rate": 9.178828484899724e-06, + "loss": 0.6709, + "step": 6130 + }, + { + "epoch": 0.81, + "grad_norm": 3.120433807373047, + "learning_rate": 9.174656994388957e-06, + "loss": 0.6739, + "step": 6140 + }, + { + "epoch": 0.81, + "grad_norm": 2.466984748840332, + "learning_rate": 9.170475888203348e-06, + "loss": 0.6652, + "step": 6150 + }, + { + "epoch": 0.82, + "grad_norm": 2.471644639968872, + "learning_rate": 9.166285175973424e-06, + "loss": 0.6822, + "step": 6160 + }, + { + "epoch": 0.82, + "grad_norm": 2.809940814971924, + "learning_rate": 9.16208486735184e-06, + "loss": 0.6806, + "step": 6170 + }, + { + "epoch": 0.82, + "grad_norm": 3.266305446624756, + "learning_rate": 9.157874972013361e-06, + "loss": 0.6742, + "step": 6180 + }, + { + "epoch": 0.82, + "grad_norm": 2.68939208984375, + "learning_rate": 9.153655499654824e-06, + "loss": 0.6778, + "step": 6190 + }, + { + "epoch": 0.82, + "grad_norm": 2.9182255268096924, + "learning_rate": 9.149426459995127e-06, + "loss": 0.6688, + "step": 6200 + }, + { + "epoch": 0.82, + "grad_norm": 2.9277961254119873, + "learning_rate": 9.145187862775208e-06, + "loss": 0.676, + "step": 6210 + }, + { + "epoch": 0.82, + "grad_norm": 2.709533214569092, + "learning_rate": 9.140939717758022e-06, + "loss": 0.6713, + "step": 6220 + }, + { + "epoch": 0.82, + "grad_norm": 3.068077325820923, + "learning_rate": 9.136682034728508e-06, + "loss": 0.6623, + "step": 6230 + }, + { + "epoch": 0.83, + "grad_norm": 2.503319263458252, + "learning_rate": 9.13241482349358e-06, + "loss": 0.686, + "step": 6240 + }, + { + "epoch": 0.83, + "grad_norm": 2.557908773422241, + "learning_rate": 9.128138093882098e-06, + "loss": 0.674, + "step": 6250 + }, + { + "epoch": 0.83, + "grad_norm": 2.8420028686523438, + "learning_rate": 9.123851855744842e-06, + "loss": 0.6606, + "step": 6260 + }, + { + "epoch": 0.83, + "grad_norm": 3.4535250663757324, + "learning_rate": 9.119556118954503e-06, + "loss": 0.6702, + "step": 6270 + }, + { + "epoch": 0.83, + "grad_norm": 2.663339138031006, + "learning_rate": 9.115250893405637e-06, + "loss": 0.6788, + "step": 6280 + }, + { + "epoch": 0.83, + "grad_norm": 2.507500410079956, + "learning_rate": 9.110936189014668e-06, + "loss": 0.6631, + "step": 6290 + }, + { + "epoch": 0.83, + "grad_norm": 2.51786732673645, + "learning_rate": 9.106612015719845e-06, + "loss": 0.6617, + "step": 6300 + }, + { + "epoch": 0.84, + "grad_norm": 3.4296956062316895, + "learning_rate": 9.102278383481235e-06, + "loss": 0.6818, + "step": 6310 + }, + { + "epoch": 0.84, + "grad_norm": 2.9477152824401855, + "learning_rate": 9.097935302280682e-06, + "loss": 0.6797, + "step": 6320 + }, + { + "epoch": 0.84, + "grad_norm": 2.557518243789673, + "learning_rate": 9.093582782121805e-06, + "loss": 0.6741, + "step": 6330 + }, + { + "epoch": 0.84, + "grad_norm": 2.746699571609497, + "learning_rate": 9.089220833029957e-06, + "loss": 0.6732, + "step": 6340 + }, + { + "epoch": 0.84, + "grad_norm": 2.7738733291625977, + "learning_rate": 9.08484946505221e-06, + "loss": 0.672, + "step": 6350 + }, + { + "epoch": 0.84, + "grad_norm": 3.2371561527252197, + "learning_rate": 9.080468688257334e-06, + "loss": 0.6836, + "step": 6360 + }, + { + "epoch": 0.84, + "grad_norm": 2.3297362327575684, + "learning_rate": 9.07607851273577e-06, + "loss": 0.6739, + "step": 6370 + }, + { + "epoch": 0.84, + "grad_norm": 2.604583740234375, + "learning_rate": 9.0716789485996e-06, + "loss": 0.6861, + "step": 6380 + }, + { + "epoch": 0.85, + "grad_norm": 2.323979616165161, + "learning_rate": 9.067270005982545e-06, + "loss": 0.673, + "step": 6390 + }, + { + "epoch": 0.85, + "grad_norm": 3.385627269744873, + "learning_rate": 9.062851695039915e-06, + "loss": 0.6733, + "step": 6400 + }, + { + "epoch": 0.85, + "grad_norm": 3.809943914413452, + "learning_rate": 9.058424025948609e-06, + "loss": 0.6802, + "step": 6410 + }, + { + "epoch": 0.85, + "grad_norm": 3.2964417934417725, + "learning_rate": 9.053987008907071e-06, + "loss": 0.6912, + "step": 6420 + }, + { + "epoch": 0.85, + "grad_norm": 3.160759210586548, + "learning_rate": 9.049540654135285e-06, + "loss": 0.6672, + "step": 6430 + }, + { + "epoch": 0.85, + "grad_norm": 3.6694741249084473, + "learning_rate": 9.045084971874738e-06, + "loss": 0.6843, + "step": 6440 + }, + { + "epoch": 0.85, + "grad_norm": 3.6869866847991943, + "learning_rate": 9.040619972388402e-06, + "loss": 0.671, + "step": 6450 + }, + { + "epoch": 0.85, + "grad_norm": 2.74727463722229, + "learning_rate": 9.036145665960715e-06, + "loss": 0.6783, + "step": 6460 + }, + { + "epoch": 0.86, + "grad_norm": 2.88826322555542, + "learning_rate": 9.03166206289754e-06, + "loss": 0.6547, + "step": 6470 + }, + { + "epoch": 0.86, + "grad_norm": 2.761207103729248, + "learning_rate": 9.02716917352617e-06, + "loss": 0.6739, + "step": 6480 + }, + { + "epoch": 0.86, + "grad_norm": 2.660529375076294, + "learning_rate": 9.022667008195273e-06, + "loss": 0.6595, + "step": 6490 + }, + { + "epoch": 0.86, + "grad_norm": 2.8409204483032227, + "learning_rate": 9.018155577274891e-06, + "loss": 0.6881, + "step": 6500 + }, + { + "epoch": 0.86, + "grad_norm": 2.212315320968628, + "learning_rate": 9.013634891156404e-06, + "loss": 0.6872, + "step": 6510 + }, + { + "epoch": 0.86, + "grad_norm": 3.3113622665405273, + "learning_rate": 9.009104960252513e-06, + "loss": 0.6761, + "step": 6520 + }, + { + "epoch": 0.86, + "grad_norm": 2.797126293182373, + "learning_rate": 9.004565794997209e-06, + "loss": 0.6741, + "step": 6530 + }, + { + "epoch": 0.87, + "grad_norm": 2.9894332885742188, + "learning_rate": 9.000017405845755e-06, + "loss": 0.6835, + "step": 6540 + }, + { + "epoch": 0.87, + "grad_norm": 2.9207139015197754, + "learning_rate": 8.995459803274664e-06, + "loss": 0.674, + "step": 6550 + }, + { + "epoch": 0.87, + "grad_norm": 2.9169623851776123, + "learning_rate": 8.990892997781661e-06, + "loss": 0.6419, + "step": 6560 + }, + { + "epoch": 0.87, + "grad_norm": 2.9211723804473877, + "learning_rate": 8.986316999885678e-06, + "loss": 0.6581, + "step": 6570 + }, + { + "epoch": 0.87, + "grad_norm": 2.616330623626709, + "learning_rate": 8.981731820126816e-06, + "loss": 0.6741, + "step": 6580 + }, + { + "epoch": 0.87, + "grad_norm": 2.5580813884735107, + "learning_rate": 8.977137469066321e-06, + "loss": 0.6741, + "step": 6590 + }, + { + "epoch": 0.87, + "grad_norm": 2.785909414291382, + "learning_rate": 8.972533957286574e-06, + "loss": 0.6784, + "step": 6600 + }, + { + "epoch": 0.87, + "grad_norm": 3.340866804122925, + "learning_rate": 8.967921295391046e-06, + "loss": 0.6687, + "step": 6610 + }, + { + "epoch": 0.88, + "grad_norm": 2.7479071617126465, + "learning_rate": 8.963299494004292e-06, + "loss": 0.6738, + "step": 6620 + }, + { + "epoch": 0.88, + "grad_norm": 2.5367236137390137, + "learning_rate": 8.958668563771911e-06, + "loss": 0.6776, + "step": 6630 + }, + { + "epoch": 0.88, + "grad_norm": 2.654205322265625, + "learning_rate": 8.954028515360535e-06, + "loss": 0.6664, + "step": 6640 + }, + { + "epoch": 0.88, + "grad_norm": 2.851851463317871, + "learning_rate": 8.949379359457795e-06, + "loss": 0.6765, + "step": 6650 + }, + { + "epoch": 0.88, + "grad_norm": 2.946833848953247, + "learning_rate": 8.944721106772298e-06, + "loss": 0.6642, + "step": 6660 + }, + { + "epoch": 0.88, + "grad_norm": 3.0130770206451416, + "learning_rate": 8.94005376803361e-06, + "loss": 0.6775, + "step": 6670 + }, + { + "epoch": 0.88, + "grad_norm": 3.278046131134033, + "learning_rate": 8.935377353992222e-06, + "loss": 0.6853, + "step": 6680 + }, + { + "epoch": 0.89, + "grad_norm": 2.2684073448181152, + "learning_rate": 8.930691875419525e-06, + "loss": 0.6704, + "step": 6690 + }, + { + "epoch": 0.89, + "grad_norm": 3.093383550643921, + "learning_rate": 8.925997343107796e-06, + "loss": 0.6665, + "step": 6700 + }, + { + "epoch": 0.89, + "grad_norm": 2.568152666091919, + "learning_rate": 8.921293767870157e-06, + "loss": 0.6643, + "step": 6710 + }, + { + "epoch": 0.89, + "grad_norm": 3.2096123695373535, + "learning_rate": 8.91658116054057e-06, + "loss": 0.679, + "step": 6720 + }, + { + "epoch": 0.89, + "grad_norm": 2.912968397140503, + "learning_rate": 8.91185953197379e-06, + "loss": 0.6856, + "step": 6730 + }, + { + "epoch": 0.89, + "grad_norm": 2.980971097946167, + "learning_rate": 8.907128893045359e-06, + "loss": 0.6672, + "step": 6740 + }, + { + "epoch": 0.89, + "grad_norm": 3.0038115978240967, + "learning_rate": 8.902389254651568e-06, + "loss": 0.6738, + "step": 6750 + }, + { + "epoch": 0.89, + "grad_norm": 3.099684000015259, + "learning_rate": 8.897640627709441e-06, + "loss": 0.6924, + "step": 6760 + }, + { + "epoch": 0.9, + "grad_norm": 2.444758892059326, + "learning_rate": 8.892883023156703e-06, + "loss": 0.6689, + "step": 6770 + }, + { + "epoch": 0.9, + "grad_norm": 2.7014451026916504, + "learning_rate": 8.888116451951755e-06, + "loss": 0.683, + "step": 6780 + }, + { + "epoch": 0.9, + "grad_norm": 3.183800458908081, + "learning_rate": 8.88334092507366e-06, + "loss": 0.6799, + "step": 6790 + }, + { + "epoch": 0.9, + "grad_norm": 2.8283169269561768, + "learning_rate": 8.8785564535221e-06, + "loss": 0.6546, + "step": 6800 + }, + { + "epoch": 0.9, + "grad_norm": 2.629321575164795, + "learning_rate": 8.873763048317363e-06, + "loss": 0.6774, + "step": 6810 + }, + { + "epoch": 0.9, + "grad_norm": 2.320882558822632, + "learning_rate": 8.868960720500314e-06, + "loss": 0.6646, + "step": 6820 + }, + { + "epoch": 0.9, + "grad_norm": 3.840276002883911, + "learning_rate": 8.86414948113237e-06, + "loss": 0.6768, + "step": 6830 + }, + { + "epoch": 0.91, + "grad_norm": 2.9596545696258545, + "learning_rate": 8.85932934129548e-06, + "loss": 0.6873, + "step": 6840 + }, + { + "epoch": 0.91, + "grad_norm": 2.690166473388672, + "learning_rate": 8.854500312092081e-06, + "loss": 0.6867, + "step": 6850 + }, + { + "epoch": 0.91, + "grad_norm": 3.0237040519714355, + "learning_rate": 8.849662404645097e-06, + "loss": 0.664, + "step": 6860 + }, + { + "epoch": 0.91, + "grad_norm": 3.3142759799957275, + "learning_rate": 8.844815630097896e-06, + "loss": 0.6776, + "step": 6870 + }, + { + "epoch": 0.91, + "grad_norm": 2.4475510120391846, + "learning_rate": 8.839959999614272e-06, + "loss": 0.6755, + "step": 6880 + }, + { + "epoch": 0.91, + "grad_norm": 2.689811944961548, + "learning_rate": 8.835095524378413e-06, + "loss": 0.677, + "step": 6890 + }, + { + "epoch": 0.91, + "grad_norm": 2.6768815517425537, + "learning_rate": 8.83022221559489e-06, + "loss": 0.6867, + "step": 6900 + }, + { + "epoch": 0.91, + "eval_loss": 0.7290458083152771, + "eval_runtime": 197.9624, + "eval_samples_per_second": 55.566, + "eval_steps_per_second": 6.946, + "step": 6900 + }, + { + "epoch": 0.91, + "grad_norm": 2.7535643577575684, + "learning_rate": 8.82534008448861e-06, + "loss": 0.6751, + "step": 6910 + }, + { + "epoch": 0.92, + "grad_norm": 2.4862465858459473, + "learning_rate": 8.820449142304805e-06, + "loss": 0.6745, + "step": 6920 + }, + { + "epoch": 0.92, + "grad_norm": 2.895951747894287, + "learning_rate": 8.815549400309002e-06, + "loss": 0.6701, + "step": 6930 + }, + { + "epoch": 0.92, + "grad_norm": 2.7666430473327637, + "learning_rate": 8.810640869786994e-06, + "loss": 0.6522, + "step": 6940 + }, + { + "epoch": 0.92, + "grad_norm": 3.116729259490967, + "learning_rate": 8.805723562044825e-06, + "loss": 0.6613, + "step": 6950 + }, + { + "epoch": 0.92, + "grad_norm": 2.8136725425720215, + "learning_rate": 8.800797488408746e-06, + "loss": 0.6739, + "step": 6960 + }, + { + "epoch": 0.92, + "grad_norm": 2.925579309463501, + "learning_rate": 8.795862660225205e-06, + "loss": 0.655, + "step": 6970 + }, + { + "epoch": 0.92, + "grad_norm": 2.7187774181365967, + "learning_rate": 8.790919088860815e-06, + "loss": 0.662, + "step": 6980 + }, + { + "epoch": 0.93, + "grad_norm": 3.002408742904663, + "learning_rate": 8.785966785702323e-06, + "loss": 0.6677, + "step": 6990 + }, + { + "epoch": 0.93, + "grad_norm": 2.7283411026000977, + "learning_rate": 8.781005762156593e-06, + "loss": 0.6775, + "step": 7000 + }, + { + "epoch": 0.93, + "grad_norm": 2.955579996109009, + "learning_rate": 8.776036029650573e-06, + "loss": 0.6777, + "step": 7010 + }, + { + "epoch": 0.93, + "grad_norm": 2.8140358924865723, + "learning_rate": 8.77105759963127e-06, + "loss": 0.6734, + "step": 7020 + }, + { + "epoch": 0.93, + "grad_norm": 2.5485150814056396, + "learning_rate": 8.766070483565726e-06, + "loss": 0.6805, + "step": 7030 + }, + { + "epoch": 0.93, + "grad_norm": 2.6847705841064453, + "learning_rate": 8.76107469294099e-06, + "loss": 0.6564, + "step": 7040 + }, + { + "epoch": 0.93, + "grad_norm": 3.189195156097412, + "learning_rate": 8.756070239264089e-06, + "loss": 0.6794, + "step": 7050 + }, + { + "epoch": 0.93, + "grad_norm": 2.842616319656372, + "learning_rate": 8.75105713406201e-06, + "loss": 0.6784, + "step": 7060 + }, + { + "epoch": 0.94, + "grad_norm": 3.710517168045044, + "learning_rate": 8.746035388881655e-06, + "loss": 0.6786, + "step": 7070 + }, + { + "epoch": 0.94, + "grad_norm": 2.5461089611053467, + "learning_rate": 8.741005015289843e-06, + "loss": 0.6865, + "step": 7080 + }, + { + "epoch": 0.94, + "grad_norm": 3.374018669128418, + "learning_rate": 8.735966024873257e-06, + "loss": 0.6646, + "step": 7090 + }, + { + "epoch": 0.94, + "grad_norm": 3.168987989425659, + "learning_rate": 8.730918429238429e-06, + "loss": 0.6818, + "step": 7100 + }, + { + "epoch": 0.94, + "grad_norm": 3.4316864013671875, + "learning_rate": 8.72586224001171e-06, + "loss": 0.6523, + "step": 7110 + }, + { + "epoch": 0.94, + "grad_norm": 3.518183946609497, + "learning_rate": 8.720797468839255e-06, + "loss": 0.6692, + "step": 7120 + }, + { + "epoch": 0.94, + "grad_norm": 3.6533937454223633, + "learning_rate": 8.715724127386971e-06, + "loss": 0.6729, + "step": 7130 + }, + { + "epoch": 0.94, + "grad_norm": 2.501375913619995, + "learning_rate": 8.710642227340518e-06, + "loss": 0.6692, + "step": 7140 + }, + { + "epoch": 0.95, + "grad_norm": 3.3970723152160645, + "learning_rate": 8.705551780405264e-06, + "loss": 0.6726, + "step": 7150 + }, + { + "epoch": 0.95, + "grad_norm": 2.5831968784332275, + "learning_rate": 8.70045279830626e-06, + "loss": 0.684, + "step": 7160 + }, + { + "epoch": 0.95, + "grad_norm": 2.8925693035125732, + "learning_rate": 8.695345292788223e-06, + "loss": 0.6587, + "step": 7170 + }, + { + "epoch": 0.95, + "grad_norm": 3.127180576324463, + "learning_rate": 8.690229275615503e-06, + "loss": 0.67, + "step": 7180 + }, + { + "epoch": 0.95, + "grad_norm": 2.4886677265167236, + "learning_rate": 8.685104758572047e-06, + "loss": 0.6666, + "step": 7190 + }, + { + "epoch": 0.95, + "grad_norm": 4.1692070960998535, + "learning_rate": 8.679971753461388e-06, + "loss": 0.6668, + "step": 7200 + }, + { + "epoch": 0.95, + "grad_norm": 2.864614486694336, + "learning_rate": 8.674830272106604e-06, + "loss": 0.6658, + "step": 7210 + }, + { + "epoch": 0.96, + "grad_norm": 3.166633129119873, + "learning_rate": 8.669680326350303e-06, + "loss": 0.6726, + "step": 7220 + }, + { + "epoch": 0.96, + "grad_norm": 2.520665407180786, + "learning_rate": 8.664521928054585e-06, + "loss": 0.6693, + "step": 7230 + }, + { + "epoch": 0.96, + "grad_norm": 3.273986339569092, + "learning_rate": 8.659355089101021e-06, + "loss": 0.6762, + "step": 7240 + }, + { + "epoch": 0.96, + "grad_norm": 2.5142078399658203, + "learning_rate": 8.65417982139062e-06, + "loss": 0.6659, + "step": 7250 + }, + { + "epoch": 0.96, + "grad_norm": 3.154919385910034, + "learning_rate": 8.648996136843814e-06, + "loss": 0.671, + "step": 7260 + }, + { + "epoch": 0.96, + "grad_norm": 2.0966339111328125, + "learning_rate": 8.643804047400412e-06, + "loss": 0.6641, + "step": 7270 + }, + { + "epoch": 0.96, + "grad_norm": 3.209606885910034, + "learning_rate": 8.638603565019588e-06, + "loss": 0.6677, + "step": 7280 + }, + { + "epoch": 0.96, + "grad_norm": 2.470289468765259, + "learning_rate": 8.633394701679847e-06, + "loss": 0.6628, + "step": 7290 + }, + { + "epoch": 0.97, + "grad_norm": 2.4616212844848633, + "learning_rate": 8.628177469378995e-06, + "loss": 0.6772, + "step": 7300 + }, + { + "epoch": 0.97, + "grad_norm": 3.5428974628448486, + "learning_rate": 8.622951880134122e-06, + "loss": 0.6737, + "step": 7310 + }, + { + "epoch": 0.97, + "grad_norm": 3.179137945175171, + "learning_rate": 8.617717945981558e-06, + "loss": 0.6855, + "step": 7320 + }, + { + "epoch": 0.97, + "grad_norm": 3.3000833988189697, + "learning_rate": 8.612475678976861e-06, + "loss": 0.6805, + "step": 7330 + }, + { + "epoch": 0.97, + "grad_norm": 2.7179107666015625, + "learning_rate": 8.60722509119478e-06, + "loss": 0.6742, + "step": 7340 + }, + { + "epoch": 0.97, + "grad_norm": 2.58263897895813, + "learning_rate": 8.601966194729228e-06, + "loss": 0.6746, + "step": 7350 + }, + { + "epoch": 0.97, + "grad_norm": 2.772865056991577, + "learning_rate": 8.596699001693257e-06, + "loss": 0.6624, + "step": 7360 + }, + { + "epoch": 0.98, + "grad_norm": 2.8705618381500244, + "learning_rate": 8.59142352421903e-06, + "loss": 0.6768, + "step": 7370 + }, + { + "epoch": 0.98, + "grad_norm": 2.4928457736968994, + "learning_rate": 8.586139774457791e-06, + "loss": 0.6582, + "step": 7380 + }, + { + "epoch": 0.98, + "grad_norm": 2.9825477600097656, + "learning_rate": 8.58084776457984e-06, + "loss": 0.686, + "step": 7390 + }, + { + "epoch": 0.98, + "grad_norm": 2.839447021484375, + "learning_rate": 8.575547506774498e-06, + "loss": 0.6847, + "step": 7400 + }, + { + "epoch": 0.98, + "grad_norm": 4.549822807312012, + "learning_rate": 8.570239013250089e-06, + "loss": 0.6599, + "step": 7410 + }, + { + "epoch": 0.98, + "grad_norm": 2.622835874557495, + "learning_rate": 8.5649222962339e-06, + "loss": 0.6447, + "step": 7420 + }, + { + "epoch": 0.98, + "grad_norm": 2.558931350708008, + "learning_rate": 8.559597367972168e-06, + "loss": 0.6653, + "step": 7430 + }, + { + "epoch": 0.98, + "grad_norm": 2.8210113048553467, + "learning_rate": 8.554264240730042e-06, + "loss": 0.671, + "step": 7440 + }, + { + "epoch": 0.99, + "grad_norm": 2.7722108364105225, + "learning_rate": 8.548922926791545e-06, + "loss": 0.6872, + "step": 7450 + }, + { + "epoch": 0.99, + "grad_norm": 2.9724175930023193, + "learning_rate": 8.543573438459573e-06, + "loss": 0.6752, + "step": 7460 + }, + { + "epoch": 0.99, + "grad_norm": 2.862396717071533, + "learning_rate": 8.538215788055839e-06, + "loss": 0.6667, + "step": 7470 + }, + { + "epoch": 0.99, + "grad_norm": 3.055391788482666, + "learning_rate": 8.532849987920859e-06, + "loss": 0.6695, + "step": 7480 + }, + { + "epoch": 0.99, + "grad_norm": 2.379634141921997, + "learning_rate": 8.527476050413922e-06, + "loss": 0.6535, + "step": 7490 + }, + { + "epoch": 0.99, + "grad_norm": 3.219191551208496, + "learning_rate": 8.522093987913063e-06, + "loss": 0.6617, + "step": 7500 + }, + { + "epoch": 0.99, + "grad_norm": 3.1809182167053223, + "learning_rate": 8.516703812815024e-06, + "loss": 0.6672, + "step": 7510 + }, + { + "epoch": 1.0, + "grad_norm": 2.790281057357788, + "learning_rate": 8.511305537535238e-06, + "loss": 0.6722, + "step": 7520 + }, + { + "epoch": 1.0, + "grad_norm": 3.374549388885498, + "learning_rate": 8.505899174507793e-06, + "loss": 0.6622, + "step": 7530 + }, + { + "epoch": 1.0, + "grad_norm": 2.377967119216919, + "learning_rate": 8.500484736185412e-06, + "loss": 0.6531, + "step": 7540 + }, + { + "epoch": 1.0, + "grad_norm": 2.5226247310638428, + "learning_rate": 8.49506223503941e-06, + "loss": 0.6491, + "step": 7550 + } + ], + "logging_steps": 10, + "max_steps": 23000, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1, + "total_flos": 5.621498429758977e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}