{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.730659025787966, "eval_steps": 10, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 31.5, "learning_rate": 8.163265306122449e-07, "loss": 2.1773, "step": 10 }, { "epoch": 0.03, "eval_loss": 2.17791485786438, "eval_runtime": 1.0288, "eval_samples_per_second": 151.63, "eval_steps_per_second": 7.776, "step": 10 }, { "epoch": 0.06, "grad_norm": 28.0, "learning_rate": 1.6326530612244897e-06, "loss": 2.1852, "step": 20 }, { "epoch": 0.06, "eval_loss": 2.0857961177825928, "eval_runtime": 1.027, "eval_samples_per_second": 151.901, "eval_steps_per_second": 7.79, "step": 20 }, { "epoch": 0.09, "grad_norm": 34.75, "learning_rate": 2.4489795918367347e-06, "loss": 1.9915, "step": 30 }, { "epoch": 0.09, "eval_loss": 1.8705158233642578, "eval_runtime": 1.027, "eval_samples_per_second": 151.892, "eval_steps_per_second": 7.789, "step": 30 }, { "epoch": 0.11, "grad_norm": 11.9375, "learning_rate": 3.2653061224489794e-06, "loss": 1.7352, "step": 40 }, { "epoch": 0.11, "eval_loss": 1.5546011924743652, "eval_runtime": 1.0295, "eval_samples_per_second": 151.524, "eval_steps_per_second": 7.77, "step": 40 }, { "epoch": 0.14, "grad_norm": 13.25, "learning_rate": 4.081632653061225e-06, "loss": 1.318, "step": 50 }, { "epoch": 0.14, "eval_loss": 1.0024280548095703, "eval_runtime": 1.0273, "eval_samples_per_second": 151.848, "eval_steps_per_second": 7.787, "step": 50 }, { "epoch": 0.17, "grad_norm": 11.625, "learning_rate": 4.897959183673469e-06, "loss": 0.9294, "step": 60 }, { "epoch": 0.17, "eval_loss": 0.8546950221061707, "eval_runtime": 1.0283, "eval_samples_per_second": 151.702, "eval_steps_per_second": 7.78, "step": 60 }, { "epoch": 0.2, "grad_norm": 10.75, "learning_rate": 5.7142857142857145e-06, "loss": 0.8652, "step": 70 }, { "epoch": 0.2, "eval_loss": 0.8076146245002747, "eval_runtime": 1.0288, "eval_samples_per_second": 151.638, "eval_steps_per_second": 7.776, "step": 70 }, { "epoch": 0.23, "grad_norm": 6.625, "learning_rate": 6.530612244897959e-06, "loss": 0.7794, "step": 80 }, { "epoch": 0.23, "eval_loss": 0.770220160484314, "eval_runtime": 1.0308, "eval_samples_per_second": 151.338, "eval_steps_per_second": 7.761, "step": 80 }, { "epoch": 0.26, "grad_norm": 5.6875, "learning_rate": 7.346938775510205e-06, "loss": 0.8108, "step": 90 }, { "epoch": 0.26, "eval_loss": 0.7469730973243713, "eval_runtime": 1.0319, "eval_samples_per_second": 151.174, "eval_steps_per_second": 7.752, "step": 90 }, { "epoch": 0.29, "grad_norm": 8.125, "learning_rate": 8.16326530612245e-06, "loss": 0.7018, "step": 100 }, { "epoch": 0.29, "eval_loss": 0.7343297600746155, "eval_runtime": 1.0334, "eval_samples_per_second": 150.952, "eval_steps_per_second": 7.741, "step": 100 }, { "epoch": 0.32, "grad_norm": 6.46875, "learning_rate": 8.979591836734695e-06, "loss": 0.7466, "step": 110 }, { "epoch": 0.32, "eval_loss": 0.7234026789665222, "eval_runtime": 1.03, "eval_samples_per_second": 151.46, "eval_steps_per_second": 7.767, "step": 110 }, { "epoch": 0.34, "grad_norm": 3.984375, "learning_rate": 9.795918367346939e-06, "loss": 0.7353, "step": 120 }, { "epoch": 0.34, "eval_loss": 0.7116101384162903, "eval_runtime": 1.0285, "eval_samples_per_second": 151.673, "eval_steps_per_second": 7.778, "step": 120 }, { "epoch": 0.37, "grad_norm": 3.90625, "learning_rate": 1.0612244897959186e-05, "loss": 0.768, "step": 130 }, { "epoch": 0.37, "eval_loss": 0.702937662601471, "eval_runtime": 1.0286, "eval_samples_per_second": 151.657, "eval_steps_per_second": 7.777, "step": 130 }, { "epoch": 0.4, "grad_norm": 4.125, "learning_rate": 1.1428571428571429e-05, "loss": 0.7104, "step": 140 }, { "epoch": 0.4, "eval_loss": 0.6971867680549622, "eval_runtime": 1.0279, "eval_samples_per_second": 151.763, "eval_steps_per_second": 7.783, "step": 140 }, { "epoch": 0.43, "grad_norm": 7.03125, "learning_rate": 1.2244897959183674e-05, "loss": 0.7251, "step": 150 }, { "epoch": 0.43, "eval_loss": 0.6914936900138855, "eval_runtime": 1.0288, "eval_samples_per_second": 151.633, "eval_steps_per_second": 7.776, "step": 150 }, { "epoch": 0.46, "grad_norm": 8.3125, "learning_rate": 1.3061224489795918e-05, "loss": 0.673, "step": 160 }, { "epoch": 0.46, "eval_loss": 0.6913846135139465, "eval_runtime": 1.0281, "eval_samples_per_second": 151.729, "eval_steps_per_second": 7.781, "step": 160 }, { "epoch": 0.49, "grad_norm": 3.46875, "learning_rate": 1.3877551020408165e-05, "loss": 0.7261, "step": 170 }, { "epoch": 0.49, "eval_loss": 0.6843172311782837, "eval_runtime": 1.0285, "eval_samples_per_second": 151.672, "eval_steps_per_second": 7.778, "step": 170 }, { "epoch": 0.52, "grad_norm": 4.0, "learning_rate": 1.469387755102041e-05, "loss": 0.7126, "step": 180 }, { "epoch": 0.52, "eval_loss": 0.6830589175224304, "eval_runtime": 1.0283, "eval_samples_per_second": 151.702, "eval_steps_per_second": 7.78, "step": 180 }, { "epoch": 0.54, "grad_norm": 4.6875, "learning_rate": 1.5510204081632655e-05, "loss": 0.6471, "step": 190 }, { "epoch": 0.54, "eval_loss": 0.6809999346733093, "eval_runtime": 1.0283, "eval_samples_per_second": 151.706, "eval_steps_per_second": 7.78, "step": 190 }, { "epoch": 0.57, "grad_norm": 13.25, "learning_rate": 1.63265306122449e-05, "loss": 0.7056, "step": 200 }, { "epoch": 0.57, "eval_loss": 0.6814644932746887, "eval_runtime": 1.0279, "eval_samples_per_second": 151.767, "eval_steps_per_second": 7.783, "step": 200 }, { "epoch": 0.6, "grad_norm": 9.1875, "learning_rate": 1.7142857142857142e-05, "loss": 0.7437, "step": 210 }, { "epoch": 0.6, "eval_loss": 0.6767860651016235, "eval_runtime": 1.0287, "eval_samples_per_second": 151.643, "eval_steps_per_second": 7.777, "step": 210 }, { "epoch": 0.63, "grad_norm": 13.6875, "learning_rate": 1.795918367346939e-05, "loss": 0.698, "step": 220 }, { "epoch": 0.63, "eval_loss": 0.6760666966438293, "eval_runtime": 1.0302, "eval_samples_per_second": 151.422, "eval_steps_per_second": 7.765, "step": 220 }, { "epoch": 0.66, "grad_norm": 7.0625, "learning_rate": 1.8775510204081636e-05, "loss": 0.6675, "step": 230 }, { "epoch": 0.66, "eval_loss": 0.6715320944786072, "eval_runtime": 1.0272, "eval_samples_per_second": 151.869, "eval_steps_per_second": 7.788, "step": 230 }, { "epoch": 0.69, "grad_norm": 9.125, "learning_rate": 1.9591836734693877e-05, "loss": 0.7056, "step": 240 }, { "epoch": 0.69, "eval_loss": 0.6754013299942017, "eval_runtime": 1.0283, "eval_samples_per_second": 151.704, "eval_steps_per_second": 7.78, "step": 240 }, { "epoch": 0.72, "grad_norm": 10.125, "learning_rate": 1.9999744640197457e-05, "loss": 0.6884, "step": 250 }, { "epoch": 0.72, "eval_loss": 0.672713577747345, "eval_runtime": 1.0287, "eval_samples_per_second": 151.648, "eval_steps_per_second": 7.777, "step": 250 }, { "epoch": 0.74, "grad_norm": 7.3125, "learning_rate": 1.999770184002678e-05, "loss": 0.6497, "step": 260 }, { "epoch": 0.74, "eval_loss": 0.6706756949424744, "eval_runtime": 1.035, "eval_samples_per_second": 150.73, "eval_steps_per_second": 7.73, "step": 260 }, { "epoch": 0.77, "grad_norm": 9.4375, "learning_rate": 1.999361665699934e-05, "loss": 0.6699, "step": 270 }, { "epoch": 0.77, "eval_loss": 0.6661806106567383, "eval_runtime": 1.0289, "eval_samples_per_second": 151.622, "eval_steps_per_second": 7.775, "step": 270 }, { "epoch": 0.8, "grad_norm": 9.6875, "learning_rate": 1.99874899256577e-05, "loss": 0.6864, "step": 280 }, { "epoch": 0.8, "eval_loss": 0.6693017482757568, "eval_runtime": 1.0292, "eval_samples_per_second": 151.572, "eval_steps_per_second": 7.773, "step": 280 }, { "epoch": 0.83, "grad_norm": 7.1875, "learning_rate": 1.997932289760261e-05, "loss": 0.6304, "step": 290 }, { "epoch": 0.83, "eval_loss": 0.6646809577941895, "eval_runtime": 1.0284, "eval_samples_per_second": 151.69, "eval_steps_per_second": 7.779, "step": 290 }, { "epoch": 0.86, "grad_norm": 2.796875, "learning_rate": 1.9969117241237302e-05, "loss": 0.6425, "step": 300 }, { "epoch": 0.86, "eval_loss": 0.6653949022293091, "eval_runtime": 1.0295, "eval_samples_per_second": 151.53, "eval_steps_per_second": 7.771, "step": 300 }, { "epoch": 0.89, "grad_norm": 3.421875, "learning_rate": 1.995687504142667e-05, "loss": 0.7201, "step": 310 }, { "epoch": 0.89, "eval_loss": 0.664738118648529, "eval_runtime": 1.0308, "eval_samples_per_second": 151.342, "eval_steps_per_second": 7.761, "step": 310 }, { "epoch": 0.92, "grad_norm": 3.0625, "learning_rate": 1.994259879907137e-05, "loss": 0.6947, "step": 320 }, { "epoch": 0.92, "eval_loss": 0.6645806431770325, "eval_runtime": 1.0295, "eval_samples_per_second": 151.523, "eval_steps_per_second": 7.77, "step": 320 }, { "epoch": 0.95, "grad_norm": 3.4375, "learning_rate": 1.99262914305969e-05, "loss": 0.6722, "step": 330 }, { "epoch": 0.95, "eval_loss": 0.6644319295883179, "eval_runtime": 1.0282, "eval_samples_per_second": 151.722, "eval_steps_per_second": 7.781, "step": 330 }, { "epoch": 0.97, "grad_norm": 3.59375, "learning_rate": 1.990795626735784e-05, "loss": 0.616, "step": 340 }, { "epoch": 0.97, "eval_loss": 0.668482780456543, "eval_runtime": 1.029, "eval_samples_per_second": 151.601, "eval_steps_per_second": 7.774, "step": 340 }, { "epoch": 1.0, "grad_norm": 3.078125, "learning_rate": 1.9887597054957304e-05, "loss": 0.6448, "step": 350 }, { "epoch": 1.0, "eval_loss": 0.6687355041503906, "eval_runtime": 1.0294, "eval_samples_per_second": 151.549, "eval_steps_per_second": 7.772, "step": 350 }, { "epoch": 1.03, "grad_norm": 4.25, "learning_rate": 1.986521795248175e-05, "loss": 0.5472, "step": 360 }, { "epoch": 1.03, "eval_loss": 0.674789547920227, "eval_runtime": 1.03, "eval_samples_per_second": 151.454, "eval_steps_per_second": 7.767, "step": 360 }, { "epoch": 1.06, "grad_norm": 5.375, "learning_rate": 1.9840823531651357e-05, "loss": 0.5425, "step": 370 }, { "epoch": 1.06, "eval_loss": 0.6699011325836182, "eval_runtime": 1.0281, "eval_samples_per_second": 151.729, "eval_steps_per_second": 7.781, "step": 370 }, { "epoch": 1.09, "grad_norm": 6.9375, "learning_rate": 1.9814418775886083e-05, "loss": 0.5447, "step": 380 }, { "epoch": 1.09, "eval_loss": 0.6747943162918091, "eval_runtime": 1.0283, "eval_samples_per_second": 151.708, "eval_steps_per_second": 7.78, "step": 380 }, { "epoch": 1.12, "grad_norm": 4.28125, "learning_rate": 1.978600907928764e-05, "loss": 0.5412, "step": 390 }, { "epoch": 1.12, "eval_loss": 0.6677992939949036, "eval_runtime": 1.0285, "eval_samples_per_second": 151.683, "eval_steps_per_second": 7.779, "step": 390 }, { "epoch": 1.15, "grad_norm": 4.5625, "learning_rate": 1.9755600245537522e-05, "loss": 0.5712, "step": 400 }, { "epoch": 1.15, "eval_loss": 0.6665674448013306, "eval_runtime": 1.029, "eval_samples_per_second": 151.602, "eval_steps_per_second": 7.774, "step": 400 }, { "epoch": 1.17, "grad_norm": 3.828125, "learning_rate": 1.972319848671145e-05, "loss": 0.5601, "step": 410 }, { "epoch": 1.17, "eval_loss": 0.6693459749221802, "eval_runtime": 1.0316, "eval_samples_per_second": 151.221, "eval_steps_per_second": 7.755, "step": 410 }, { "epoch": 1.2, "grad_norm": 2.515625, "learning_rate": 1.968881042201029e-05, "loss": 0.5176, "step": 420 }, { "epoch": 1.2, "eval_loss": 0.6797675490379333, "eval_runtime": 1.0313, "eval_samples_per_second": 151.268, "eval_steps_per_second": 7.757, "step": 420 }, { "epoch": 1.23, "grad_norm": 3.140625, "learning_rate": 1.9652443076407884e-05, "loss": 0.5072, "step": 430 }, { "epoch": 1.23, "eval_loss": 0.6783838272094727, "eval_runtime": 1.0297, "eval_samples_per_second": 151.493, "eval_steps_per_second": 7.769, "step": 430 }, { "epoch": 1.26, "grad_norm": 3.375, "learning_rate": 1.9614103879215917e-05, "loss": 0.5552, "step": 440 }, { "epoch": 1.26, "eval_loss": 0.6780270338058472, "eval_runtime": 1.0303, "eval_samples_per_second": 151.416, "eval_steps_per_second": 7.765, "step": 440 }, { "epoch": 1.29, "grad_norm": 6.75, "learning_rate": 1.9573800662566257e-05, "loss": 0.5193, "step": 450 }, { "epoch": 1.29, "eval_loss": 0.6783522367477417, "eval_runtime": 1.0281, "eval_samples_per_second": 151.739, "eval_steps_per_second": 7.781, "step": 450 }, { "epoch": 1.32, "grad_norm": 6.0, "learning_rate": 1.9531541659810927e-05, "loss": 0.5513, "step": 460 }, { "epoch": 1.32, "eval_loss": 0.6770094037055969, "eval_runtime": 1.0314, "eval_samples_per_second": 151.253, "eval_steps_per_second": 7.757, "step": 460 }, { "epoch": 1.35, "grad_norm": 9.625, "learning_rate": 1.9487335503840186e-05, "loss": 0.5697, "step": 470 }, { "epoch": 1.35, "eval_loss": 0.6728355288505554, "eval_runtime": 1.0282, "eval_samples_per_second": 151.728, "eval_steps_per_second": 7.781, "step": 470 }, { "epoch": 1.38, "grad_norm": 3.40625, "learning_rate": 1.9441191225318934e-05, "loss": 0.5153, "step": 480 }, { "epoch": 1.38, "eval_loss": 0.6756737232208252, "eval_runtime": 1.0283, "eval_samples_per_second": 151.706, "eval_steps_per_second": 7.78, "step": 480 }, { "epoch": 1.4, "grad_norm": 3.203125, "learning_rate": 1.9393118250841897e-05, "loss": 0.5808, "step": 490 }, { "epoch": 1.4, "eval_loss": 0.6708477139472961, "eval_runtime": 1.0277, "eval_samples_per_second": 151.79, "eval_steps_per_second": 7.784, "step": 490 }, { "epoch": 1.43, "grad_norm": 4.0625, "learning_rate": 1.9343126401007893e-05, "loss": 0.5525, "step": 500 }, { "epoch": 1.43, "eval_loss": 0.6698306202888489, "eval_runtime": 1.0275, "eval_samples_per_second": 151.828, "eval_steps_per_second": 7.786, "step": 500 }, { "epoch": 1.46, "grad_norm": 5.84375, "learning_rate": 1.9291225888413652e-05, "loss": 0.5412, "step": 510 }, { "epoch": 1.46, "eval_loss": 0.6722325086593628, "eval_runtime": 1.0528, "eval_samples_per_second": 148.179, "eval_steps_per_second": 7.599, "step": 510 }, { "epoch": 1.49, "grad_norm": 3.65625, "learning_rate": 1.923742731556752e-05, "loss": 0.5236, "step": 520 }, { "epoch": 1.49, "eval_loss": 0.6703674793243408, "eval_runtime": 1.0436, "eval_samples_per_second": 149.477, "eval_steps_per_second": 7.666, "step": 520 }, { "epoch": 1.52, "grad_norm": 3.59375, "learning_rate": 1.9181741672723516e-05, "loss": 0.5794, "step": 530 }, { "epoch": 1.52, "eval_loss": 0.6666515469551086, "eval_runtime": 1.0472, "eval_samples_per_second": 148.969, "eval_steps_per_second": 7.639, "step": 530 }, { "epoch": 1.55, "grad_norm": 4.21875, "learning_rate": 1.9124180335636213e-05, "loss": 0.4933, "step": 540 }, { "epoch": 1.55, "eval_loss": 0.6719895601272583, "eval_runtime": 1.0512, "eval_samples_per_second": 148.395, "eval_steps_per_second": 7.61, "step": 540 }, { "epoch": 1.58, "grad_norm": 6.09375, "learning_rate": 1.90647550632368e-05, "loss": 0.5346, "step": 550 }, { "epoch": 1.58, "eval_loss": 0.6691425442695618, "eval_runtime": 1.0539, "eval_samples_per_second": 148.025, "eval_steps_per_second": 7.591, "step": 550 }, { "epoch": 1.6, "grad_norm": 4.6875, "learning_rate": 1.9003477995230942e-05, "loss": 0.5743, "step": 560 }, { "epoch": 1.6, "eval_loss": 0.6666125059127808, "eval_runtime": 1.0572, "eval_samples_per_second": 147.555, "eval_steps_per_second": 7.567, "step": 560 }, { "epoch": 1.63, "grad_norm": 3.25, "learning_rate": 1.894036164961879e-05, "loss": 0.5662, "step": 570 }, { "epoch": 1.63, "eval_loss": 0.66337651014328, "eval_runtime": 1.0492, "eval_samples_per_second": 148.684, "eval_steps_per_second": 7.625, "step": 570 }, { "epoch": 1.66, "grad_norm": 3.359375, "learning_rate": 1.8875418920137764e-05, "loss": 0.5924, "step": 580 }, { "epoch": 1.66, "eval_loss": 0.6664850115776062, "eval_runtime": 1.0522, "eval_samples_per_second": 148.261, "eval_steps_per_second": 7.603, "step": 580 }, { "epoch": 1.69, "grad_norm": 3.640625, "learning_rate": 1.880866307362853e-05, "loss": 0.5186, "step": 590 }, { "epoch": 1.69, "eval_loss": 0.6705337762832642, "eval_runtime": 1.0529, "eval_samples_per_second": 148.163, "eval_steps_per_second": 7.598, "step": 590 }, { "epoch": 1.72, "grad_norm": 4.125, "learning_rate": 1.8740107747324785e-05, "loss": 0.5155, "step": 600 }, { "epoch": 1.72, "eval_loss": 0.6648845076560974, "eval_runtime": 1.0494, "eval_samples_per_second": 148.657, "eval_steps_per_second": 7.623, "step": 600 }, { "epoch": 1.75, "grad_norm": 3.171875, "learning_rate": 1.8669766946067398e-05, "loss": 0.5317, "step": 610 }, { "epoch": 1.75, "eval_loss": 0.6645151972770691, "eval_runtime": 1.029, "eval_samples_per_second": 151.608, "eval_steps_per_second": 7.775, "step": 610 }, { "epoch": 1.78, "grad_norm": 3.390625, "learning_rate": 1.8597655039443384e-05, "loss": 0.5151, "step": 620 }, { "epoch": 1.78, "eval_loss": 0.6744809150695801, "eval_runtime": 1.0296, "eval_samples_per_second": 151.522, "eval_steps_per_second": 7.77, "step": 620 }, { "epoch": 1.81, "grad_norm": 2.78125, "learning_rate": 1.8523786758850436e-05, "loss": 0.5269, "step": 630 }, { "epoch": 1.81, "eval_loss": 0.6621160507202148, "eval_runtime": 1.0281, "eval_samples_per_second": 151.733, "eval_steps_per_second": 7.781, "step": 630 }, { "epoch": 1.83, "grad_norm": 3.15625, "learning_rate": 1.8448177194487524e-05, "loss": 0.5522, "step": 640 }, { "epoch": 1.83, "eval_loss": 0.6631208658218384, "eval_runtime": 1.0283, "eval_samples_per_second": 151.707, "eval_steps_per_second": 7.78, "step": 640 }, { "epoch": 1.86, "grad_norm": 3.03125, "learning_rate": 1.837084179227217e-05, "loss": 0.573, "step": 650 }, { "epoch": 1.86, "eval_loss": 0.6646651029586792, "eval_runtime": 1.0278, "eval_samples_per_second": 151.773, "eval_steps_per_second": 7.783, "step": 650 }, { "epoch": 1.89, "grad_norm": 2.71875, "learning_rate": 1.829179635068509e-05, "loss": 0.5536, "step": 660 }, { "epoch": 1.89, "eval_loss": 0.6614721417427063, "eval_runtime": 1.0275, "eval_samples_per_second": 151.818, "eval_steps_per_second": 7.786, "step": 660 }, { "epoch": 1.92, "grad_norm": 2.640625, "learning_rate": 1.821105701754279e-05, "loss": 0.5298, "step": 670 }, { "epoch": 1.92, "eval_loss": 0.6630841493606567, "eval_runtime": 1.0294, "eval_samples_per_second": 151.541, "eval_steps_per_second": 7.771, "step": 670 }, { "epoch": 1.95, "grad_norm": 2.984375, "learning_rate": 1.812864028669881e-05, "loss": 0.5331, "step": 680 }, { "epoch": 1.95, "eval_loss": 0.6609542369842529, "eval_runtime": 1.0282, "eval_samples_per_second": 151.722, "eval_steps_per_second": 7.781, "step": 680 }, { "epoch": 1.98, "grad_norm": 3.6875, "learning_rate": 1.8044562994674266e-05, "loss": 0.505, "step": 690 }, { "epoch": 1.98, "eval_loss": 0.66065913438797, "eval_runtime": 1.0278, "eval_samples_per_second": 151.78, "eval_steps_per_second": 7.784, "step": 690 }, { "epoch": 2.01, "grad_norm": 2.90625, "learning_rate": 1.7958842317218413e-05, "loss": 0.4993, "step": 700 }, { "epoch": 2.01, "eval_loss": 0.658078134059906, "eval_runtime": 1.0276, "eval_samples_per_second": 151.803, "eval_steps_per_second": 7.785, "step": 700 }, { "epoch": 2.03, "grad_norm": 3.84375, "learning_rate": 1.7871495765799875e-05, "loss": 0.3185, "step": 710 }, { "epoch": 2.03, "eval_loss": 0.7684138417243958, "eval_runtime": 1.0281, "eval_samples_per_second": 151.733, "eval_steps_per_second": 7.781, "step": 710 }, { "epoch": 2.06, "grad_norm": 2.0, "learning_rate": 1.7782541184029316e-05, "loss": 0.3207, "step": 720 }, { "epoch": 2.06, "eval_loss": 0.6996563076972961, "eval_runtime": 1.0282, "eval_samples_per_second": 151.729, "eval_steps_per_second": 7.781, "step": 720 }, { "epoch": 2.09, "grad_norm": 4.21875, "learning_rate": 1.769199674401427e-05, "loss": 0.3208, "step": 730 }, { "epoch": 2.09, "eval_loss": 0.7326269745826721, "eval_runtime": 1.0283, "eval_samples_per_second": 151.714, "eval_steps_per_second": 7.78, "step": 730 }, { "epoch": 2.12, "grad_norm": 3.015625, "learning_rate": 1.759988094264682e-05, "loss": 0.2998, "step": 740 }, { "epoch": 2.12, "eval_loss": 0.7228878736495972, "eval_runtime": 1.0298, "eval_samples_per_second": 151.481, "eval_steps_per_second": 7.768, "step": 740 }, { "epoch": 2.15, "grad_norm": 2.65625, "learning_rate": 1.7506212597824976e-05, "loss": 0.3, "step": 750 }, { "epoch": 2.15, "eval_loss": 0.716642439365387, "eval_runtime": 1.0288, "eval_samples_per_second": 151.63, "eval_steps_per_second": 7.776, "step": 750 }, { "epoch": 2.18, "grad_norm": 3.296875, "learning_rate": 1.7411010844608448e-05, "loss": 0.3212, "step": 760 }, { "epoch": 2.18, "eval_loss": 0.722019374370575, "eval_runtime": 1.0268, "eval_samples_per_second": 151.933, "eval_steps_per_second": 7.791, "step": 760 }, { "epoch": 2.21, "grad_norm": 3.5625, "learning_rate": 1.731429513130964e-05, "loss": 0.3192, "step": 770 }, { "epoch": 2.21, "eval_loss": 0.7228615880012512, "eval_runtime": 1.0284, "eval_samples_per_second": 151.694, "eval_steps_per_second": 7.779, "step": 770 }, { "epoch": 2.23, "grad_norm": 3.359375, "learning_rate": 1.7216085215520644e-05, "loss": 0.298, "step": 780 }, { "epoch": 2.23, "eval_loss": 0.7244638204574585, "eval_runtime": 1.0294, "eval_samples_per_second": 151.546, "eval_steps_per_second": 7.772, "step": 780 }, { "epoch": 2.26, "grad_norm": 4.0, "learning_rate": 1.711640116007706e-05, "loss": 0.3152, "step": 790 }, { "epoch": 2.26, "eval_loss": 0.7284898161888123, "eval_runtime": 1.0282, "eval_samples_per_second": 151.719, "eval_steps_per_second": 7.78, "step": 790 }, { "epoch": 2.29, "grad_norm": 4.28125, "learning_rate": 1.701526332895945e-05, "loss": 0.2995, "step": 800 }, { "epoch": 2.29, "eval_loss": 0.7223809957504272, "eval_runtime": 1.0287, "eval_samples_per_second": 151.648, "eval_steps_per_second": 7.777, "step": 800 }, { "epoch": 2.32, "grad_norm": 3.46875, "learning_rate": 1.6912692383133276e-05, "loss": 0.313, "step": 810 }, { "epoch": 2.32, "eval_loss": 0.7280778288841248, "eval_runtime": 1.0279, "eval_samples_per_second": 151.759, "eval_steps_per_second": 7.783, "step": 810 }, { "epoch": 2.35, "grad_norm": 3.1875, "learning_rate": 1.680870927632818e-05, "loss": 0.2983, "step": 820 }, { "epoch": 2.35, "eval_loss": 0.7330511212348938, "eval_runtime": 1.0274, "eval_samples_per_second": 151.84, "eval_steps_per_second": 7.787, "step": 820 }, { "epoch": 2.38, "grad_norm": 2.515625, "learning_rate": 1.6703335250757428e-05, "loss": 0.3204, "step": 830 }, { "epoch": 2.38, "eval_loss": 0.7196641564369202, "eval_runtime": 1.0283, "eval_samples_per_second": 151.708, "eval_steps_per_second": 7.78, "step": 830 }, { "epoch": 2.41, "grad_norm": 3.40625, "learning_rate": 1.659659183277847e-05, "loss": 0.2942, "step": 840 }, { "epoch": 2.41, "eval_loss": 0.7265558242797852, "eval_runtime": 1.0302, "eval_samples_per_second": 151.425, "eval_steps_per_second": 7.765, "step": 840 }, { "epoch": 2.44, "grad_norm": 4.0, "learning_rate": 1.6488500828495384e-05, "loss": 0.3184, "step": 850 }, { "epoch": 2.44, "eval_loss": 0.7329144477844238, "eval_runtime": 1.0277, "eval_samples_per_second": 151.798, "eval_steps_per_second": 7.784, "step": 850 }, { "epoch": 2.46, "grad_norm": 3.15625, "learning_rate": 1.6379084319304245e-05, "loss": 0.3255, "step": 860 }, { "epoch": 2.46, "eval_loss": 0.729471743106842, "eval_runtime": 1.0288, "eval_samples_per_second": 151.628, "eval_steps_per_second": 7.776, "step": 860 }, { "epoch": 2.49, "grad_norm": 3.375, "learning_rate": 1.6268364657382196e-05, "loss": 0.3239, "step": 870 }, { "epoch": 2.49, "eval_loss": 0.7192624807357788, "eval_runtime": 1.0271, "eval_samples_per_second": 151.878, "eval_steps_per_second": 7.789, "step": 870 }, { "epoch": 2.52, "grad_norm": 3.578125, "learning_rate": 1.6156364461121255e-05, "loss": 0.3255, "step": 880 }, { "epoch": 2.52, "eval_loss": 0.7301867604255676, "eval_runtime": 1.0282, "eval_samples_per_second": 151.724, "eval_steps_per_second": 7.781, "step": 880 }, { "epoch": 2.55, "grad_norm": 4.0, "learning_rate": 1.6043106610507683e-05, "loss": 0.3167, "step": 890 }, { "epoch": 2.55, "eval_loss": 0.7298671007156372, "eval_runtime": 1.0278, "eval_samples_per_second": 151.783, "eval_steps_per_second": 7.784, "step": 890 }, { "epoch": 2.58, "grad_norm": 3.140625, "learning_rate": 1.5928614242447965e-05, "loss": 0.3157, "step": 900 }, { "epoch": 2.58, "eval_loss": 0.726569652557373, "eval_runtime": 1.0303, "eval_samples_per_second": 151.412, "eval_steps_per_second": 7.765, "step": 900 }, { "epoch": 2.61, "grad_norm": 3.28125, "learning_rate": 1.581291074604226e-05, "loss": 0.3128, "step": 910 }, { "epoch": 2.61, "eval_loss": 0.7234473824501038, "eval_runtime": 1.0294, "eval_samples_per_second": 151.547, "eval_steps_per_second": 7.772, "step": 910 }, { "epoch": 2.64, "grad_norm": 3.1875, "learning_rate": 1.5696019757806373e-05, "loss": 0.2989, "step": 920 }, { "epoch": 2.64, "eval_loss": 0.7260815501213074, "eval_runtime": 1.0277, "eval_samples_per_second": 151.791, "eval_steps_per_second": 7.784, "step": 920 }, { "epoch": 2.66, "grad_norm": 3.28125, "learning_rate": 1.5577965156843153e-05, "loss": 0.3263, "step": 930 }, { "epoch": 2.66, "eval_loss": 0.7242666482925415, "eval_runtime": 1.0291, "eval_samples_per_second": 151.596, "eval_steps_per_second": 7.774, "step": 930 }, { "epoch": 2.69, "grad_norm": 3.484375, "learning_rate": 1.5458771059964348e-05, "loss": 0.3189, "step": 940 }, { "epoch": 2.69, "eval_loss": 0.7277385592460632, "eval_runtime": 1.0279, "eval_samples_per_second": 151.759, "eval_steps_per_second": 7.783, "step": 940 }, { "epoch": 2.72, "grad_norm": 3.5625, "learning_rate": 1.533846181676389e-05, "loss": 0.3129, "step": 950 }, { "epoch": 2.72, "eval_loss": 0.729342520236969, "eval_runtime": 1.0269, "eval_samples_per_second": 151.913, "eval_steps_per_second": 7.79, "step": 950 }, { "epoch": 2.75, "grad_norm": 3.328125, "learning_rate": 1.521706200464364e-05, "loss": 0.305, "step": 960 }, { "epoch": 2.75, "eval_loss": 0.7262902855873108, "eval_runtime": 1.0289, "eval_samples_per_second": 151.621, "eval_steps_per_second": 7.775, "step": 960 }, { "epoch": 2.78, "grad_norm": 2.296875, "learning_rate": 1.509459642379259e-05, "loss": 0.3115, "step": 970 }, { "epoch": 2.78, "eval_loss": 0.7204219698905945, "eval_runtime": 1.0284, "eval_samples_per_second": 151.688, "eval_steps_per_second": 7.779, "step": 970 }, { "epoch": 2.81, "grad_norm": 2.34375, "learning_rate": 1.4971090092120544e-05, "loss": 0.302, "step": 980 }, { "epoch": 2.81, "eval_loss": 0.7241439819335938, "eval_runtime": 1.0293, "eval_samples_per_second": 151.563, "eval_steps_per_second": 7.772, "step": 980 }, { "epoch": 2.84, "grad_norm": 2.953125, "learning_rate": 1.4846568240147327e-05, "loss": 0.3218, "step": 990 }, { "epoch": 2.84, "eval_loss": 0.7206692695617676, "eval_runtime": 1.0291, "eval_samples_per_second": 151.585, "eval_steps_per_second": 7.774, "step": 990 }, { "epoch": 2.87, "grad_norm": 3.328125, "learning_rate": 1.4721056305848571e-05, "loss": 0.3202, "step": 1000 }, { "epoch": 2.87, "eval_loss": 0.728424608707428, "eval_runtime": 1.0285, "eval_samples_per_second": 151.678, "eval_steps_per_second": 7.778, "step": 1000 }, { "epoch": 2.89, "grad_norm": 2.640625, "learning_rate": 1.4594579929459107e-05, "loss": 0.328, "step": 1010 }, { "epoch": 2.89, "eval_loss": 0.7225694060325623, "eval_runtime": 1.024, "eval_samples_per_second": 152.344, "eval_steps_per_second": 7.813, "step": 1010 }, { "epoch": 2.92, "grad_norm": 3.921875, "learning_rate": 1.446716494823504e-05, "loss": 0.3139, "step": 1020 }, { "epoch": 2.92, "eval_loss": 0.7302761673927307, "eval_runtime": 1.0249, "eval_samples_per_second": 152.213, "eval_steps_per_second": 7.806, "step": 1020 }, { "epoch": 2.95, "grad_norm": 3.546875, "learning_rate": 1.4338837391175582e-05, "loss": 0.3337, "step": 1030 }, { "epoch": 2.95, "eval_loss": 0.72865229845047, "eval_runtime": 1.0269, "eval_samples_per_second": 151.911, "eval_steps_per_second": 7.79, "step": 1030 }, { "epoch": 2.98, "grad_norm": 3.296875, "learning_rate": 1.4209623473705722e-05, "loss": 0.3162, "step": 1040 }, { "epoch": 2.98, "eval_loss": 0.7287803292274475, "eval_runtime": 1.0261, "eval_samples_per_second": 152.029, "eval_steps_per_second": 7.796, "step": 1040 }, { "epoch": 3.01, "grad_norm": 3.203125, "learning_rate": 1.4079549592320782e-05, "loss": 0.2787, "step": 1050 }, { "epoch": 3.01, "eval_loss": 0.7424219250679016, "eval_runtime": 1.0318, "eval_samples_per_second": 151.185, "eval_steps_per_second": 7.753, "step": 1050 }, { "epoch": 3.04, "grad_norm": 3.3125, "learning_rate": 1.3948642319194021e-05, "loss": 0.178, "step": 1060 }, { "epoch": 3.04, "eval_loss": 0.8832307457923889, "eval_runtime": 1.0289, "eval_samples_per_second": 151.625, "eval_steps_per_second": 7.776, "step": 1060 }, { "epoch": 3.07, "grad_norm": 3.265625, "learning_rate": 1.3816928396748321e-05, "loss": 0.1773, "step": 1070 }, { "epoch": 3.07, "eval_loss": 0.8206979632377625, "eval_runtime": 1.0285, "eval_samples_per_second": 151.673, "eval_steps_per_second": 7.778, "step": 1070 }, { "epoch": 3.09, "grad_norm": 2.703125, "learning_rate": 1.3684434732193107e-05, "loss": 0.1742, "step": 1080 }, { "epoch": 3.09, "eval_loss": 0.8235818147659302, "eval_runtime": 1.0294, "eval_samples_per_second": 151.551, "eval_steps_per_second": 7.772, "step": 1080 }, { "epoch": 3.12, "grad_norm": 3.0, "learning_rate": 1.3551188392027606e-05, "loss": 0.1864, "step": 1090 }, { "epoch": 3.12, "eval_loss": 0.8448302149772644, "eval_runtime": 1.0291, "eval_samples_per_second": 151.583, "eval_steps_per_second": 7.773, "step": 1090 }, { "epoch": 3.15, "grad_norm": 2.71875, "learning_rate": 1.3417216596511557e-05, "loss": 0.1687, "step": 1100 }, { "epoch": 3.15, "eval_loss": 0.8366988897323608, "eval_runtime": 1.0292, "eval_samples_per_second": 151.568, "eval_steps_per_second": 7.773, "step": 1100 }, { "epoch": 3.18, "grad_norm": 2.828125, "learning_rate": 1.328254671410452e-05, "loss": 0.1856, "step": 1110 }, { "epoch": 3.18, "eval_loss": 0.8296992182731628, "eval_runtime": 1.03, "eval_samples_per_second": 151.449, "eval_steps_per_second": 7.767, "step": 1110 }, { "epoch": 3.21, "grad_norm": 3.28125, "learning_rate": 1.3147206255874886e-05, "loss": 0.1802, "step": 1120 }, { "epoch": 3.21, "eval_loss": 0.8381376266479492, "eval_runtime": 1.0295, "eval_samples_per_second": 151.53, "eval_steps_per_second": 7.771, "step": 1120 }, { "epoch": 3.24, "grad_norm": 2.34375, "learning_rate": 1.3011222869879796e-05, "loss": 0.1795, "step": 1130 }, { "epoch": 3.24, "eval_loss": 0.8237268328666687, "eval_runtime": 1.0284, "eval_samples_per_second": 151.688, "eval_steps_per_second": 7.779, "step": 1130 }, { "epoch": 3.27, "grad_norm": 2.71875, "learning_rate": 1.287462433551704e-05, "loss": 0.1837, "step": 1140 }, { "epoch": 3.27, "eval_loss": 0.8503643870353699, "eval_runtime": 1.0292, "eval_samples_per_second": 151.576, "eval_steps_per_second": 7.773, "step": 1140 }, { "epoch": 3.3, "grad_norm": 3.375, "learning_rate": 1.273743855785012e-05, "loss": 0.1706, "step": 1150 }, { "epoch": 3.3, "eval_loss": 0.8583101034164429, "eval_runtime": 1.0279, "eval_samples_per_second": 151.76, "eval_steps_per_second": 7.783, "step": 1150 }, { "epoch": 3.32, "grad_norm": 2.421875, "learning_rate": 1.25996935619077e-05, "loss": 0.1832, "step": 1160 }, { "epoch": 3.32, "eval_loss": 0.8319224715232849, "eval_runtime": 1.0282, "eval_samples_per_second": 151.714, "eval_steps_per_second": 7.78, "step": 1160 }, { "epoch": 3.35, "grad_norm": 3.34375, "learning_rate": 1.2461417486958463e-05, "loss": 0.196, "step": 1170 }, { "epoch": 3.35, "eval_loss": 0.8305267095565796, "eval_runtime": 1.0279, "eval_samples_per_second": 151.768, "eval_steps_per_second": 7.783, "step": 1170 }, { "epoch": 3.38, "grad_norm": 3.0, "learning_rate": 1.2322638580762684e-05, "loss": 0.1769, "step": 1180 }, { "epoch": 3.38, "eval_loss": 0.8342500329017639, "eval_runtime": 1.028, "eval_samples_per_second": 151.75, "eval_steps_per_second": 7.782, "step": 1180 }, { "epoch": 3.41, "grad_norm": 2.28125, "learning_rate": 1.2183385193801655e-05, "loss": 0.175, "step": 1190 }, { "epoch": 3.41, "eval_loss": 0.8393198847770691, "eval_runtime": 1.0287, "eval_samples_per_second": 151.644, "eval_steps_per_second": 7.777, "step": 1190 }, { "epoch": 3.44, "grad_norm": 3.078125, "learning_rate": 1.2043685773486073e-05, "loss": 0.1795, "step": 1200 }, { "epoch": 3.44, "eval_loss": 0.8404285907745361, "eval_runtime": 1.029, "eval_samples_per_second": 151.601, "eval_steps_per_second": 7.774, "step": 1200 }, { "epoch": 3.47, "grad_norm": 2.734375, "learning_rate": 1.1903568858344667e-05, "loss": 0.1908, "step": 1210 }, { "epoch": 3.47, "eval_loss": 0.8395776152610779, "eval_runtime": 1.03, "eval_samples_per_second": 151.454, "eval_steps_per_second": 7.767, "step": 1210 }, { "epoch": 3.5, "grad_norm": 2.65625, "learning_rate": 1.1763063072194181e-05, "loss": 0.1937, "step": 1220 }, { "epoch": 3.5, "eval_loss": 0.8442292809486389, "eval_runtime": 1.029, "eval_samples_per_second": 151.598, "eval_steps_per_second": 7.774, "step": 1220 }, { "epoch": 3.52, "grad_norm": 3.78125, "learning_rate": 1.1622197118291982e-05, "loss": 0.1923, "step": 1230 }, { "epoch": 3.52, "eval_loss": 0.8377634882926941, "eval_runtime": 1.0302, "eval_samples_per_second": 151.425, "eval_steps_per_second": 7.765, "step": 1230 }, { "epoch": 3.55, "grad_norm": 3.46875, "learning_rate": 1.148099977347238e-05, "loss": 0.1879, "step": 1240 }, { "epoch": 3.55, "eval_loss": 0.8426746726036072, "eval_runtime": 1.0276, "eval_samples_per_second": 151.807, "eval_steps_per_second": 7.785, "step": 1240 }, { "epoch": 3.58, "grad_norm": 2.28125, "learning_rate": 1.1339499882267955e-05, "loss": 0.177, "step": 1250 }, { "epoch": 3.58, "eval_loss": 0.8470408916473389, "eval_runtime": 1.0317, "eval_samples_per_second": 151.209, "eval_steps_per_second": 7.754, "step": 1250 }, { "epoch": 3.61, "grad_norm": 3.234375, "learning_rate": 1.1197726351017052e-05, "loss": 0.1779, "step": 1260 }, { "epoch": 3.61, "eval_loss": 0.8529857993125916, "eval_runtime": 1.0272, "eval_samples_per_second": 151.863, "eval_steps_per_second": 7.788, "step": 1260 }, { "epoch": 3.64, "grad_norm": 2.5, "learning_rate": 1.1055708141958634e-05, "loss": 0.1921, "step": 1270 }, { "epoch": 3.64, "eval_loss": 0.8385663032531738, "eval_runtime": 1.0289, "eval_samples_per_second": 151.612, "eval_steps_per_second": 7.775, "step": 1270 }, { "epoch": 3.67, "grad_norm": 2.84375, "learning_rate": 1.091347426731573e-05, "loss": 0.181, "step": 1280 }, { "epoch": 3.67, "eval_loss": 0.8298706412315369, "eval_runtime": 1.0284, "eval_samples_per_second": 151.696, "eval_steps_per_second": 7.779, "step": 1280 }, { "epoch": 3.7, "grad_norm": 2.859375, "learning_rate": 1.0771053783368647e-05, "loss": 0.1741, "step": 1290 }, { "epoch": 3.7, "eval_loss": 0.8360311985015869, "eval_runtime": 1.0276, "eval_samples_per_second": 151.811, "eval_steps_per_second": 7.785, "step": 1290 }, { "epoch": 3.72, "grad_norm": 2.46875, "learning_rate": 1.0628475784519248e-05, "loss": 0.1729, "step": 1300 }, { "epoch": 3.72, "eval_loss": 0.8473387360572815, "eval_runtime": 1.0281, "eval_samples_per_second": 151.736, "eval_steps_per_second": 7.781, "step": 1300 }, { "epoch": 3.75, "grad_norm": 3.015625, "learning_rate": 1.0485769397347335e-05, "loss": 0.1778, "step": 1310 }, { "epoch": 3.75, "eval_loss": 0.8452015519142151, "eval_runtime": 1.028, "eval_samples_per_second": 151.746, "eval_steps_per_second": 7.782, "step": 1310 }, { "epoch": 3.78, "grad_norm": 3.515625, "learning_rate": 1.0342963774660566e-05, "loss": 0.1754, "step": 1320 }, { "epoch": 3.78, "eval_loss": 0.842901349067688, "eval_runtime": 1.0283, "eval_samples_per_second": 151.702, "eval_steps_per_second": 7.78, "step": 1320 }, { "epoch": 3.81, "grad_norm": 3.15625, "learning_rate": 1.0200088089538944e-05, "loss": 0.179, "step": 1330 }, { "epoch": 3.81, "eval_loss": 0.8430735468864441, "eval_runtime": 1.0292, "eval_samples_per_second": 151.567, "eval_steps_per_second": 7.773, "step": 1330 }, { "epoch": 3.84, "grad_norm": 2.203125, "learning_rate": 1.0057171529375192e-05, "loss": 0.1788, "step": 1340 }, { "epoch": 3.84, "eval_loss": 0.8360325694084167, "eval_runtime": 1.0296, "eval_samples_per_second": 151.513, "eval_steps_per_second": 7.77, "step": 1340 }, { "epoch": 3.87, "grad_norm": 2.453125, "learning_rate": 9.9142432899122e-06, "loss": 0.1776, "step": 1350 }, { "epoch": 3.87, "eval_loss": 0.8384283185005188, "eval_runtime": 1.0278, "eval_samples_per_second": 151.775, "eval_steps_per_second": 7.783, "step": 1350 }, { "epoch": 3.9, "grad_norm": 2.703125, "learning_rate": 9.77133256927877e-06, "loss": 0.1716, "step": 1360 }, { "epoch": 3.9, "eval_loss": 0.8421617150306702, "eval_runtime": 1.0297, "eval_samples_per_second": 151.496, "eval_steps_per_second": 7.769, "step": 1360 }, { "epoch": 3.93, "grad_norm": 2.625, "learning_rate": 9.628468562024858e-06, "loss": 0.1885, "step": 1370 }, { "epoch": 3.93, "eval_loss": 0.8416627645492554, "eval_runtime": 1.0264, "eval_samples_per_second": 151.987, "eval_steps_per_second": 7.794, "step": 1370 }, { "epoch": 3.95, "grad_norm": 3.140625, "learning_rate": 9.48568045315758e-06, "loss": 0.1858, "step": 1380 }, { "epoch": 3.95, "eval_loss": 0.8395534157752991, "eval_runtime": 1.0274, "eval_samples_per_second": 151.842, "eval_steps_per_second": 7.787, "step": 1380 }, { "epoch": 3.98, "grad_norm": 3.046875, "learning_rate": 9.342997412179123e-06, "loss": 0.1765, "step": 1390 }, { "epoch": 3.98, "eval_loss": 0.8416285514831543, "eval_runtime": 1.0276, "eval_samples_per_second": 151.814, "eval_steps_per_second": 7.785, "step": 1390 }, { "epoch": 4.01, "grad_norm": 1.84375, "learning_rate": 9.200448587127852e-06, "loss": 0.16, "step": 1400 }, { "epoch": 4.01, "eval_loss": 0.8607451319694519, "eval_runtime": 1.0298, "eval_samples_per_second": 151.48, "eval_steps_per_second": 7.768, "step": 1400 }, { "epoch": 4.04, "grad_norm": 1.96875, "learning_rate": 9.058063098623808e-06, "loss": 0.1175, "step": 1410 }, { "epoch": 4.04, "eval_loss": 0.9675983786582947, "eval_runtime": 1.0302, "eval_samples_per_second": 151.433, "eval_steps_per_second": 7.766, "step": 1410 }, { "epoch": 4.07, "grad_norm": 2.9375, "learning_rate": 8.915870033919785e-06, "loss": 0.1266, "step": 1420 }, { "epoch": 4.07, "eval_loss": 0.9722825884819031, "eval_runtime": 1.0285, "eval_samples_per_second": 151.675, "eval_steps_per_second": 7.778, "step": 1420 }, { "epoch": 4.1, "grad_norm": 2.140625, "learning_rate": 8.77389844095923e-06, "loss": 0.1232, "step": 1430 }, { "epoch": 4.1, "eval_loss": 0.9478564262390137, "eval_runtime": 1.0286, "eval_samples_per_second": 151.668, "eval_steps_per_second": 7.778, "step": 1430 }, { "epoch": 4.13, "grad_norm": 1.9296875, "learning_rate": 8.632177322442198e-06, "loss": 0.1202, "step": 1440 }, { "epoch": 4.13, "eval_loss": 0.958242654800415, "eval_runtime": 1.0294, "eval_samples_per_second": 151.538, "eval_steps_per_second": 7.771, "step": 1440 }, { "epoch": 4.15, "grad_norm": 2.234375, "learning_rate": 8.490735629900497e-06, "loss": 0.1163, "step": 1450 }, { "epoch": 4.15, "eval_loss": 0.9646920561790466, "eval_runtime": 1.0278, "eval_samples_per_second": 151.781, "eval_steps_per_second": 7.784, "step": 1450 }, { "epoch": 4.18, "grad_norm": 2.1875, "learning_rate": 8.349602257783347e-06, "loss": 0.1179, "step": 1460 }, { "epoch": 4.18, "eval_loss": 0.968169093132019, "eval_runtime": 1.028, "eval_samples_per_second": 151.752, "eval_steps_per_second": 7.782, "step": 1460 }, { "epoch": 4.21, "grad_norm": 2.828125, "learning_rate": 8.208806037554645e-06, "loss": 0.1281, "step": 1470 }, { "epoch": 4.21, "eval_loss": 0.9741702079772949, "eval_runtime": 1.027, "eval_samples_per_second": 151.895, "eval_steps_per_second": 7.789, "step": 1470 }, { "epoch": 4.24, "grad_norm": 2.9375, "learning_rate": 8.068375731803151e-06, "loss": 0.1259, "step": 1480 }, { "epoch": 4.24, "eval_loss": 0.9701966643333435, "eval_runtime": 1.027, "eval_samples_per_second": 151.902, "eval_steps_per_second": 7.79, "step": 1480 }, { "epoch": 4.27, "grad_norm": 2.578125, "learning_rate": 7.928340028366687e-06, "loss": 0.1242, "step": 1490 }, { "epoch": 4.27, "eval_loss": 0.9706182479858398, "eval_runtime": 1.0282, "eval_samples_per_second": 151.719, "eval_steps_per_second": 7.78, "step": 1490 }, { "epoch": 4.3, "grad_norm": 2.953125, "learning_rate": 7.788727534471655e-06, "loss": 0.1237, "step": 1500 }, { "epoch": 4.3, "eval_loss": 0.9681195020675659, "eval_runtime": 1.0278, "eval_samples_per_second": 151.775, "eval_steps_per_second": 7.783, "step": 1500 }, { "epoch": 4.33, "grad_norm": 2.375, "learning_rate": 7.649566770889003e-06, "loss": 0.1213, "step": 1510 }, { "epoch": 4.33, "eval_loss": 0.9657440185546875, "eval_runtime": 1.0262, "eval_samples_per_second": 152.022, "eval_steps_per_second": 7.796, "step": 1510 }, { "epoch": 4.36, "grad_norm": 2.34375, "learning_rate": 7.510886166107833e-06, "loss": 0.1263, "step": 1520 }, { "epoch": 4.36, "eval_loss": 0.9629148840904236, "eval_runtime": 1.0251, "eval_samples_per_second": 152.175, "eval_steps_per_second": 7.804, "step": 1520 }, { "epoch": 4.38, "grad_norm": 2.5625, "learning_rate": 7.3727140505279045e-06, "loss": 0.1371, "step": 1530 }, { "epoch": 4.38, "eval_loss": 0.9612699747085571, "eval_runtime": 1.0271, "eval_samples_per_second": 151.883, "eval_steps_per_second": 7.789, "step": 1530 }, { "epoch": 4.41, "grad_norm": 2.328125, "learning_rate": 7.235078650672141e-06, "loss": 0.1247, "step": 1540 }, { "epoch": 4.41, "eval_loss": 0.9614297151565552, "eval_runtime": 1.0264, "eval_samples_per_second": 151.994, "eval_steps_per_second": 7.795, "step": 1540 }, { "epoch": 4.44, "grad_norm": 2.46875, "learning_rate": 7.098008083420359e-06, "loss": 0.1283, "step": 1550 }, { "epoch": 4.44, "eval_loss": 0.9646217226982117, "eval_runtime": 1.027, "eval_samples_per_second": 151.9, "eval_steps_per_second": 7.79, "step": 1550 }, { "epoch": 4.47, "grad_norm": 2.671875, "learning_rate": 6.961530350265427e-06, "loss": 0.1299, "step": 1560 }, { "epoch": 4.47, "eval_loss": 0.9594274759292603, "eval_runtime": 1.028, "eval_samples_per_second": 151.748, "eval_steps_per_second": 7.782, "step": 1560 }, { "epoch": 4.5, "grad_norm": 2.390625, "learning_rate": 6.825673331592952e-06, "loss": 0.128, "step": 1570 }, { "epoch": 4.5, "eval_loss": 0.9570373892784119, "eval_runtime": 1.0284, "eval_samples_per_second": 151.689, "eval_steps_per_second": 7.779, "step": 1570 }, { "epoch": 4.53, "grad_norm": 1.8203125, "learning_rate": 6.6904647809857484e-06, "loss": 0.1214, "step": 1580 }, { "epoch": 4.53, "eval_loss": 0.9542241096496582, "eval_runtime": 1.0301, "eval_samples_per_second": 151.448, "eval_steps_per_second": 7.767, "step": 1580 }, { "epoch": 4.56, "grad_norm": 3.125, "learning_rate": 6.55593231955417e-06, "loss": 0.1231, "step": 1590 }, { "epoch": 4.56, "eval_loss": 0.9523473978042603, "eval_runtime": 1.0294, "eval_samples_per_second": 151.54, "eval_steps_per_second": 7.771, "step": 1590 }, { "epoch": 4.58, "grad_norm": 2.3125, "learning_rate": 6.42210343029354e-06, "loss": 0.1224, "step": 1600 }, { "epoch": 4.58, "eval_loss": 0.9537341594696045, "eval_runtime": 1.0282, "eval_samples_per_second": 151.725, "eval_steps_per_second": 7.781, "step": 1600 }, { "epoch": 4.61, "grad_norm": 2.171875, "learning_rate": 6.289005452469778e-06, "loss": 0.1273, "step": 1610 }, { "epoch": 4.61, "eval_loss": 0.9624239802360535, "eval_runtime": 1.0299, "eval_samples_per_second": 151.472, "eval_steps_per_second": 7.768, "step": 1610 }, { "epoch": 4.64, "grad_norm": 3.28125, "learning_rate": 6.156665576034383e-06, "loss": 0.1304, "step": 1620 }, { "epoch": 4.64, "eval_loss": 0.9658766388893127, "eval_runtime": 1.0299, "eval_samples_per_second": 151.466, "eval_steps_per_second": 7.768, "step": 1620 }, { "epoch": 4.67, "grad_norm": 2.234375, "learning_rate": 6.025110836069939e-06, "loss": 0.1258, "step": 1630 }, { "epoch": 4.67, "eval_loss": 0.9638926386833191, "eval_runtime": 1.0292, "eval_samples_per_second": 151.567, "eval_steps_per_second": 7.773, "step": 1630 }, { "epoch": 4.7, "grad_norm": 1.7109375, "learning_rate": 5.89436810726725e-06, "loss": 0.1184, "step": 1640 }, { "epoch": 4.7, "eval_loss": 0.965703547000885, "eval_runtime": 1.0286, "eval_samples_per_second": 151.668, "eval_steps_per_second": 7.778, "step": 1640 }, { "epoch": 4.73, "grad_norm": 2.265625, "learning_rate": 5.764464098435216e-06, "loss": 0.1231, "step": 1650 }, { "epoch": 4.73, "eval_loss": 0.9644750952720642, "eval_runtime": 1.0266, "eval_samples_per_second": 151.951, "eval_steps_per_second": 7.792, "step": 1650 }, { "epoch": 4.76, "grad_norm": 2.71875, "learning_rate": 5.635425347044639e-06, "loss": 0.1278, "step": 1660 }, { "epoch": 4.76, "eval_loss": 0.9634093046188354, "eval_runtime": 1.0271, "eval_samples_per_second": 151.887, "eval_steps_per_second": 7.789, "step": 1660 }, { "epoch": 4.79, "grad_norm": 2.296875, "learning_rate": 5.507278213807009e-06, "loss": 0.1294, "step": 1670 }, { "epoch": 4.79, "eval_loss": 0.9675451517105103, "eval_runtime": 1.0289, "eval_samples_per_second": 151.623, "eval_steps_per_second": 7.776, "step": 1670 }, { "epoch": 4.81, "grad_norm": 2.609375, "learning_rate": 5.380048877289381e-06, "loss": 0.1236, "step": 1680 }, { "epoch": 4.81, "eval_loss": 0.9684876799583435, "eval_runtime": 1.027, "eval_samples_per_second": 151.904, "eval_steps_per_second": 7.79, "step": 1680 }, { "epoch": 4.84, "grad_norm": 2.546875, "learning_rate": 5.253763328566494e-06, "loss": 0.1237, "step": 1690 }, { "epoch": 4.84, "eval_loss": 0.968744158744812, "eval_runtime": 1.0279, "eval_samples_per_second": 151.763, "eval_steps_per_second": 7.783, "step": 1690 }, { "epoch": 4.87, "grad_norm": 2.515625, "learning_rate": 5.128447365911185e-06, "loss": 0.1274, "step": 1700 }, { "epoch": 4.87, "eval_loss": 0.9674803614616394, "eval_runtime": 1.0277, "eval_samples_per_second": 151.797, "eval_steps_per_second": 7.784, "step": 1700 }, { "epoch": 4.9, "grad_norm": 3.546875, "learning_rate": 5.004126589524174e-06, "loss": 0.1307, "step": 1710 }, { "epoch": 4.9, "eval_loss": 0.9692357182502747, "eval_runtime": 1.0277, "eval_samples_per_second": 151.793, "eval_steps_per_second": 7.784, "step": 1710 }, { "epoch": 4.93, "grad_norm": 2.0625, "learning_rate": 4.880826396304312e-06, "loss": 0.1161, "step": 1720 }, { "epoch": 4.93, "eval_loss": 0.9701640605926514, "eval_runtime": 1.0289, "eval_samples_per_second": 151.617, "eval_steps_per_second": 7.775, "step": 1720 }, { "epoch": 4.96, "grad_norm": 2.171875, "learning_rate": 4.75857197466039e-06, "loss": 0.1221, "step": 1730 }, { "epoch": 4.96, "eval_loss": 0.9716487526893616, "eval_runtime": 1.0275, "eval_samples_per_second": 151.819, "eval_steps_per_second": 7.786, "step": 1730 }, { "epoch": 4.99, "grad_norm": 2.03125, "learning_rate": 4.6373882993655e-06, "loss": 0.1238, "step": 1740 }, { "epoch": 4.99, "eval_loss": 0.9733632206916809, "eval_runtime": 1.0281, "eval_samples_per_second": 151.736, "eval_steps_per_second": 7.781, "step": 1740 }, { "epoch": 5.01, "grad_norm": 1.6953125, "learning_rate": 4.5173001264550665e-06, "loss": 0.1129, "step": 1750 }, { "epoch": 5.01, "eval_loss": 0.9793078899383545, "eval_runtime": 1.0286, "eval_samples_per_second": 151.66, "eval_steps_per_second": 7.777, "step": 1750 }, { "epoch": 5.04, "grad_norm": 2.140625, "learning_rate": 4.398331988169559e-06, "loss": 0.1042, "step": 1760 }, { "epoch": 5.04, "eval_loss": 0.9991005063056946, "eval_runtime": 1.0276, "eval_samples_per_second": 151.816, "eval_steps_per_second": 7.785, "step": 1760 }, { "epoch": 5.07, "grad_norm": 2.234375, "learning_rate": 4.280508187942913e-06, "loss": 0.1076, "step": 1770 }, { "epoch": 5.07, "eval_loss": 1.0183870792388916, "eval_runtime": 1.0276, "eval_samples_per_second": 151.815, "eval_steps_per_second": 7.785, "step": 1770 }, { "epoch": 5.1, "grad_norm": 2.71875, "learning_rate": 4.1638527954376865e-06, "loss": 0.1061, "step": 1780 }, { "epoch": 5.1, "eval_loss": 1.0275365114212036, "eval_runtime": 1.0276, "eval_samples_per_second": 151.803, "eval_steps_per_second": 7.785, "step": 1780 }, { "epoch": 5.13, "grad_norm": 1.8046875, "learning_rate": 4.048389641628002e-06, "loss": 0.1016, "step": 1790 }, { "epoch": 5.13, "eval_loss": 1.0309237241744995, "eval_runtime": 1.0283, "eval_samples_per_second": 151.702, "eval_steps_per_second": 7.78, "step": 1790 }, { "epoch": 5.16, "grad_norm": 2.015625, "learning_rate": 3.934142313931215e-06, "loss": 0.1067, "step": 1800 }, { "epoch": 5.16, "eval_loss": 1.0333725214004517, "eval_runtime": 1.0272, "eval_samples_per_second": 151.866, "eval_steps_per_second": 7.788, "step": 1800 }, { "epoch": 5.19, "grad_norm": 2.3125, "learning_rate": 3.821134151389367e-06, "loss": 0.104, "step": 1810 }, { "epoch": 5.19, "eval_loss": 1.0376129150390625, "eval_runtime": 1.0294, "eval_samples_per_second": 151.541, "eval_steps_per_second": 7.771, "step": 1810 }, { "epoch": 5.21, "grad_norm": 2.328125, "learning_rate": 3.7093882399013504e-06, "loss": 0.1086, "step": 1820 }, { "epoch": 5.21, "eval_loss": 1.0411092042922974, "eval_runtime": 1.0277, "eval_samples_per_second": 151.8, "eval_steps_per_second": 7.785, "step": 1820 }, { "epoch": 5.24, "grad_norm": 1.5625, "learning_rate": 3.5989274075068005e-06, "loss": 0.1025, "step": 1830 }, { "epoch": 5.24, "eval_loss": 1.0420621633529663, "eval_runtime": 1.0283, "eval_samples_per_second": 151.71, "eval_steps_per_second": 7.78, "step": 1830 }, { "epoch": 5.27, "grad_norm": 2.421875, "learning_rate": 3.489774219722689e-06, "loss": 0.107, "step": 1840 }, { "epoch": 5.27, "eval_loss": 1.0420392751693726, "eval_runtime": 1.0306, "eval_samples_per_second": 151.368, "eval_steps_per_second": 7.762, "step": 1840 }, { "epoch": 5.3, "grad_norm": 2.0, "learning_rate": 3.3819509749334843e-06, "loss": 0.0982, "step": 1850 }, { "epoch": 5.3, "eval_loss": 1.0423657894134521, "eval_runtime": 1.0283, "eval_samples_per_second": 151.71, "eval_steps_per_second": 7.78, "step": 1850 }, { "epoch": 5.33, "grad_norm": 1.6796875, "learning_rate": 3.2754796998359427e-06, "loss": 0.1067, "step": 1860 }, { "epoch": 5.33, "eval_loss": 1.0411757230758667, "eval_runtime": 1.0278, "eval_samples_per_second": 151.776, "eval_steps_per_second": 7.783, "step": 1860 }, { "epoch": 5.36, "grad_norm": 1.96875, "learning_rate": 3.1703821449393856e-06, "loss": 0.1079, "step": 1870 }, { "epoch": 5.36, "eval_loss": 1.04030179977417, "eval_runtime": 1.0281, "eval_samples_per_second": 151.741, "eval_steps_per_second": 7.782, "step": 1870 }, { "epoch": 5.39, "grad_norm": 1.2109375, "learning_rate": 3.066679780122368e-06, "loss": 0.1027, "step": 1880 }, { "epoch": 5.39, "eval_loss": 1.0408295392990112, "eval_runtime": 1.0296, "eval_samples_per_second": 151.52, "eval_steps_per_second": 7.77, "step": 1880 }, { "epoch": 5.42, "grad_norm": 1.75, "learning_rate": 2.964393790246728e-06, "loss": 0.1035, "step": 1890 }, { "epoch": 5.42, "eval_loss": 1.0431201457977295, "eval_runtime": 1.0283, "eval_samples_per_second": 151.702, "eval_steps_per_second": 7.78, "step": 1890 }, { "epoch": 5.44, "grad_norm": 1.5390625, "learning_rate": 2.863545070829804e-06, "loss": 0.1021, "step": 1900 }, { "epoch": 5.44, "eval_loss": 1.044161081314087, "eval_runtime": 1.028, "eval_samples_per_second": 151.75, "eval_steps_per_second": 7.782, "step": 1900 }, { "epoch": 5.47, "grad_norm": 2.15625, "learning_rate": 2.7641542237758134e-06, "loss": 0.1031, "step": 1910 }, { "epoch": 5.47, "eval_loss": 1.044289231300354, "eval_runtime": 1.031, "eval_samples_per_second": 151.303, "eval_steps_per_second": 7.759, "step": 1910 }, { "epoch": 5.5, "grad_norm": 2.59375, "learning_rate": 2.666241553167175e-06, "loss": 0.1064, "step": 1920 }, { "epoch": 5.5, "eval_loss": 1.0444979667663574, "eval_runtime": 1.0289, "eval_samples_per_second": 151.613, "eval_steps_per_second": 7.775, "step": 1920 }, { "epoch": 5.53, "grad_norm": 1.796875, "learning_rate": 2.569827061116684e-06, "loss": 0.1083, "step": 1930 }, { "epoch": 5.53, "eval_loss": 1.044089436531067, "eval_runtime": 1.0275, "eval_samples_per_second": 151.832, "eval_steps_per_second": 7.786, "step": 1930 }, { "epoch": 5.56, "grad_norm": 1.984375, "learning_rate": 2.474930443681377e-06, "loss": 0.1111, "step": 1940 }, { "epoch": 5.56, "eval_loss": 1.0448068380355835, "eval_runtime": 1.0281, "eval_samples_per_second": 151.742, "eval_steps_per_second": 7.782, "step": 1940 }, { "epoch": 5.59, "grad_norm": 2.109375, "learning_rate": 2.3815710868389274e-06, "loss": 0.1093, "step": 1950 }, { "epoch": 5.59, "eval_loss": 1.045329213142395, "eval_runtime": 1.0265, "eval_samples_per_second": 151.979, "eval_steps_per_second": 7.794, "step": 1950 }, { "epoch": 5.62, "grad_norm": 2.203125, "learning_rate": 2.2897680625273623e-06, "loss": 0.1062, "step": 1960 }, { "epoch": 5.62, "eval_loss": 1.0454120635986328, "eval_runtime": 1.0278, "eval_samples_per_second": 151.787, "eval_steps_per_second": 7.784, "step": 1960 }, { "epoch": 5.64, "grad_norm": 2.015625, "learning_rate": 2.199540124748957e-06, "loss": 0.1072, "step": 1970 }, { "epoch": 5.64, "eval_loss": 1.0454870462417603, "eval_runtime": 1.0277, "eval_samples_per_second": 151.79, "eval_steps_per_second": 7.784, "step": 1970 }, { "epoch": 5.67, "grad_norm": 1.90625, "learning_rate": 2.110905705739069e-06, "loss": 0.1046, "step": 1980 }, { "epoch": 5.67, "eval_loss": 1.0458413362503052, "eval_runtime": 1.0268, "eval_samples_per_second": 151.927, "eval_steps_per_second": 7.791, "step": 1980 }, { "epoch": 5.7, "grad_norm": 1.984375, "learning_rate": 2.0238829122006944e-06, "loss": 0.1078, "step": 1990 }, { "epoch": 5.7, "eval_loss": 1.0452886819839478, "eval_runtime": 1.0274, "eval_samples_per_second": 151.835, "eval_steps_per_second": 7.786, "step": 1990 }, { "epoch": 5.73, "grad_norm": 1.75, "learning_rate": 1.9384895216055533e-06, "loss": 0.1076, "step": 2000 }, { "epoch": 5.73, "eval_loss": 1.0452741384506226, "eval_runtime": 1.0267, "eval_samples_per_second": 151.95, "eval_steps_per_second": 7.792, "step": 2000 } ], "logging_steps": 10, "max_steps": 2443, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "total_flos": 9.135415373280051e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }