{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.99825479930192, "global_step": 2860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 2.9999999999999997e-05, "loss": 12.9083, "step": 10 }, { "epoch": 0.07, "learning_rate": 5.9999999999999995e-05, "loss": 12.5233, "step": 20 }, { "epoch": 0.1, "learning_rate": 8.699999999999999e-05, "loss": 12.1986, "step": 30 }, { "epoch": 0.14, "learning_rate": 0.00011099999999999999, "loss": 9.567, "step": 40 }, { "epoch": 0.17, "learning_rate": 0.000138, "loss": 6.7421, "step": 50 }, { "epoch": 0.21, "learning_rate": 0.000168, "loss": 4.3567, "step": 60 }, { "epoch": 0.24, "learning_rate": 0.000198, "loss": 3.5698, "step": 70 }, { "epoch": 0.28, "learning_rate": 0.00022799999999999999, "loss": 3.2567, "step": 80 }, { "epoch": 0.31, "learning_rate": 0.000258, "loss": 3.145, "step": 90 }, { "epoch": 0.35, "learning_rate": 0.00028799999999999995, "loss": 3.2802, "step": 100 }, { "epoch": 0.35, "eval_loss": 3.8881919384002686, "eval_runtime": 112.7654, "eval_samples_per_second": 12.867, "eval_steps_per_second": 0.807, "step": 100 }, { "epoch": 0.38, "learning_rate": 0.0002993478260869565, "loss": 3.6105, "step": 110 }, { "epoch": 0.42, "learning_rate": 0.00029826086956521737, "loss": 3.1704, "step": 120 }, { "epoch": 0.45, "learning_rate": 0.00029717391304347825, "loss": 3.1124, "step": 130 }, { "epoch": 0.49, "learning_rate": 0.0002960869565217391, "loss": 3.1266, "step": 140 }, { "epoch": 0.52, "learning_rate": 0.00029499999999999996, "loss": 3.2521, "step": 150 }, { "epoch": 0.56, "learning_rate": 0.00029391304347826084, "loss": 3.6024, "step": 160 }, { "epoch": 0.59, "learning_rate": 0.0002928260869565217, "loss": 3.2086, "step": 170 }, { "epoch": 0.63, "learning_rate": 0.0002917391304347826, "loss": 3.1616, "step": 180 }, { "epoch": 0.66, "learning_rate": 0.00029065217391304344, "loss": 3.1408, "step": 190 }, { "epoch": 0.7, "learning_rate": 0.0002895652173913043, "loss": 3.2254, "step": 200 }, { "epoch": 0.7, "eval_loss": 3.885490894317627, "eval_runtime": 111.0614, "eval_samples_per_second": 13.065, "eval_steps_per_second": 0.819, "step": 200 }, { "epoch": 0.73, "learning_rate": 0.0002884782608695652, "loss": 3.6835, "step": 210 }, { "epoch": 0.77, "learning_rate": 0.00028739130434782603, "loss": 3.1449, "step": 220 }, { "epoch": 0.8, "learning_rate": 0.0002863043478260869, "loss": 3.2003, "step": 230 }, { "epoch": 0.84, "learning_rate": 0.0002852173913043478, "loss": 3.0836, "step": 240 }, { "epoch": 0.87, "learning_rate": 0.0002841304347826087, "loss": 3.1128, "step": 250 }, { "epoch": 0.91, "learning_rate": 0.0002830434782608695, "loss": 3.3853, "step": 260 }, { "epoch": 0.94, "learning_rate": 0.0002819565217391304, "loss": 3.102, "step": 270 }, { "epoch": 0.98, "learning_rate": 0.0002808695652173913, "loss": 3.0516, "step": 280 }, { "epoch": 1.01, "learning_rate": 0.0002797826086956521, "loss": 3.5681, "step": 290 }, { "epoch": 1.05, "learning_rate": 0.00027869565217391305, "loss": 3.1697, "step": 300 }, { "epoch": 1.05, "eval_loss": 3.419605255126953, "eval_runtime": 111.0117, "eval_samples_per_second": 13.071, "eval_steps_per_second": 0.82, "step": 300 }, { "epoch": 1.08, "learning_rate": 0.0002776086956521739, "loss": 3.0489, "step": 310 }, { "epoch": 1.12, "learning_rate": 0.00027652173913043476, "loss": 3.0347, "step": 320 }, { "epoch": 1.15, "learning_rate": 0.00027543478260869564, "loss": 3.0201, "step": 330 }, { "epoch": 1.19, "learning_rate": 0.00027434782608695647, "loss": 3.2079, "step": 340 }, { "epoch": 1.22, "learning_rate": 0.00027326086956521735, "loss": 3.0785, "step": 350 }, { "epoch": 1.26, "learning_rate": 0.00027217391304347824, "loss": 2.9747, "step": 360 }, { "epoch": 1.29, "learning_rate": 0.0002710869565217391, "loss": 2.9664, "step": 370 }, { "epoch": 1.33, "learning_rate": 0.00027, "loss": 2.9724, "step": 380 }, { "epoch": 1.36, "learning_rate": 0.00026891304347826083, "loss": 3.1082, "step": 390 }, { "epoch": 1.4, "learning_rate": 0.0002678260869565217, "loss": 3.0258, "step": 400 }, { "epoch": 1.4, "eval_loss": 3.195385217666626, "eval_runtime": 110.895, "eval_samples_per_second": 13.084, "eval_steps_per_second": 0.821, "step": 400 }, { "epoch": 1.43, "learning_rate": 0.0002667391304347826, "loss": 2.9719, "step": 410 }, { "epoch": 1.47, "learning_rate": 0.00026565217391304343, "loss": 2.9884, "step": 420 }, { "epoch": 1.5, "learning_rate": 0.00026456521739130437, "loss": 2.9625, "step": 430 }, { "epoch": 1.54, "learning_rate": 0.0002634782608695652, "loss": 3.0735, "step": 440 }, { "epoch": 1.57, "learning_rate": 0.0002623913043478261, "loss": 2.9998, "step": 450 }, { "epoch": 1.61, "learning_rate": 0.00026130434782608696, "loss": 2.9621, "step": 460 }, { "epoch": 1.64, "learning_rate": 0.0002602173913043478, "loss": 2.9433, "step": 470 }, { "epoch": 1.68, "learning_rate": 0.0002591304347826087, "loss": 2.9256, "step": 480 }, { "epoch": 1.71, "learning_rate": 0.00025804347826086956, "loss": 2.9996, "step": 490 }, { "epoch": 1.75, "learning_rate": 0.00025695652173913044, "loss": 2.8553, "step": 500 }, { "epoch": 1.75, "eval_loss": 3.017104387283325, "eval_runtime": 110.9455, "eval_samples_per_second": 13.078, "eval_steps_per_second": 0.82, "step": 500 }, { "epoch": 1.78, "learning_rate": 0.00025586956521739127, "loss": 2.7029, "step": 510 }, { "epoch": 1.82, "learning_rate": 0.00025478260869565215, "loss": 2.4464, "step": 520 }, { "epoch": 1.85, "learning_rate": 0.00025369565217391303, "loss": 2.2426, "step": 530 }, { "epoch": 1.89, "learning_rate": 0.00025260869565217386, "loss": 2.2914, "step": 540 }, { "epoch": 1.92, "learning_rate": 0.00025152173913043475, "loss": 1.9541, "step": 550 }, { "epoch": 1.96, "learning_rate": 0.00025043478260869563, "loss": 1.5822, "step": 560 }, { "epoch": 1.99, "learning_rate": 0.0002493478260869565, "loss": 1.5446, "step": 570 }, { "epoch": 2.03, "learning_rate": 0.0002482608695652174, "loss": 2.0892, "step": 580 }, { "epoch": 2.06, "learning_rate": 0.0002471739130434782, "loss": 1.3977, "step": 590 }, { "epoch": 2.1, "learning_rate": 0.0002460869565217391, "loss": 1.2436, "step": 600 }, { "epoch": 2.1, "eval_loss": 2.155662775039673, "eval_runtime": 111.1811, "eval_samples_per_second": 13.051, "eval_steps_per_second": 0.818, "step": 600 }, { "epoch": 2.13, "learning_rate": 0.000245, "loss": 1.1344, "step": 610 }, { "epoch": 2.17, "learning_rate": 0.00024391304347826085, "loss": 1.1795, "step": 620 }, { "epoch": 2.2, "learning_rate": 0.0002428260869565217, "loss": 2.1046, "step": 630 }, { "epoch": 2.24, "learning_rate": 0.00024173913043478261, "loss": 1.0792, "step": 640 }, { "epoch": 2.27, "learning_rate": 0.00024065217391304347, "loss": 1.157, "step": 650 }, { "epoch": 2.31, "learning_rate": 0.00023956521739130433, "loss": 1.0428, "step": 660 }, { "epoch": 2.34, "learning_rate": 0.00023847826086956518, "loss": 1.1063, "step": 670 }, { "epoch": 2.38, "learning_rate": 0.00023739130434782607, "loss": 1.6716, "step": 680 }, { "epoch": 2.41, "learning_rate": 0.00023630434782608692, "loss": 0.9602, "step": 690 }, { "epoch": 2.45, "learning_rate": 0.00023521739130434778, "loss": 0.9642, "step": 700 }, { "epoch": 2.45, "eval_loss": 1.5505317449569702, "eval_runtime": 111.9226, "eval_samples_per_second": 12.964, "eval_steps_per_second": 0.813, "step": 700 }, { "epoch": 2.48, "learning_rate": 0.0002341304347826087, "loss": 0.8874, "step": 710 }, { "epoch": 2.52, "learning_rate": 0.00023304347826086954, "loss": 0.9596, "step": 720 }, { "epoch": 2.55, "learning_rate": 0.00023195652173913043, "loss": 1.3932, "step": 730 }, { "epoch": 2.59, "learning_rate": 0.00023086956521739128, "loss": 0.8699, "step": 740 }, { "epoch": 2.62, "learning_rate": 0.00022978260869565214, "loss": 0.8153, "step": 750 }, { "epoch": 2.66, "learning_rate": 0.00022869565217391302, "loss": 0.9178, "step": 760 }, { "epoch": 2.69, "learning_rate": 0.00022760869565217388, "loss": 0.9647, "step": 770 }, { "epoch": 2.73, "learning_rate": 0.00022652173913043476, "loss": 1.0555, "step": 780 }, { "epoch": 2.76, "learning_rate": 0.00022543478260869565, "loss": 0.8834, "step": 790 }, { "epoch": 2.8, "learning_rate": 0.0002243478260869565, "loss": 0.752, "step": 800 }, { "epoch": 2.8, "eval_loss": 0.9008368253707886, "eval_runtime": 111.0482, "eval_samples_per_second": 13.066, "eval_steps_per_second": 0.819, "step": 800 }, { "epoch": 2.83, "learning_rate": 0.00022326086956521736, "loss": 0.8197, "step": 810 }, { "epoch": 2.87, "learning_rate": 0.00022217391304347824, "loss": 0.7771, "step": 820 }, { "epoch": 2.9, "learning_rate": 0.0002210869565217391, "loss": 0.9506, "step": 830 }, { "epoch": 2.94, "learning_rate": 0.00021999999999999995, "loss": 0.7518, "step": 840 }, { "epoch": 2.97, "learning_rate": 0.00021891304347826086, "loss": 0.7297, "step": 850 }, { "epoch": 3.01, "learning_rate": 0.00021782608695652172, "loss": 0.8188, "step": 860 }, { "epoch": 3.04, "learning_rate": 0.0002167391304347826, "loss": 0.6478, "step": 870 }, { "epoch": 3.08, "learning_rate": 0.00021565217391304346, "loss": 0.6546, "step": 880 }, { "epoch": 3.11, "learning_rate": 0.00021456521739130432, "loss": 0.7012, "step": 890 }, { "epoch": 3.15, "learning_rate": 0.0002134782608695652, "loss": 0.6164, "step": 900 }, { "epoch": 3.15, "eval_loss": 0.6567979454994202, "eval_runtime": 111.2282, "eval_samples_per_second": 13.045, "eval_steps_per_second": 0.818, "step": 900 }, { "epoch": 3.18, "learning_rate": 0.00021239130434782605, "loss": 0.7296, "step": 910 }, { "epoch": 3.22, "learning_rate": 0.00021130434782608694, "loss": 0.6166, "step": 920 }, { "epoch": 3.25, "learning_rate": 0.00021021739130434782, "loss": 0.6354, "step": 930 }, { "epoch": 3.29, "learning_rate": 0.00020913043478260868, "loss": 0.6388, "step": 940 }, { "epoch": 3.32, "learning_rate": 0.00020804347826086953, "loss": 0.6069, "step": 950 }, { "epoch": 3.36, "learning_rate": 0.00020695652173913042, "loss": 0.7107, "step": 960 }, { "epoch": 3.39, "learning_rate": 0.00020586956521739127, "loss": 0.6162, "step": 970 }, { "epoch": 3.43, "learning_rate": 0.00020478260869565213, "loss": 0.567, "step": 980 }, { "epoch": 3.46, "learning_rate": 0.00020369565217391304, "loss": 0.5679, "step": 990 }, { "epoch": 3.5, "learning_rate": 0.0002026086956521739, "loss": 0.6362, "step": 1000 }, { "epoch": 3.5, "eval_loss": 0.5460886359214783, "eval_runtime": 110.911, "eval_samples_per_second": 13.083, "eval_steps_per_second": 0.82, "step": 1000 }, { "epoch": 3.53, "learning_rate": 0.00020152173913043478, "loss": 0.6771, "step": 1010 }, { "epoch": 3.57, "learning_rate": 0.00020043478260869563, "loss": 0.6144, "step": 1020 }, { "epoch": 3.6, "learning_rate": 0.0001993478260869565, "loss": 0.583, "step": 1030 }, { "epoch": 3.64, "learning_rate": 0.00019826086956521737, "loss": 0.5773, "step": 1040 }, { "epoch": 3.67, "learning_rate": 0.00019717391304347826, "loss": 0.5981, "step": 1050 }, { "epoch": 3.71, "learning_rate": 0.0001960869565217391, "loss": 0.6162, "step": 1060 }, { "epoch": 3.74, "learning_rate": 0.000195, "loss": 0.5387, "step": 1070 }, { "epoch": 3.77, "learning_rate": 0.00019391304347826085, "loss": 0.5651, "step": 1080 }, { "epoch": 3.81, "learning_rate": 0.0001928260869565217, "loss": 0.544, "step": 1090 }, { "epoch": 3.84, "learning_rate": 0.0001917391304347826, "loss": 0.5539, "step": 1100 }, { "epoch": 3.84, "eval_loss": 0.5009284615516663, "eval_runtime": 111.9745, "eval_samples_per_second": 12.958, "eval_steps_per_second": 0.813, "step": 1100 }, { "epoch": 3.88, "learning_rate": 0.00019065217391304345, "loss": 0.6894, "step": 1110 }, { "epoch": 3.91, "learning_rate": 0.00018956521739130436, "loss": 0.5409, "step": 1120 }, { "epoch": 3.95, "learning_rate": 0.00018847826086956521, "loss": 0.5676, "step": 1130 }, { "epoch": 3.98, "learning_rate": 0.00018739130434782607, "loss": 0.5543, "step": 1140 }, { "epoch": 4.02, "learning_rate": 0.00018630434782608695, "loss": 0.6413, "step": 1150 }, { "epoch": 4.06, "learning_rate": 0.0001852173913043478, "loss": 0.4664, "step": 1160 }, { "epoch": 4.09, "learning_rate": 0.00018413043478260867, "loss": 0.5098, "step": 1170 }, { "epoch": 4.13, "learning_rate": 0.00018304347826086955, "loss": 0.4832, "step": 1180 }, { "epoch": 4.16, "learning_rate": 0.00018195652173913043, "loss": 0.4859, "step": 1190 }, { "epoch": 4.2, "learning_rate": 0.0001808695652173913, "loss": 0.5144, "step": 1200 }, { "epoch": 4.2, "eval_loss": 0.4911543130874634, "eval_runtime": 111.7063, "eval_samples_per_second": 12.989, "eval_steps_per_second": 0.815, "step": 1200 }, { "epoch": 4.23, "learning_rate": 0.00017978260869565217, "loss": 0.4853, "step": 1210 }, { "epoch": 4.27, "learning_rate": 0.00017869565217391303, "loss": 0.5149, "step": 1220 }, { "epoch": 4.3, "learning_rate": 0.00017760869565217388, "loss": 0.4608, "step": 1230 }, { "epoch": 4.34, "learning_rate": 0.00017652173913043477, "loss": 0.4917, "step": 1240 }, { "epoch": 4.37, "learning_rate": 0.00017543478260869562, "loss": 0.4615, "step": 1250 }, { "epoch": 4.4, "learning_rate": 0.00017434782608695653, "loss": 0.4175, "step": 1260 }, { "epoch": 4.44, "learning_rate": 0.0001732608695652174, "loss": 0.4752, "step": 1270 }, { "epoch": 4.47, "learning_rate": 0.00017217391304347825, "loss": 0.4979, "step": 1280 }, { "epoch": 4.51, "learning_rate": 0.00017108695652173913, "loss": 0.4272, "step": 1290 }, { "epoch": 4.54, "learning_rate": 0.00016999999999999999, "loss": 0.5067, "step": 1300 }, { "epoch": 4.54, "eval_loss": 0.4602111279964447, "eval_runtime": 111.5487, "eval_samples_per_second": 13.008, "eval_steps_per_second": 0.816, "step": 1300 }, { "epoch": 4.58, "learning_rate": 0.00016891304347826084, "loss": 0.483, "step": 1310 }, { "epoch": 4.61, "learning_rate": 0.0001678260869565217, "loss": 0.4671, "step": 1320 }, { "epoch": 4.65, "learning_rate": 0.0001667391304347826, "loss": 0.4806, "step": 1330 }, { "epoch": 4.68, "learning_rate": 0.00016565217391304346, "loss": 0.4798, "step": 1340 }, { "epoch": 4.72, "learning_rate": 0.00016456521739130435, "loss": 0.5021, "step": 1350 }, { "epoch": 4.75, "learning_rate": 0.0001634782608695652, "loss": 0.4445, "step": 1360 }, { "epoch": 4.79, "learning_rate": 0.00016239130434782606, "loss": 0.4462, "step": 1370 }, { "epoch": 4.82, "learning_rate": 0.00016130434782608694, "loss": 0.5033, "step": 1380 }, { "epoch": 4.86, "learning_rate": 0.0001602173913043478, "loss": 0.4577, "step": 1390 }, { "epoch": 4.89, "learning_rate": 0.0001591304347826087, "loss": 0.4999, "step": 1400 }, { "epoch": 4.89, "eval_loss": 0.4281094968318939, "eval_runtime": 111.6556, "eval_samples_per_second": 12.995, "eval_steps_per_second": 0.815, "step": 1400 }, { "epoch": 4.93, "learning_rate": 0.00015804347826086956, "loss": 0.4366, "step": 1410 }, { "epoch": 4.96, "learning_rate": 0.00015695652173913042, "loss": 0.4696, "step": 1420 }, { "epoch": 5.0, "learning_rate": 0.00015586956521739128, "loss": 0.491, "step": 1430 }, { "epoch": 5.03, "learning_rate": 0.00015478260869565216, "loss": 0.4202, "step": 1440 }, { "epoch": 5.07, "learning_rate": 0.00015369565217391302, "loss": 0.4307, "step": 1450 }, { "epoch": 5.1, "learning_rate": 0.00015260869565217387, "loss": 0.4103, "step": 1460 }, { "epoch": 5.14, "learning_rate": 0.00015152173913043478, "loss": 0.4157, "step": 1470 }, { "epoch": 5.17, "learning_rate": 0.00015043478260869564, "loss": 0.4326, "step": 1480 }, { "epoch": 5.21, "learning_rate": 0.00014934782608695652, "loss": 0.3842, "step": 1490 }, { "epoch": 5.24, "learning_rate": 0.00014826086956521738, "loss": 0.4072, "step": 1500 }, { "epoch": 5.24, "eval_loss": 0.416363924741745, "eval_runtime": 111.6811, "eval_samples_per_second": 12.992, "eval_steps_per_second": 0.815, "step": 1500 }, { "epoch": 5.28, "learning_rate": 0.00014717391304347823, "loss": 0.4093, "step": 1510 }, { "epoch": 5.31, "learning_rate": 0.00014608695652173912, "loss": 0.4013, "step": 1520 }, { "epoch": 5.35, "learning_rate": 0.000145, "loss": 0.4534, "step": 1530 }, { "epoch": 5.38, "learning_rate": 0.00014391304347826086, "loss": 0.3509, "step": 1540 }, { "epoch": 5.42, "learning_rate": 0.00014282608695652174, "loss": 0.4058, "step": 1550 }, { "epoch": 5.45, "learning_rate": 0.0001417391304347826, "loss": 0.3956, "step": 1560 }, { "epoch": 5.49, "learning_rate": 0.00014065217391304345, "loss": 0.415, "step": 1570 }, { "epoch": 5.52, "learning_rate": 0.00013956521739130434, "loss": 0.4255, "step": 1580 }, { "epoch": 5.56, "learning_rate": 0.00013847826086956522, "loss": 0.3925, "step": 1590 }, { "epoch": 5.59, "learning_rate": 0.00013739130434782607, "loss": 0.3855, "step": 1600 }, { "epoch": 5.59, "eval_loss": 0.43422192335128784, "eval_runtime": 111.3707, "eval_samples_per_second": 13.029, "eval_steps_per_second": 0.817, "step": 1600 }, { "epoch": 5.63, "learning_rate": 0.00013630434782608693, "loss": 0.4364, "step": 1610 }, { "epoch": 5.66, "learning_rate": 0.00013521739130434781, "loss": 0.3908, "step": 1620 }, { "epoch": 5.7, "learning_rate": 0.0001341304347826087, "loss": 0.4419, "step": 1630 }, { "epoch": 5.73, "learning_rate": 0.00013304347826086955, "loss": 0.3667, "step": 1640 }, { "epoch": 5.77, "learning_rate": 0.0001319565217391304, "loss": 0.3971, "step": 1650 }, { "epoch": 5.8, "learning_rate": 0.0001308695652173913, "loss": 0.4102, "step": 1660 }, { "epoch": 5.84, "learning_rate": 0.00012978260869565218, "loss": 0.4063, "step": 1670 }, { "epoch": 5.87, "learning_rate": 0.00012869565217391303, "loss": 0.4576, "step": 1680 }, { "epoch": 5.91, "learning_rate": 0.00012760869565217392, "loss": 0.3524, "step": 1690 }, { "epoch": 5.94, "learning_rate": 0.00012652173913043477, "loss": 0.4075, "step": 1700 }, { "epoch": 5.94, "eval_loss": 0.40860703587532043, "eval_runtime": 112.0658, "eval_samples_per_second": 12.948, "eval_steps_per_second": 0.812, "step": 1700 }, { "epoch": 5.98, "learning_rate": 0.00012543478260869563, "loss": 0.3744, "step": 1710 }, { "epoch": 6.01, "learning_rate": 0.0001243478260869565, "loss": 0.3863, "step": 1720 }, { "epoch": 6.05, "learning_rate": 0.0001232608695652174, "loss": 0.338, "step": 1730 }, { "epoch": 6.08, "learning_rate": 0.00012217391304347825, "loss": 0.3789, "step": 1740 }, { "epoch": 6.12, "learning_rate": 0.00012108695652173912, "loss": 0.3253, "step": 1750 }, { "epoch": 6.15, "learning_rate": 0.00011999999999999999, "loss": 0.3878, "step": 1760 }, { "epoch": 6.19, "learning_rate": 0.00011891304347826086, "loss": 0.3732, "step": 1770 }, { "epoch": 6.22, "learning_rate": 0.00011782608695652173, "loss": 0.3494, "step": 1780 }, { "epoch": 6.26, "learning_rate": 0.00011673913043478258, "loss": 0.3701, "step": 1790 }, { "epoch": 6.29, "learning_rate": 0.00011565217391304347, "loss": 0.347, "step": 1800 }, { "epoch": 6.29, "eval_loss": 0.42197009921073914, "eval_runtime": 111.9078, "eval_samples_per_second": 12.966, "eval_steps_per_second": 0.813, "step": 1800 }, { "epoch": 6.33, "learning_rate": 0.00011456521739130434, "loss": 0.3484, "step": 1810 }, { "epoch": 6.36, "learning_rate": 0.00011347826086956521, "loss": 0.3328, "step": 1820 }, { "epoch": 6.4, "learning_rate": 0.00011239130434782608, "loss": 0.3444, "step": 1830 }, { "epoch": 6.43, "learning_rate": 0.00011130434782608695, "loss": 0.321, "step": 1840 }, { "epoch": 6.47, "learning_rate": 0.00011021739130434782, "loss": 0.4089, "step": 1850 }, { "epoch": 6.5, "learning_rate": 0.00010913043478260867, "loss": 0.335, "step": 1860 }, { "epoch": 6.54, "learning_rate": 0.00010804347826086956, "loss": 0.376, "step": 1870 }, { "epoch": 6.57, "learning_rate": 0.00010695652173913043, "loss": 0.3324, "step": 1880 }, { "epoch": 6.61, "learning_rate": 0.0001058695652173913, "loss": 0.3359, "step": 1890 }, { "epoch": 6.64, "learning_rate": 0.00010478260869565216, "loss": 0.3708, "step": 1900 }, { "epoch": 6.64, "eval_loss": 0.4211980998516083, "eval_runtime": 113.1099, "eval_samples_per_second": 12.828, "eval_steps_per_second": 0.805, "step": 1900 }, { "epoch": 6.68, "learning_rate": 0.00010369565217391303, "loss": 0.3512, "step": 1910 }, { "epoch": 6.71, "learning_rate": 0.0001026086956521739, "loss": 0.3381, "step": 1920 }, { "epoch": 6.75, "learning_rate": 0.00010152173913043479, "loss": 0.2925, "step": 1930 }, { "epoch": 6.78, "learning_rate": 0.00010043478260869564, "loss": 0.3805, "step": 1940 }, { "epoch": 6.82, "learning_rate": 9.934782608695651e-05, "loss": 0.3803, "step": 1950 }, { "epoch": 6.85, "learning_rate": 9.826086956521738e-05, "loss": 0.3499, "step": 1960 }, { "epoch": 6.89, "learning_rate": 9.717391304347825e-05, "loss": 0.3419, "step": 1970 }, { "epoch": 6.92, "learning_rate": 9.608695652173912e-05, "loss": 0.2918, "step": 1980 }, { "epoch": 6.96, "learning_rate": 9.499999999999999e-05, "loss": 0.3602, "step": 1990 }, { "epoch": 6.99, "learning_rate": 9.391304347826087e-05, "loss": 0.3788, "step": 2000 }, { "epoch": 6.99, "eval_loss": 0.419994056224823, "eval_runtime": 111.5298, "eval_samples_per_second": 13.01, "eval_steps_per_second": 0.816, "step": 2000 }, { "epoch": 7.03, "learning_rate": 9.282608695652173e-05, "loss": 0.3368, "step": 2010 }, { "epoch": 7.06, "learning_rate": 9.17391304347826e-05, "loss": 0.3186, "step": 2020 }, { "epoch": 7.1, "learning_rate": 9.065217391304346e-05, "loss": 0.3134, "step": 2030 }, { "epoch": 7.13, "learning_rate": 8.956521739130434e-05, "loss": 0.3021, "step": 2040 }, { "epoch": 7.17, "learning_rate": 8.847826086956521e-05, "loss": 0.3168, "step": 2050 }, { "epoch": 7.2, "learning_rate": 8.739130434782608e-05, "loss": 0.2573, "step": 2060 }, { "epoch": 7.24, "learning_rate": 8.630434782608696e-05, "loss": 0.3107, "step": 2070 }, { "epoch": 7.27, "learning_rate": 8.521739130434782e-05, "loss": 0.2961, "step": 2080 }, { "epoch": 7.31, "learning_rate": 8.413043478260869e-05, "loss": 0.3487, "step": 2090 }, { "epoch": 7.34, "learning_rate": 8.304347826086954e-05, "loss": 0.337, "step": 2100 }, { "epoch": 7.34, "eval_loss": 0.41391661763191223, "eval_runtime": 111.525, "eval_samples_per_second": 13.011, "eval_steps_per_second": 0.816, "step": 2100 }, { "epoch": 7.38, "learning_rate": 8.195652173913043e-05, "loss": 0.3379, "step": 2110 }, { "epoch": 7.41, "learning_rate": 8.08695652173913e-05, "loss": 0.3168, "step": 2120 }, { "epoch": 7.45, "learning_rate": 7.978260869565217e-05, "loss": 0.3339, "step": 2130 }, { "epoch": 7.48, "learning_rate": 7.869565217391304e-05, "loss": 0.3712, "step": 2140 }, { "epoch": 7.52, "learning_rate": 7.76086956521739e-05, "loss": 0.2807, "step": 2150 }, { "epoch": 7.55, "learning_rate": 7.652173913043478e-05, "loss": 0.3018, "step": 2160 }, { "epoch": 7.59, "learning_rate": 7.543478260869563e-05, "loss": 0.311, "step": 2170 }, { "epoch": 7.62, "learning_rate": 7.434782608695652e-05, "loss": 0.309, "step": 2180 }, { "epoch": 7.66, "learning_rate": 7.326086956521738e-05, "loss": 0.3175, "step": 2190 }, { "epoch": 7.69, "learning_rate": 7.217391304347825e-05, "loss": 0.3045, "step": 2200 }, { "epoch": 7.69, "eval_loss": 0.4140127897262573, "eval_runtime": 111.986, "eval_samples_per_second": 12.957, "eval_steps_per_second": 0.813, "step": 2200 }, { "epoch": 7.73, "learning_rate": 7.108695652173912e-05, "loss": 0.3065, "step": 2210 }, { "epoch": 7.76, "learning_rate": 7e-05, "loss": 0.2784, "step": 2220 }, { "epoch": 7.8, "learning_rate": 6.891304347826086e-05, "loss": 0.3367, "step": 2230 }, { "epoch": 7.83, "learning_rate": 6.782608695652173e-05, "loss": 0.3176, "step": 2240 }, { "epoch": 7.87, "learning_rate": 6.67391304347826e-05, "loss": 0.3126, "step": 2250 }, { "epoch": 7.9, "learning_rate": 6.565217391304347e-05, "loss": 0.3586, "step": 2260 }, { "epoch": 7.94, "learning_rate": 6.456521739130434e-05, "loss": 0.3199, "step": 2270 }, { "epoch": 7.97, "learning_rate": 6.347826086956521e-05, "loss": 0.3302, "step": 2280 }, { "epoch": 8.01, "learning_rate": 6.239130434782608e-05, "loss": 0.346, "step": 2290 }, { "epoch": 8.04, "learning_rate": 6.130434782608695e-05, "loss": 0.2547, "step": 2300 }, { "epoch": 8.04, "eval_loss": 0.41661086678504944, "eval_runtime": 111.4943, "eval_samples_per_second": 13.014, "eval_steps_per_second": 0.816, "step": 2300 }, { "epoch": 8.08, "learning_rate": 6.021739130434782e-05, "loss": 0.2923, "step": 2310 }, { "epoch": 8.11, "learning_rate": 5.913043478260869e-05, "loss": 0.2889, "step": 2320 }, { "epoch": 8.15, "learning_rate": 5.804347826086956e-05, "loss": 0.2858, "step": 2330 }, { "epoch": 8.18, "learning_rate": 5.695652173913043e-05, "loss": 0.3082, "step": 2340 }, { "epoch": 8.22, "learning_rate": 5.58695652173913e-05, "loss": 0.2721, "step": 2350 }, { "epoch": 8.25, "learning_rate": 5.478260869565217e-05, "loss": 0.2911, "step": 2360 }, { "epoch": 8.29, "learning_rate": 5.3695652173913046e-05, "loss": 0.3015, "step": 2370 }, { "epoch": 8.32, "learning_rate": 5.260869565217391e-05, "loss": 0.3471, "step": 2380 }, { "epoch": 8.36, "learning_rate": 5.152173913043478e-05, "loss": 0.3053, "step": 2390 }, { "epoch": 8.39, "learning_rate": 5.043478260869565e-05, "loss": 0.2584, "step": 2400 }, { "epoch": 8.39, "eval_loss": 0.408309668302536, "eval_runtime": 111.5385, "eval_samples_per_second": 13.009, "eval_steps_per_second": 0.816, "step": 2400 }, { "epoch": 8.43, "learning_rate": 4.934782608695652e-05, "loss": 0.3029, "step": 2410 }, { "epoch": 8.46, "learning_rate": 4.826086956521738e-05, "loss": 0.2933, "step": 2420 }, { "epoch": 8.5, "learning_rate": 4.717391304347826e-05, "loss": 0.2807, "step": 2430 }, { "epoch": 8.53, "learning_rate": 4.6086956521739126e-05, "loss": 0.2899, "step": 2440 }, { "epoch": 8.57, "learning_rate": 4.4999999999999996e-05, "loss": 0.229, "step": 2450 }, { "epoch": 8.6, "learning_rate": 4.3913043478260866e-05, "loss": 0.2919, "step": 2460 }, { "epoch": 8.64, "learning_rate": 4.2826086956521735e-05, "loss": 0.2989, "step": 2470 }, { "epoch": 8.67, "learning_rate": 4.1739130434782605e-05, "loss": 0.2919, "step": 2480 }, { "epoch": 8.71, "learning_rate": 4.065217391304348e-05, "loss": 0.2988, "step": 2490 }, { "epoch": 8.74, "learning_rate": 3.9565217391304344e-05, "loss": 0.2673, "step": 2500 }, { "epoch": 8.74, "eval_loss": 0.39945822954177856, "eval_runtime": 111.4334, "eval_samples_per_second": 13.021, "eval_steps_per_second": 0.817, "step": 2500 }, { "epoch": 8.77, "learning_rate": 3.8478260869565214e-05, "loss": 0.2668, "step": 2510 }, { "epoch": 8.81, "learning_rate": 3.7391304347826084e-05, "loss": 0.3159, "step": 2520 }, { "epoch": 8.84, "learning_rate": 3.630434782608695e-05, "loss": 0.3037, "step": 2530 }, { "epoch": 8.88, "learning_rate": 3.521739130434782e-05, "loss": 0.2869, "step": 2540 }, { "epoch": 8.91, "learning_rate": 3.413043478260869e-05, "loss": 0.2792, "step": 2550 }, { "epoch": 8.95, "learning_rate": 3.304347826086956e-05, "loss": 0.2939, "step": 2560 }, { "epoch": 8.98, "learning_rate": 3.195652173913043e-05, "loss": 0.2781, "step": 2570 }, { "epoch": 9.02, "learning_rate": 3.08695652173913e-05, "loss": 0.3154, "step": 2580 }, { "epoch": 9.06, "learning_rate": 2.978260869565217e-05, "loss": 0.301, "step": 2590 }, { "epoch": 9.09, "learning_rate": 2.869565217391304e-05, "loss": 0.2765, "step": 2600 }, { "epoch": 9.09, "eval_loss": 0.4026023745536804, "eval_runtime": 111.3595, "eval_samples_per_second": 13.03, "eval_steps_per_second": 0.817, "step": 2600 }, { "epoch": 9.13, "learning_rate": 2.760869565217391e-05, "loss": 0.2801, "step": 2610 }, { "epoch": 9.16, "learning_rate": 2.652173913043478e-05, "loss": 0.263, "step": 2620 }, { "epoch": 9.2, "learning_rate": 2.543478260869565e-05, "loss": 0.2586, "step": 2630 }, { "epoch": 9.23, "learning_rate": 2.4347826086956516e-05, "loss": 0.2496, "step": 2640 }, { "epoch": 9.27, "learning_rate": 2.3260869565217393e-05, "loss": 0.269, "step": 2650 }, { "epoch": 9.3, "learning_rate": 2.217391304347826e-05, "loss": 0.296, "step": 2660 }, { "epoch": 9.34, "learning_rate": 2.108695652173913e-05, "loss": 0.2531, "step": 2670 }, { "epoch": 9.37, "learning_rate": 1.9999999999999998e-05, "loss": 0.2871, "step": 2680 }, { "epoch": 9.4, "learning_rate": 1.8913043478260868e-05, "loss": 0.2616, "step": 2690 }, { "epoch": 9.44, "learning_rate": 1.7826086956521738e-05, "loss": 0.2453, "step": 2700 }, { "epoch": 9.44, "eval_loss": 0.4079287052154541, "eval_runtime": 111.4001, "eval_samples_per_second": 13.025, "eval_steps_per_second": 0.817, "step": 2700 }, { "epoch": 9.47, "learning_rate": 1.6739130434782607e-05, "loss": 0.3043, "step": 2710 }, { "epoch": 9.51, "learning_rate": 1.5652173913043477e-05, "loss": 0.2297, "step": 2720 }, { "epoch": 9.54, "learning_rate": 1.4565217391304347e-05, "loss": 0.2844, "step": 2730 }, { "epoch": 9.58, "learning_rate": 1.3478260869565216e-05, "loss": 0.2485, "step": 2740 }, { "epoch": 9.61, "learning_rate": 1.2391304347826086e-05, "loss": 0.2695, "step": 2750 }, { "epoch": 9.65, "learning_rate": 1.1304347826086956e-05, "loss": 0.3031, "step": 2760 }, { "epoch": 9.68, "learning_rate": 1.0217391304347825e-05, "loss": 0.2716, "step": 2770 }, { "epoch": 9.72, "learning_rate": 9.130434782608695e-06, "loss": 0.2424, "step": 2780 }, { "epoch": 9.75, "learning_rate": 8.043478260869565e-06, "loss": 0.2652, "step": 2790 }, { "epoch": 9.79, "learning_rate": 6.956521739130434e-06, "loss": 0.2883, "step": 2800 }, { "epoch": 9.79, "eval_loss": 0.4038572311401367, "eval_runtime": 111.7946, "eval_samples_per_second": 12.979, "eval_steps_per_second": 0.814, "step": 2800 }, { "epoch": 9.82, "learning_rate": 5.869565217391305e-06, "loss": 0.2743, "step": 2810 }, { "epoch": 9.86, "learning_rate": 4.7826086956521735e-06, "loss": 0.2823, "step": 2820 }, { "epoch": 9.89, "learning_rate": 3.695652173913043e-06, "loss": 0.2766, "step": 2830 }, { "epoch": 9.93, "learning_rate": 2.608695652173913e-06, "loss": 0.2881, "step": 2840 }, { "epoch": 9.96, "learning_rate": 1.5217391304347823e-06, "loss": 0.3219, "step": 2850 }, { "epoch": 10.0, "learning_rate": 4.3478260869565214e-07, "loss": 0.294, "step": 2860 }, { "epoch": 10.0, "step": 2860, "total_flos": 6.827433558323335e+18, "train_loss": 1.1227069218675574, "train_runtime": 9275.3367, "train_samples_per_second": 4.936, "train_steps_per_second": 0.308 }, { "epoch": 10.0, "eval_loss": 0.40599533915519714, "eval_runtime": 111.6061, "eval_samples_per_second": 13.001, "eval_steps_per_second": 0.815, "step": 2860 } ], "max_steps": 2860, "num_train_epochs": 10, "total_flos": 6.827433558323335e+18, "trial_name": null, "trial_params": null }