{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.997150997150996, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 9.971428571428571e-05, "loss": 1.6461, "step": 10 }, { "epoch": 0.11, "learning_rate": 9.942857142857144e-05, "loss": 1.5964, "step": 20 }, { "epoch": 0.17, "learning_rate": 9.914285714285715e-05, "loss": 1.52, "step": 30 }, { "epoch": 0.23, "learning_rate": 9.885714285714286e-05, "loss": 1.5243, "step": 40 }, { "epoch": 0.28, "learning_rate": 9.857142857142858e-05, "loss": 1.4323, "step": 50 }, { "epoch": 0.34, "learning_rate": 9.831428571428572e-05, "loss": 1.6594, "step": 60 }, { "epoch": 0.4, "learning_rate": 9.802857142857143e-05, "loss": 1.472, "step": 70 }, { "epoch": 0.46, "learning_rate": 9.774285714285715e-05, "loss": 1.4544, "step": 80 }, { "epoch": 0.51, "learning_rate": 9.745714285714286e-05, "loss": 1.4888, "step": 90 }, { "epoch": 0.57, "learning_rate": 9.72e-05, "loss": 1.3526, "step": 100 }, { "epoch": 0.63, "learning_rate": 9.691428571428573e-05, "loss": 1.3166, "step": 110 }, { "epoch": 0.68, "learning_rate": 9.662857142857144e-05, "loss": 1.283, "step": 120 }, { "epoch": 0.74, "learning_rate": 9.634285714285715e-05, "loss": 1.3889, "step": 130 }, { "epoch": 0.8, "learning_rate": 9.605714285714286e-05, "loss": 1.2688, "step": 140 }, { "epoch": 0.85, "learning_rate": 9.577142857142858e-05, "loss": 1.047, "step": 150 }, { "epoch": 0.91, "learning_rate": 9.548571428571429e-05, "loss": 1.3696, "step": 160 }, { "epoch": 0.97, "learning_rate": 9.522857142857143e-05, "loss": 1.3446, "step": 170 }, { "epoch": 1.0, "eval_accuracy": 0.4554455578327179, "eval_loss": 1.536527156829834, "eval_runtime": 11.3063, "eval_samples_per_second": 17.866, "eval_steps_per_second": 8.933, "step": 175 }, { "epoch": 1.03, "learning_rate": 9.494285714285714e-05, "loss": 0.8856, "step": 180 }, { "epoch": 1.09, "learning_rate": 9.465714285714286e-05, "loss": 1.2261, "step": 190 }, { "epoch": 1.14, "learning_rate": 9.437142857142857e-05, "loss": 1.1836, "step": 200 }, { "epoch": 1.2, "learning_rate": 9.40857142857143e-05, "loss": 1.3243, "step": 210 }, { "epoch": 1.26, "learning_rate": 9.38e-05, "loss": 0.747, "step": 220 }, { "epoch": 1.31, "learning_rate": 9.351428571428573e-05, "loss": 0.9695, "step": 230 }, { "epoch": 1.37, "learning_rate": 9.322857142857144e-05, "loss": 1.0319, "step": 240 }, { "epoch": 1.43, "learning_rate": 9.294285714285714e-05, "loss": 1.0031, "step": 250 }, { "epoch": 1.48, "learning_rate": 9.265714285714287e-05, "loss": 1.1495, "step": 260 }, { "epoch": 1.54, "learning_rate": 9.237142857142858e-05, "loss": 0.9622, "step": 270 }, { "epoch": 1.6, "learning_rate": 9.208571428571429e-05, "loss": 0.9744, "step": 280 }, { "epoch": 1.66, "learning_rate": 9.180000000000001e-05, "loss": 1.1524, "step": 290 }, { "epoch": 1.71, "learning_rate": 9.151428571428572e-05, "loss": 0.9051, "step": 300 }, { "epoch": 1.77, "learning_rate": 9.122857142857143e-05, "loss": 1.1604, "step": 310 }, { "epoch": 1.83, "learning_rate": 9.094285714285715e-05, "loss": 1.2363, "step": 320 }, { "epoch": 1.88, "learning_rate": 9.065714285714286e-05, "loss": 1.139, "step": 330 }, { "epoch": 1.94, "learning_rate": 9.037142857142857e-05, "loss": 1.1337, "step": 340 }, { "epoch": 2.0, "learning_rate": 9.008571428571429e-05, "loss": 0.9654, "step": 350 }, { "epoch": 2.0, "eval_accuracy": 0.6237623691558838, "eval_loss": 1.0576841831207275, "eval_runtime": 11.4424, "eval_samples_per_second": 17.654, "eval_steps_per_second": 8.827, "step": 350 }, { "epoch": 2.06, "learning_rate": 8.98e-05, "loss": 1.0752, "step": 360 }, { "epoch": 2.11, "learning_rate": 8.951428571428572e-05, "loss": 0.8605, "step": 370 }, { "epoch": 2.17, "learning_rate": 8.922857142857143e-05, "loss": 0.8216, "step": 380 }, { "epoch": 2.23, "learning_rate": 8.894285714285716e-05, "loss": 1.2676, "step": 390 }, { "epoch": 2.28, "learning_rate": 8.865714285714287e-05, "loss": 0.9299, "step": 400 }, { "epoch": 2.34, "learning_rate": 8.837142857142857e-05, "loss": 0.9775, "step": 410 }, { "epoch": 2.4, "learning_rate": 8.80857142857143e-05, "loss": 0.7707, "step": 420 }, { "epoch": 2.46, "learning_rate": 8.78e-05, "loss": 0.8136, "step": 430 }, { "epoch": 2.51, "learning_rate": 8.751428571428572e-05, "loss": 1.0706, "step": 440 }, { "epoch": 2.57, "learning_rate": 8.722857142857144e-05, "loss": 0.8833, "step": 450 }, { "epoch": 2.63, "learning_rate": 8.694285714285715e-05, "loss": 0.9111, "step": 460 }, { "epoch": 2.68, "learning_rate": 8.665714285714286e-05, "loss": 0.6969, "step": 470 }, { "epoch": 2.74, "learning_rate": 8.637142857142858e-05, "loss": 1.1425, "step": 480 }, { "epoch": 2.8, "learning_rate": 8.608571428571429e-05, "loss": 1.0298, "step": 490 }, { "epoch": 2.85, "learning_rate": 8.58e-05, "loss": 0.91, "step": 500 }, { "epoch": 2.91, "learning_rate": 8.551428571428571e-05, "loss": 0.8342, "step": 510 }, { "epoch": 2.97, "learning_rate": 8.522857142857143e-05, "loss": 0.8069, "step": 520 }, { "epoch": 3.0, "eval_accuracy": 0.6386138796806335, "eval_loss": 0.9259141683578491, "eval_runtime": 11.2435, "eval_samples_per_second": 17.966, "eval_steps_per_second": 8.983, "step": 525 }, { "epoch": 3.03, "learning_rate": 8.494285714285714e-05, "loss": 0.8804, "step": 530 }, { "epoch": 3.09, "learning_rate": 8.465714285714286e-05, "loss": 0.7882, "step": 540 }, { "epoch": 3.14, "learning_rate": 8.437142857142859e-05, "loss": 0.5915, "step": 550 }, { "epoch": 3.2, "learning_rate": 8.40857142857143e-05, "loss": 0.5102, "step": 560 }, { "epoch": 3.26, "learning_rate": 8.38e-05, "loss": 0.6473, "step": 570 }, { "epoch": 3.31, "learning_rate": 8.351428571428573e-05, "loss": 0.7545, "step": 580 }, { "epoch": 3.37, "learning_rate": 8.322857142857144e-05, "loss": 0.5438, "step": 590 }, { "epoch": 3.43, "learning_rate": 8.294285714285715e-05, "loss": 0.8545, "step": 600 }, { "epoch": 3.48, "learning_rate": 8.265714285714287e-05, "loss": 0.563, "step": 610 }, { "epoch": 3.54, "learning_rate": 8.237142857142858e-05, "loss": 0.7048, "step": 620 }, { "epoch": 3.6, "learning_rate": 8.208571428571429e-05, "loss": 1.019, "step": 630 }, { "epoch": 3.66, "learning_rate": 8.18e-05, "loss": 0.5084, "step": 640 }, { "epoch": 3.71, "learning_rate": 8.151428571428572e-05, "loss": 0.7297, "step": 650 }, { "epoch": 3.77, "learning_rate": 8.122857142857143e-05, "loss": 0.4933, "step": 660 }, { "epoch": 3.83, "learning_rate": 8.094285714285714e-05, "loss": 0.5224, "step": 670 }, { "epoch": 3.88, "learning_rate": 8.065714285714286e-05, "loss": 0.5695, "step": 680 }, { "epoch": 3.94, "learning_rate": 8.037142857142857e-05, "loss": 0.4603, "step": 690 }, { "epoch": 4.0, "learning_rate": 8.008571428571429e-05, "loss": 1.1203, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.6831682920455933, "eval_loss": 1.0746746063232422, "eval_runtime": 11.8499, "eval_samples_per_second": 17.047, "eval_steps_per_second": 8.523, "step": 700 }, { "epoch": 4.06, "learning_rate": 7.98e-05, "loss": 0.6816, "step": 710 }, { "epoch": 4.11, "learning_rate": 7.951428571428572e-05, "loss": 0.629, "step": 720 }, { "epoch": 4.17, "learning_rate": 7.922857142857143e-05, "loss": 0.6881, "step": 730 }, { "epoch": 4.23, "learning_rate": 7.894285714285716e-05, "loss": 0.8816, "step": 740 }, { "epoch": 4.28, "learning_rate": 7.865714285714287e-05, "loss": 0.4466, "step": 750 }, { "epoch": 4.34, "learning_rate": 7.837142857142858e-05, "loss": 0.721, "step": 760 }, { "epoch": 4.4, "learning_rate": 7.808571428571428e-05, "loss": 0.8953, "step": 770 }, { "epoch": 4.46, "learning_rate": 7.780000000000001e-05, "loss": 0.4612, "step": 780 }, { "epoch": 4.51, "learning_rate": 7.751428571428572e-05, "loss": 0.5196, "step": 790 }, { "epoch": 4.57, "learning_rate": 7.722857142857143e-05, "loss": 0.62, "step": 800 }, { "epoch": 4.63, "learning_rate": 7.694285714285715e-05, "loss": 0.3506, "step": 810 }, { "epoch": 4.68, "learning_rate": 7.665714285714286e-05, "loss": 0.2639, "step": 820 }, { "epoch": 4.74, "learning_rate": 7.637142857142857e-05, "loss": 0.9862, "step": 830 }, { "epoch": 4.8, "learning_rate": 7.608571428571429e-05, "loss": 0.6958, "step": 840 }, { "epoch": 4.85, "learning_rate": 7.58e-05, "loss": 0.5734, "step": 850 }, { "epoch": 4.91, "learning_rate": 7.551428571428571e-05, "loss": 0.6894, "step": 860 }, { "epoch": 4.97, "learning_rate": 7.522857142857143e-05, "loss": 0.3681, "step": 870 }, { "epoch": 5.0, "eval_accuracy": 0.7029703259468079, "eval_loss": 1.0062588453292847, "eval_runtime": 11.6332, "eval_samples_per_second": 17.364, "eval_steps_per_second": 8.682, "step": 875 }, { "epoch": 5.03, "learning_rate": 7.494285714285715e-05, "loss": 0.4552, "step": 880 }, { "epoch": 5.09, "learning_rate": 7.465714285714286e-05, "loss": 0.3066, "step": 890 }, { "epoch": 5.14, "learning_rate": 7.437142857142857e-05, "loss": 0.691, "step": 900 }, { "epoch": 5.2, "learning_rate": 7.40857142857143e-05, "loss": 0.5948, "step": 910 }, { "epoch": 5.26, "learning_rate": 7.38e-05, "loss": 0.2739, "step": 920 }, { "epoch": 5.31, "learning_rate": 7.351428571428571e-05, "loss": 0.29, "step": 930 }, { "epoch": 5.37, "learning_rate": 7.322857142857144e-05, "loss": 0.4972, "step": 940 }, { "epoch": 5.43, "learning_rate": 7.294285714285715e-05, "loss": 0.4721, "step": 950 }, { "epoch": 5.48, "learning_rate": 7.265714285714286e-05, "loss": 0.346, "step": 960 }, { "epoch": 5.54, "learning_rate": 7.237142857142858e-05, "loss": 0.6904, "step": 970 }, { "epoch": 5.6, "learning_rate": 7.211428571428572e-05, "loss": 0.6566, "step": 980 }, { "epoch": 5.66, "learning_rate": 7.182857142857143e-05, "loss": 0.3559, "step": 990 }, { "epoch": 5.71, "learning_rate": 7.154285714285714e-05, "loss": 0.2403, "step": 1000 }, { "epoch": 5.77, "learning_rate": 7.125714285714286e-05, "loss": 0.3333, "step": 1010 }, { "epoch": 5.83, "learning_rate": 7.097142857142857e-05, "loss": 0.6078, "step": 1020 }, { "epoch": 5.88, "learning_rate": 7.06857142857143e-05, "loss": 0.5469, "step": 1030 }, { "epoch": 5.94, "learning_rate": 7.04e-05, "loss": 0.327, "step": 1040 }, { "epoch": 6.0, "learning_rate": 7.011428571428573e-05, "loss": 0.6719, "step": 1050 }, { "epoch": 6.0, "eval_accuracy": 0.6930692791938782, "eval_loss": 1.4494578838348389, "eval_runtime": 12.9771, "eval_samples_per_second": 15.566, "eval_steps_per_second": 7.783, "step": 1050 }, { "epoch": 6.06, "learning_rate": 6.982857142857144e-05, "loss": 0.6276, "step": 1060 }, { "epoch": 6.11, "learning_rate": 6.954285714285714e-05, "loss": 0.2453, "step": 1070 }, { "epoch": 6.17, "learning_rate": 6.925714285714287e-05, "loss": 0.4183, "step": 1080 }, { "epoch": 6.23, "learning_rate": 6.897142857142858e-05, "loss": 0.5071, "step": 1090 }, { "epoch": 6.28, "learning_rate": 6.868571428571429e-05, "loss": 0.3372, "step": 1100 }, { "epoch": 6.34, "learning_rate": 6.840000000000001e-05, "loss": 0.3326, "step": 1110 }, { "epoch": 6.4, "learning_rate": 6.811428571428572e-05, "loss": 0.6091, "step": 1120 }, { "epoch": 6.46, "learning_rate": 6.782857142857143e-05, "loss": 0.387, "step": 1130 }, { "epoch": 6.51, "learning_rate": 6.754285714285714e-05, "loss": 0.3271, "step": 1140 }, { "epoch": 6.57, "learning_rate": 6.725714285714286e-05, "loss": 0.4024, "step": 1150 }, { "epoch": 6.63, "learning_rate": 6.697142857142857e-05, "loss": 0.5858, "step": 1160 }, { "epoch": 6.68, "learning_rate": 6.668571428571428e-05, "loss": 0.3328, "step": 1170 }, { "epoch": 6.74, "learning_rate": 6.64e-05, "loss": 0.5711, "step": 1180 }, { "epoch": 6.8, "learning_rate": 6.611428571428572e-05, "loss": 0.5658, "step": 1190 }, { "epoch": 6.85, "learning_rate": 6.582857142857143e-05, "loss": 0.3108, "step": 1200 }, { "epoch": 6.91, "learning_rate": 6.554285714285716e-05, "loss": 0.4348, "step": 1210 }, { "epoch": 6.97, "learning_rate": 6.525714285714287e-05, "loss": 0.646, "step": 1220 }, { "epoch": 7.0, "eval_accuracy": 0.6930692791938782, "eval_loss": 1.4014908075332642, "eval_runtime": 11.3752, "eval_samples_per_second": 17.758, "eval_steps_per_second": 8.879, "step": 1225 }, { "epoch": 7.03, "learning_rate": 6.497142857142857e-05, "loss": 0.6164, "step": 1230 }, { "epoch": 7.09, "learning_rate": 6.46857142857143e-05, "loss": 0.1188, "step": 1240 }, { "epoch": 7.14, "learning_rate": 6.440000000000001e-05, "loss": 0.7997, "step": 1250 }, { "epoch": 7.2, "learning_rate": 6.411428571428572e-05, "loss": 0.3099, "step": 1260 }, { "epoch": 7.26, "learning_rate": 6.382857142857143e-05, "loss": 0.1419, "step": 1270 }, { "epoch": 7.31, "learning_rate": 6.354285714285715e-05, "loss": 0.3644, "step": 1280 }, { "epoch": 7.37, "learning_rate": 6.325714285714286e-05, "loss": 0.2829, "step": 1290 }, { "epoch": 7.43, "learning_rate": 6.297142857142857e-05, "loss": 0.8076, "step": 1300 }, { "epoch": 7.48, "learning_rate": 6.268571428571429e-05, "loss": 0.4266, "step": 1310 }, { "epoch": 7.54, "learning_rate": 6.24e-05, "loss": 0.3325, "step": 1320 }, { "epoch": 7.6, "learning_rate": 6.211428571428571e-05, "loss": 0.8035, "step": 1330 }, { "epoch": 7.66, "learning_rate": 6.182857142857143e-05, "loss": 0.1186, "step": 1340 }, { "epoch": 7.71, "learning_rate": 6.154285714285714e-05, "loss": 0.5125, "step": 1350 }, { "epoch": 7.77, "learning_rate": 6.125714285714286e-05, "loss": 0.2002, "step": 1360 }, { "epoch": 7.83, "learning_rate": 6.097142857142858e-05, "loss": 0.147, "step": 1370 }, { "epoch": 7.88, "learning_rate": 6.068571428571429e-05, "loss": 0.4292, "step": 1380 }, { "epoch": 7.94, "learning_rate": 6.04e-05, "loss": 0.7013, "step": 1390 }, { "epoch": 8.0, "learning_rate": 6.0114285714285714e-05, "loss": 0.3072, "step": 1400 }, { "epoch": 8.0, "eval_accuracy": 0.6534653306007385, "eval_loss": 1.5413367748260498, "eval_runtime": 11.4039, "eval_samples_per_second": 17.713, "eval_steps_per_second": 8.857, "step": 1400 }, { "epoch": 8.06, "learning_rate": 5.9828571428571437e-05, "loss": 0.1924, "step": 1410 }, { "epoch": 8.11, "learning_rate": 5.9542857142857146e-05, "loss": 0.2838, "step": 1420 }, { "epoch": 8.17, "learning_rate": 5.9257142857142855e-05, "loss": 0.1783, "step": 1430 }, { "epoch": 8.23, "learning_rate": 5.897142857142858e-05, "loss": 0.2159, "step": 1440 }, { "epoch": 8.28, "learning_rate": 5.868571428571429e-05, "loss": 0.3815, "step": 1450 }, { "epoch": 8.34, "learning_rate": 5.8399999999999997e-05, "loss": 0.3401, "step": 1460 }, { "epoch": 8.4, "learning_rate": 5.811428571428572e-05, "loss": 0.2045, "step": 1470 }, { "epoch": 8.46, "learning_rate": 5.782857142857143e-05, "loss": 0.0864, "step": 1480 }, { "epoch": 8.51, "learning_rate": 5.7542857142857145e-05, "loss": 0.2836, "step": 1490 }, { "epoch": 8.57, "learning_rate": 5.725714285714287e-05, "loss": 0.1675, "step": 1500 }, { "epoch": 8.63, "learning_rate": 5.697142857142858e-05, "loss": 0.4174, "step": 1510 }, { "epoch": 8.68, "learning_rate": 5.6685714285714286e-05, "loss": 0.5875, "step": 1520 }, { "epoch": 8.74, "learning_rate": 5.6399999999999995e-05, "loss": 0.1532, "step": 1530 }, { "epoch": 8.8, "learning_rate": 5.611428571428572e-05, "loss": 0.2927, "step": 1540 }, { "epoch": 8.85, "learning_rate": 5.582857142857143e-05, "loss": 0.1527, "step": 1550 }, { "epoch": 8.91, "learning_rate": 5.5542857142857143e-05, "loss": 0.2842, "step": 1560 }, { "epoch": 8.97, "learning_rate": 5.525714285714286e-05, "loss": 0.3331, "step": 1570 }, { "epoch": 9.0, "eval_accuracy": 0.6930692791938782, "eval_loss": 1.759947657585144, "eval_runtime": 12.7286, "eval_samples_per_second": 15.87, "eval_steps_per_second": 7.935, "step": 1575 }, { "epoch": 9.03, "learning_rate": 5.4971428571428576e-05, "loss": 0.3041, "step": 1580 }, { "epoch": 9.09, "learning_rate": 5.4685714285714285e-05, "loss": 0.2894, "step": 1590 }, { "epoch": 9.14, "learning_rate": 5.440000000000001e-05, "loss": 0.2129, "step": 1600 }, { "epoch": 9.2, "learning_rate": 5.411428571428572e-05, "loss": 0.3424, "step": 1610 }, { "epoch": 9.26, "learning_rate": 5.3828571428571426e-05, "loss": 0.0508, "step": 1620 }, { "epoch": 9.31, "learning_rate": 5.354285714285715e-05, "loss": 0.3036, "step": 1630 }, { "epoch": 9.37, "learning_rate": 5.325714285714286e-05, "loss": 0.4638, "step": 1640 }, { "epoch": 9.43, "learning_rate": 5.2971428571428574e-05, "loss": 0.3329, "step": 1650 }, { "epoch": 9.48, "learning_rate": 5.2685714285714284e-05, "loss": 0.0781, "step": 1660 }, { "epoch": 9.54, "learning_rate": 5.2400000000000007e-05, "loss": 0.1371, "step": 1670 }, { "epoch": 9.6, "learning_rate": 5.2114285714285716e-05, "loss": 0.244, "step": 1680 }, { "epoch": 9.66, "learning_rate": 5.1828571428571425e-05, "loss": 0.4502, "step": 1690 }, { "epoch": 9.71, "learning_rate": 5.154285714285715e-05, "loss": 0.4222, "step": 1700 }, { "epoch": 9.77, "learning_rate": 5.125714285714286e-05, "loss": 0.4389, "step": 1710 }, { "epoch": 9.83, "learning_rate": 5.097142857142857e-05, "loss": 0.3595, "step": 1720 }, { "epoch": 9.88, "learning_rate": 5.068571428571429e-05, "loss": 0.2946, "step": 1730 }, { "epoch": 9.94, "learning_rate": 5.0400000000000005e-05, "loss": 0.3272, "step": 1740 }, { "epoch": 10.0, "learning_rate": 5.0114285714285715e-05, "loss": 0.3357, "step": 1750 }, { "epoch": 10.0, "eval_accuracy": 0.7475247383117676, "eval_loss": 1.4022135734558105, "eval_runtime": 12.0818, "eval_samples_per_second": 16.719, "eval_steps_per_second": 8.36, "step": 1750 }, { "epoch": 10.06, "learning_rate": 4.982857142857143e-05, "loss": 0.1572, "step": 1760 }, { "epoch": 10.11, "learning_rate": 4.954285714285715e-05, "loss": 0.129, "step": 1770 }, { "epoch": 10.17, "learning_rate": 4.9257142857142856e-05, "loss": 0.0891, "step": 1780 }, { "epoch": 10.23, "learning_rate": 4.897142857142857e-05, "loss": 0.27, "step": 1790 }, { "epoch": 10.28, "learning_rate": 4.868571428571429e-05, "loss": 0.1743, "step": 1800 }, { "epoch": 10.34, "learning_rate": 4.8400000000000004e-05, "loss": 0.2713, "step": 1810 }, { "epoch": 10.4, "learning_rate": 4.811428571428572e-05, "loss": 0.0383, "step": 1820 }, { "epoch": 10.46, "learning_rate": 4.782857142857143e-05, "loss": 0.0301, "step": 1830 }, { "epoch": 10.51, "learning_rate": 4.7542857142857146e-05, "loss": 0.0353, "step": 1840 }, { "epoch": 10.57, "learning_rate": 4.725714285714286e-05, "loss": 0.238, "step": 1850 }, { "epoch": 10.63, "learning_rate": 4.697142857142857e-05, "loss": 0.2494, "step": 1860 }, { "epoch": 10.68, "learning_rate": 4.668571428571429e-05, "loss": 0.2638, "step": 1870 }, { "epoch": 10.74, "learning_rate": 4.64e-05, "loss": 0.2013, "step": 1880 }, { "epoch": 10.8, "learning_rate": 4.611428571428571e-05, "loss": 0.0893, "step": 1890 }, { "epoch": 10.85, "learning_rate": 4.5828571428571435e-05, "loss": 0.4689, "step": 1900 }, { "epoch": 10.91, "learning_rate": 4.5542857142857144e-05, "loss": 0.2516, "step": 1910 }, { "epoch": 10.97, "learning_rate": 4.525714285714286e-05, "loss": 0.2441, "step": 1920 }, { "epoch": 11.0, "eval_accuracy": 0.7425742745399475, "eval_loss": 1.6350345611572266, "eval_runtime": 11.7948, "eval_samples_per_second": 17.126, "eval_steps_per_second": 8.563, "step": 1925 }, { "epoch": 11.03, "learning_rate": 4.4971428571428576e-05, "loss": 0.0519, "step": 1930 }, { "epoch": 11.09, "learning_rate": 4.4685714285714286e-05, "loss": 0.1633, "step": 1940 }, { "epoch": 11.14, "learning_rate": 4.44e-05, "loss": 0.1561, "step": 1950 }, { "epoch": 11.2, "learning_rate": 4.411428571428572e-05, "loss": 0.0328, "step": 1960 }, { "epoch": 11.26, "learning_rate": 4.382857142857143e-05, "loss": 0.1487, "step": 1970 }, { "epoch": 11.31, "learning_rate": 4.354285714285714e-05, "loss": 0.05, "step": 1980 }, { "epoch": 11.37, "learning_rate": 4.325714285714286e-05, "loss": 0.2281, "step": 1990 }, { "epoch": 11.43, "learning_rate": 4.2971428571428575e-05, "loss": 0.1016, "step": 2000 }, { "epoch": 11.48, "learning_rate": 4.268571428571429e-05, "loss": 0.3914, "step": 2010 }, { "epoch": 11.54, "learning_rate": 4.24e-05, "loss": 0.5323, "step": 2020 }, { "epoch": 11.6, "learning_rate": 4.211428571428572e-05, "loss": 0.0534, "step": 2030 }, { "epoch": 11.66, "learning_rate": 4.1828571428571426e-05, "loss": 0.1185, "step": 2040 }, { "epoch": 11.71, "learning_rate": 4.154285714285714e-05, "loss": 0.104, "step": 2050 }, { "epoch": 11.77, "learning_rate": 4.125714285714286e-05, "loss": 0.2268, "step": 2060 }, { "epoch": 11.83, "learning_rate": 4.0971428571428574e-05, "loss": 0.1499, "step": 2070 }, { "epoch": 11.88, "learning_rate": 4.068571428571429e-05, "loss": 0.0944, "step": 2080 }, { "epoch": 11.94, "learning_rate": 4.0400000000000006e-05, "loss": 0.0604, "step": 2090 }, { "epoch": 12.0, "learning_rate": 4.0114285714285715e-05, "loss": 0.1318, "step": 2100 }, { "epoch": 12.0, "eval_accuracy": 0.6881188154220581, "eval_loss": 1.895858645439148, "eval_runtime": 13.4328, "eval_samples_per_second": 15.038, "eval_steps_per_second": 7.519, "step": 2100 }, { "epoch": 12.06, "learning_rate": 3.982857142857143e-05, "loss": 0.035, "step": 2110 }, { "epoch": 12.11, "learning_rate": 3.954285714285714e-05, "loss": 0.1331, "step": 2120 }, { "epoch": 12.17, "learning_rate": 3.925714285714286e-05, "loss": 0.3371, "step": 2130 }, { "epoch": 12.23, "learning_rate": 3.897142857142857e-05, "loss": 0.0372, "step": 2140 }, { "epoch": 12.28, "learning_rate": 3.868571428571429e-05, "loss": 0.1479, "step": 2150 }, { "epoch": 12.34, "learning_rate": 3.8400000000000005e-05, "loss": 0.0245, "step": 2160 }, { "epoch": 12.4, "learning_rate": 3.8114285714285714e-05, "loss": 0.1451, "step": 2170 }, { "epoch": 12.46, "learning_rate": 3.782857142857143e-05, "loss": 0.3234, "step": 2180 }, { "epoch": 12.51, "learning_rate": 3.7542857142857146e-05, "loss": 0.196, "step": 2190 }, { "epoch": 12.57, "learning_rate": 3.7257142857142856e-05, "loss": 0.1208, "step": 2200 }, { "epoch": 12.63, "learning_rate": 3.697142857142857e-05, "loss": 0.1025, "step": 2210 }, { "epoch": 12.68, "learning_rate": 3.668571428571429e-05, "loss": 0.1806, "step": 2220 }, { "epoch": 12.74, "learning_rate": 3.6400000000000004e-05, "loss": 0.1551, "step": 2230 }, { "epoch": 12.8, "learning_rate": 3.611428571428572e-05, "loss": 0.1155, "step": 2240 }, { "epoch": 12.85, "learning_rate": 3.582857142857143e-05, "loss": 0.0046, "step": 2250 }, { "epoch": 12.91, "learning_rate": 3.5542857142857145e-05, "loss": 0.0258, "step": 2260 }, { "epoch": 12.97, "learning_rate": 3.525714285714286e-05, "loss": 0.1937, "step": 2270 }, { "epoch": 13.0, "eval_accuracy": 0.7029703259468079, "eval_loss": 2.013838291168213, "eval_runtime": 12.0463, "eval_samples_per_second": 16.769, "eval_steps_per_second": 8.384, "step": 2275 }, { "epoch": 13.03, "learning_rate": 3.497142857142857e-05, "loss": 0.0615, "step": 2280 }, { "epoch": 13.09, "learning_rate": 3.468571428571429e-05, "loss": 0.1725, "step": 2290 }, { "epoch": 13.14, "learning_rate": 3.4399999999999996e-05, "loss": 0.0289, "step": 2300 }, { "epoch": 13.2, "learning_rate": 3.411428571428571e-05, "loss": 0.173, "step": 2310 }, { "epoch": 13.26, "learning_rate": 3.3828571428571435e-05, "loss": 0.0992, "step": 2320 }, { "epoch": 13.31, "learning_rate": 3.3542857142857144e-05, "loss": 0.1459, "step": 2330 }, { "epoch": 13.37, "learning_rate": 3.325714285714286e-05, "loss": 0.0768, "step": 2340 }, { "epoch": 13.43, "learning_rate": 3.2971428571428576e-05, "loss": 0.0721, "step": 2350 }, { "epoch": 13.48, "learning_rate": 3.2685714285714285e-05, "loss": 0.002, "step": 2360 }, { "epoch": 13.54, "learning_rate": 3.24e-05, "loss": 0.0738, "step": 2370 }, { "epoch": 13.6, "learning_rate": 3.211428571428571e-05, "loss": 0.007, "step": 2380 }, { "epoch": 13.66, "learning_rate": 3.182857142857143e-05, "loss": 0.1269, "step": 2390 }, { "epoch": 13.71, "learning_rate": 3.154285714285714e-05, "loss": 0.1667, "step": 2400 }, { "epoch": 13.77, "learning_rate": 3.125714285714286e-05, "loss": 0.003, "step": 2410 }, { "epoch": 13.83, "learning_rate": 3.0971428571428575e-05, "loss": 0.2794, "step": 2420 }, { "epoch": 13.88, "learning_rate": 3.068571428571429e-05, "loss": 0.0959, "step": 2430 }, { "epoch": 13.94, "learning_rate": 3.04e-05, "loss": 0.1878, "step": 2440 }, { "epoch": 14.0, "learning_rate": 3.0114285714285716e-05, "loss": 0.0164, "step": 2450 }, { "epoch": 14.0, "eval_accuracy": 0.7079207897186279, "eval_loss": 2.0977747440338135, "eval_runtime": 11.5146, "eval_samples_per_second": 17.543, "eval_steps_per_second": 8.771, "step": 2450 }, { "epoch": 14.06, "learning_rate": 2.982857142857143e-05, "loss": 0.0451, "step": 2460 }, { "epoch": 14.11, "learning_rate": 2.9542857142857145e-05, "loss": 0.1998, "step": 2470 }, { "epoch": 14.17, "learning_rate": 2.925714285714286e-05, "loss": 0.0231, "step": 2480 }, { "epoch": 14.23, "learning_rate": 2.897142857142857e-05, "loss": 0.0211, "step": 2490 }, { "epoch": 14.28, "learning_rate": 2.8685714285714286e-05, "loss": 0.2257, "step": 2500 }, { "epoch": 14.34, "learning_rate": 2.84e-05, "loss": 0.0013, "step": 2510 }, { "epoch": 14.4, "learning_rate": 2.8114285714285715e-05, "loss": 0.2982, "step": 2520 }, { "epoch": 14.46, "learning_rate": 2.782857142857143e-05, "loss": 0.2192, "step": 2530 }, { "epoch": 14.51, "learning_rate": 2.7542857142857144e-05, "loss": 0.18, "step": 2540 }, { "epoch": 14.57, "learning_rate": 2.725714285714286e-05, "loss": 0.0076, "step": 2550 }, { "epoch": 14.63, "learning_rate": 2.6971428571428576e-05, "loss": 0.0029, "step": 2560 }, { "epoch": 14.68, "learning_rate": 2.6685714285714285e-05, "loss": 0.0309, "step": 2570 }, { "epoch": 14.74, "learning_rate": 2.64e-05, "loss": 0.079, "step": 2580 }, { "epoch": 14.8, "learning_rate": 2.6114285714285714e-05, "loss": 0.0142, "step": 2590 }, { "epoch": 14.85, "learning_rate": 2.582857142857143e-05, "loss": 0.0451, "step": 2600 }, { "epoch": 14.91, "learning_rate": 2.5542857142857146e-05, "loss": 0.0036, "step": 2610 }, { "epoch": 14.97, "learning_rate": 2.5257142857142855e-05, "loss": 0.1794, "step": 2620 }, { "epoch": 15.0, "eval_accuracy": 0.7178217768669128, "eval_loss": 1.9837726354599, "eval_runtime": 11.491, "eval_samples_per_second": 17.579, "eval_steps_per_second": 8.789, "step": 2625 }, { "epoch": 15.03, "learning_rate": 2.4971428571428575e-05, "loss": 0.1493, "step": 2630 }, { "epoch": 15.09, "learning_rate": 2.4685714285714288e-05, "loss": 0.115, "step": 2640 }, { "epoch": 15.14, "learning_rate": 2.44e-05, "loss": 0.005, "step": 2650 }, { "epoch": 15.2, "learning_rate": 2.4114285714285713e-05, "loss": 0.1421, "step": 2660 }, { "epoch": 15.26, "learning_rate": 2.3828571428571432e-05, "loss": 0.0637, "step": 2670 }, { "epoch": 15.31, "learning_rate": 2.3542857142857145e-05, "loss": 0.0165, "step": 2680 }, { "epoch": 15.37, "learning_rate": 2.3257142857142858e-05, "loss": 0.0551, "step": 2690 }, { "epoch": 15.43, "learning_rate": 2.297142857142857e-05, "loss": 0.0804, "step": 2700 }, { "epoch": 15.48, "learning_rate": 2.2685714285714286e-05, "loss": 0.1237, "step": 2710 }, { "epoch": 15.54, "learning_rate": 2.2400000000000002e-05, "loss": 0.154, "step": 2720 }, { "epoch": 15.6, "learning_rate": 2.2114285714285715e-05, "loss": 0.0109, "step": 2730 }, { "epoch": 15.66, "learning_rate": 2.1828571428571428e-05, "loss": 0.0115, "step": 2740 }, { "epoch": 15.71, "learning_rate": 2.1542857142857144e-05, "loss": 0.1456, "step": 2750 }, { "epoch": 15.77, "learning_rate": 2.125714285714286e-05, "loss": 0.0106, "step": 2760 }, { "epoch": 15.83, "learning_rate": 2.0971428571428572e-05, "loss": 0.0021, "step": 2770 }, { "epoch": 15.88, "learning_rate": 2.0685714285714285e-05, "loss": 0.0102, "step": 2780 }, { "epoch": 15.94, "learning_rate": 2.04e-05, "loss": 0.2036, "step": 2790 }, { "epoch": 16.0, "learning_rate": 2.0114285714285717e-05, "loss": 0.0257, "step": 2800 }, { "epoch": 16.0, "eval_accuracy": 0.7178217768669128, "eval_loss": 1.9555984735488892, "eval_runtime": 13.1046, "eval_samples_per_second": 15.414, "eval_steps_per_second": 7.707, "step": 2800 }, { "epoch": 16.06, "learning_rate": 1.982857142857143e-05, "loss": 0.0406, "step": 2810 }, { "epoch": 16.11, "learning_rate": 1.9542857142857143e-05, "loss": 0.2677, "step": 2820 }, { "epoch": 16.17, "learning_rate": 1.9257142857142855e-05, "loss": 0.3214, "step": 2830 }, { "epoch": 16.23, "learning_rate": 1.8971428571428575e-05, "loss": 0.035, "step": 2840 }, { "epoch": 16.28, "learning_rate": 1.8685714285714287e-05, "loss": 0.0105, "step": 2850 }, { "epoch": 16.34, "learning_rate": 1.84e-05, "loss": 0.0205, "step": 2860 }, { "epoch": 16.4, "learning_rate": 1.8114285714285713e-05, "loss": 0.0006, "step": 2870 }, { "epoch": 16.46, "learning_rate": 1.7828571428571432e-05, "loss": 0.2564, "step": 2880 }, { "epoch": 16.51, "learning_rate": 1.7542857142857145e-05, "loss": 0.0396, "step": 2890 }, { "epoch": 16.57, "learning_rate": 1.7257142857142857e-05, "loss": 0.0237, "step": 2900 }, { "epoch": 16.63, "learning_rate": 1.697142857142857e-05, "loss": 0.0028, "step": 2910 }, { "epoch": 16.68, "learning_rate": 1.6685714285714286e-05, "loss": 0.2431, "step": 2920 }, { "epoch": 16.74, "learning_rate": 1.6400000000000002e-05, "loss": 0.0013, "step": 2930 }, { "epoch": 16.8, "learning_rate": 1.6114285714285715e-05, "loss": 0.362, "step": 2940 }, { "epoch": 16.85, "learning_rate": 1.5828571428571428e-05, "loss": 0.031, "step": 2950 }, { "epoch": 16.91, "learning_rate": 1.5542857142857144e-05, "loss": 0.1515, "step": 2960 }, { "epoch": 16.97, "learning_rate": 1.5257142857142858e-05, "loss": 0.1409, "step": 2970 }, { "epoch": 17.0, "eval_accuracy": 0.6930692791938782, "eval_loss": 2.0634045600891113, "eval_runtime": 11.0027, "eval_samples_per_second": 18.359, "eval_steps_per_second": 9.18, "step": 2975 }, { "epoch": 17.03, "learning_rate": 1.4971428571428572e-05, "loss": 0.0622, "step": 2980 }, { "epoch": 17.09, "learning_rate": 1.4685714285714287e-05, "loss": 0.0295, "step": 2990 }, { "epoch": 17.14, "learning_rate": 1.44e-05, "loss": 0.045, "step": 3000 }, { "epoch": 17.2, "learning_rate": 1.4114285714285715e-05, "loss": 0.0384, "step": 3010 }, { "epoch": 17.26, "learning_rate": 1.382857142857143e-05, "loss": 0.0035, "step": 3020 }, { "epoch": 17.31, "learning_rate": 1.3542857142857142e-05, "loss": 0.0014, "step": 3030 }, { "epoch": 17.37, "learning_rate": 1.3257142857142857e-05, "loss": 0.1624, "step": 3040 }, { "epoch": 17.43, "learning_rate": 1.2971428571428573e-05, "loss": 0.0309, "step": 3050 }, { "epoch": 17.48, "learning_rate": 1.2685714285714287e-05, "loss": 0.3965, "step": 3060 }, { "epoch": 17.54, "learning_rate": 1.24e-05, "loss": 0.2438, "step": 3070 }, { "epoch": 17.6, "learning_rate": 1.2114285714285716e-05, "loss": 0.0551, "step": 3080 }, { "epoch": 17.66, "learning_rate": 1.1828571428571429e-05, "loss": 0.0676, "step": 3090 }, { "epoch": 17.71, "learning_rate": 1.1542857142857143e-05, "loss": 0.0025, "step": 3100 }, { "epoch": 17.77, "learning_rate": 1.1257142857142857e-05, "loss": 0.0031, "step": 3110 }, { "epoch": 17.83, "learning_rate": 1.0971428571428572e-05, "loss": 0.0166, "step": 3120 }, { "epoch": 17.88, "learning_rate": 1.0685714285714286e-05, "loss": 0.2534, "step": 3130 }, { "epoch": 17.94, "learning_rate": 1.04e-05, "loss": 0.002, "step": 3140 }, { "epoch": 18.0, "learning_rate": 1.0114285714285715e-05, "loss": 0.0123, "step": 3150 }, { "epoch": 18.0, "eval_accuracy": 0.698019802570343, "eval_loss": 2.1222872734069824, "eval_runtime": 11.8575, "eval_samples_per_second": 17.036, "eval_steps_per_second": 8.518, "step": 3150 }, { "epoch": 18.06, "learning_rate": 9.828571428571429e-06, "loss": 0.1595, "step": 3160 }, { "epoch": 18.11, "learning_rate": 9.542857142857143e-06, "loss": 0.0442, "step": 3170 }, { "epoch": 18.17, "learning_rate": 9.257142857142858e-06, "loss": 0.2398, "step": 3180 }, { "epoch": 18.23, "learning_rate": 8.971428571428572e-06, "loss": 0.0063, "step": 3190 }, { "epoch": 18.28, "learning_rate": 8.685714285714287e-06, "loss": 0.1119, "step": 3200 }, { "epoch": 18.34, "learning_rate": 8.400000000000001e-06, "loss": 0.0093, "step": 3210 }, { "epoch": 18.4, "learning_rate": 8.114285714285715e-06, "loss": 0.0219, "step": 3220 }, { "epoch": 18.46, "learning_rate": 7.82857142857143e-06, "loss": 0.0055, "step": 3230 }, { "epoch": 18.51, "learning_rate": 7.542857142857143e-06, "loss": 0.0329, "step": 3240 }, { "epoch": 18.57, "learning_rate": 7.257142857142857e-06, "loss": 0.0038, "step": 3250 }, { "epoch": 18.63, "learning_rate": 6.971428571428572e-06, "loss": 0.0437, "step": 3260 }, { "epoch": 18.68, "learning_rate": 6.685714285714285e-06, "loss": 0.0081, "step": 3270 }, { "epoch": 18.74, "learning_rate": 6.4000000000000006e-06, "loss": 0.0346, "step": 3280 }, { "epoch": 18.8, "learning_rate": 6.114285714285715e-06, "loss": 0.0025, "step": 3290 }, { "epoch": 18.85, "learning_rate": 5.828571428571429e-06, "loss": 0.2022, "step": 3300 }, { "epoch": 18.91, "learning_rate": 5.542857142857144e-06, "loss": 0.0005, "step": 3310 }, { "epoch": 18.97, "learning_rate": 5.257142857142858e-06, "loss": 0.0476, "step": 3320 }, { "epoch": 19.0, "eval_accuracy": 0.7277227640151978, "eval_loss": 1.9925730228424072, "eval_runtime": 11.4361, "eval_samples_per_second": 17.663, "eval_steps_per_second": 8.832, "step": 3325 }, { "epoch": 19.03, "learning_rate": 4.9714285714285715e-06, "loss": 0.0063, "step": 3330 }, { "epoch": 19.09, "learning_rate": 4.685714285714286e-06, "loss": 0.0022, "step": 3340 }, { "epoch": 19.14, "learning_rate": 4.4e-06, "loss": 0.0207, "step": 3350 }, { "epoch": 19.2, "learning_rate": 4.114285714285715e-06, "loss": 0.1322, "step": 3360 }, { "epoch": 19.26, "learning_rate": 3.828571428571429e-06, "loss": 0.3377, "step": 3370 }, { "epoch": 19.31, "learning_rate": 3.542857142857143e-06, "loss": 0.0005, "step": 3380 }, { "epoch": 19.37, "learning_rate": 3.2571428571428572e-06, "loss": 0.001, "step": 3390 }, { "epoch": 19.43, "learning_rate": 2.9714285714285716e-06, "loss": 0.0004, "step": 3400 }, { "epoch": 19.48, "learning_rate": 2.685714285714286e-06, "loss": 0.1235, "step": 3410 }, { "epoch": 19.54, "learning_rate": 2.4000000000000003e-06, "loss": 0.006, "step": 3420 }, { "epoch": 19.6, "learning_rate": 2.1142857142857147e-06, "loss": 0.0027, "step": 3430 }, { "epoch": 19.66, "learning_rate": 1.8285714285714288e-06, "loss": 0.001, "step": 3440 }, { "epoch": 19.71, "learning_rate": 1.542857142857143e-06, "loss": 0.0032, "step": 3450 }, { "epoch": 19.77, "learning_rate": 1.2571428571428573e-06, "loss": 0.0011, "step": 3460 }, { "epoch": 19.83, "learning_rate": 9.714285714285715e-07, "loss": 0.0029, "step": 3470 }, { "epoch": 19.88, "learning_rate": 6.857142857142857e-07, "loss": 0.0141, "step": 3480 }, { "epoch": 19.94, "learning_rate": 4.0000000000000003e-07, "loss": 0.0007, "step": 3490 }, { "epoch": 20.0, "learning_rate": 1.142857142857143e-07, "loss": 0.0006, "step": 3500 }, { "epoch": 20.0, "eval_accuracy": 0.7277227640151978, "eval_loss": 1.9778043031692505, "eval_runtime": 12.1861, "eval_samples_per_second": 16.576, "eval_steps_per_second": 8.288, "step": 3500 } ], "max_steps": 3500, "num_train_epochs": 20, "total_flos": 2.418962508100631e+18, "trial_name": null, "trial_params": null }