diff --git "a/checkpoint-5840/trainer_state.json" "b/checkpoint-5840/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5840/trainer_state.json" @@ -0,0 +1,4834 @@ +{ + "best_metric": 0.6604002714157104, + "best_model_checkpoint": "./vit-base-beans/checkpoint-1160", + "epoch": 14.974358974358974, + "global_step": 5840, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03, + "learning_rate": 0.00019965811965811967, + "loss": 1.7468, + "step": 10 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019931623931623932, + "loss": 1.653, + "step": 20 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019897435897435898, + "loss": 1.5026, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019863247863247864, + "loss": 1.4296, + "step": 40 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.41975308641975306, + "eval_loss": 1.4691457748413086, + "eval_runtime": 61.6215, + "eval_samples_per_second": 57.902, + "eval_steps_per_second": 7.238, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001982905982905983, + "loss": 1.369, + "step": 50 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019794871794871796, + "loss": 1.2948, + "step": 60 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001976068376068376, + "loss": 1.3074, + "step": 70 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019726495726495727, + "loss": 1.3074, + "step": 80 + }, + { + "epoch": 0.21, + "eval_accuracy": 0.5549943883277216, + "eval_loss": 1.1999692916870117, + "eval_runtime": 33.0603, + "eval_samples_per_second": 107.924, + "eval_steps_per_second": 13.491, + "step": 80 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019692307692307696, + "loss": 1.1725, + "step": 90 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019658119658119659, + "loss": 1.1944, + "step": 100 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019623931623931624, + "loss": 1.18, + "step": 110 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001958974358974359, + "loss": 1.1731, + "step": 120 + }, + { + "epoch": 0.31, + "eval_accuracy": 0.6689113355780022, + "eval_loss": 0.978410005569458, + "eval_runtime": 37.0531, + "eval_samples_per_second": 96.294, + "eval_steps_per_second": 12.037, + "step": 120 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019555555555555556, + "loss": 0.9441, + "step": 130 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019521367521367522, + "loss": 1.0459, + "step": 140 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019487179487179487, + "loss": 1.1109, + "step": 150 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019452991452991453, + "loss": 1.0094, + "step": 160 + }, + { + "epoch": 0.41, + "eval_accuracy": 0.6616161616161617, + "eval_loss": 0.9483916759490967, + "eval_runtime": 32.6783, + "eval_samples_per_second": 109.185, + "eval_steps_per_second": 13.648, + "step": 160 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001941880341880342, + "loss": 0.9165, + "step": 170 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019384615384615385, + "loss": 1.0904, + "step": 180 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001935042735042735, + "loss": 0.9846, + "step": 190 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019316239316239316, + "loss": 1.0908, + "step": 200 + }, + { + "epoch": 0.51, + "eval_accuracy": 0.5454545454545454, + "eval_loss": 1.2091434001922607, + "eval_runtime": 32.1465, + "eval_samples_per_second": 110.992, + "eval_steps_per_second": 13.874, + "step": 200 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019282051282051282, + "loss": 1.0193, + "step": 210 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001924786324786325, + "loss": 0.9809, + "step": 220 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019213675213675216, + "loss": 0.9673, + "step": 230 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019179487179487182, + "loss": 0.9402, + "step": 240 + }, + { + "epoch": 0.62, + "eval_accuracy": 0.6481481481481481, + "eval_loss": 0.9721583724021912, + "eval_runtime": 33.9462, + "eval_samples_per_second": 105.107, + "eval_steps_per_second": 13.138, + "step": 240 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019145299145299148, + "loss": 0.8103, + "step": 250 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019111111111111114, + "loss": 0.8564, + "step": 260 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001907692307692308, + "loss": 1.1074, + "step": 270 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019042735042735042, + "loss": 1.044, + "step": 280 + }, + { + "epoch": 0.72, + "eval_accuracy": 0.5897867564534232, + "eval_loss": 1.1147791147232056, + "eval_runtime": 32.6846, + "eval_samples_per_second": 109.164, + "eval_steps_per_second": 13.646, + "step": 280 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019008547008547008, + "loss": 1.016, + "step": 290 + }, + { + "epoch": 0.77, + "learning_rate": 0.00018974358974358974, + "loss": 0.8815, + "step": 300 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001894017094017094, + "loss": 0.8718, + "step": 310 + }, + { + "epoch": 0.82, + "learning_rate": 0.00018905982905982906, + "loss": 0.6886, + "step": 320 + }, + { + "epoch": 0.82, + "eval_accuracy": 0.7272727272727273, + "eval_loss": 0.7878678441047668, + "eval_runtime": 31.1497, + "eval_samples_per_second": 114.544, + "eval_steps_per_second": 14.318, + "step": 320 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001887179487179487, + "loss": 0.8122, + "step": 330 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018837606837606837, + "loss": 0.6107, + "step": 340 + }, + { + "epoch": 0.9, + "learning_rate": 0.00018803418803418803, + "loss": 0.7783, + "step": 350 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001876923076923077, + "loss": 0.8432, + "step": 360 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.7480359147025814, + "eval_loss": 0.7056980729103088, + "eval_runtime": 32.2533, + "eval_samples_per_second": 110.624, + "eval_steps_per_second": 13.828, + "step": 360 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018735042735042737, + "loss": 0.8314, + "step": 370 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018700854700854703, + "loss": 0.8488, + "step": 380 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001866666666666667, + "loss": 0.8547, + "step": 390 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018632478632478634, + "loss": 0.7394, + "step": 400 + }, + { + "epoch": 1.03, + "eval_accuracy": 0.6835016835016835, + "eval_loss": 0.9139666557312012, + "eval_runtime": 42.9913, + "eval_samples_per_second": 82.994, + "eval_steps_per_second": 10.374, + "step": 400 + }, + { + "epoch": 1.05, + "learning_rate": 0.000185982905982906, + "loss": 0.5767, + "step": 410 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018564102564102566, + "loss": 0.5304, + "step": 420 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018529914529914532, + "loss": 0.6296, + "step": 430 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018495726495726497, + "loss": 0.6004, + "step": 440 + }, + { + "epoch": 1.13, + "eval_accuracy": 0.7255892255892256, + "eval_loss": 0.7451665997505188, + "eval_runtime": 31.8253, + "eval_samples_per_second": 112.112, + "eval_steps_per_second": 14.014, + "step": 440 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018461538461538463, + "loss": 0.518, + "step": 450 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018427350427350426, + "loss": 0.5825, + "step": 460 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018393162393162392, + "loss": 0.5465, + "step": 470 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018358974358974358, + "loss": 0.5201, + "step": 480 + }, + { + "epoch": 1.23, + "eval_accuracy": 0.6879910213243546, + "eval_loss": 0.8496339917182922, + "eval_runtime": 33.617, + "eval_samples_per_second": 106.137, + "eval_steps_per_second": 13.267, + "step": 480 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018324786324786324, + "loss": 0.5016, + "step": 490 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018290598290598292, + "loss": 0.6101, + "step": 500 + }, + { + "epoch": 1.31, + "learning_rate": 0.00018256410256410258, + "loss": 0.5572, + "step": 510 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018222222222222224, + "loss": 0.4039, + "step": 520 + }, + { + "epoch": 1.33, + "eval_accuracy": 0.7312008978675645, + "eval_loss": 0.7843908071517944, + "eval_runtime": 31.7575, + "eval_samples_per_second": 112.351, + "eval_steps_per_second": 14.044, + "step": 520 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001818803418803419, + "loss": 0.4831, + "step": 530 + }, + { + "epoch": 1.38, + "learning_rate": 0.00018153846153846155, + "loss": 0.5493, + "step": 540 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001811965811965812, + "loss": 0.4942, + "step": 550 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018085470085470087, + "loss": 0.4475, + "step": 560 + }, + { + "epoch": 1.44, + "eval_accuracy": 0.6593714927048261, + "eval_loss": 0.9572438597679138, + "eval_runtime": 31.2916, + "eval_samples_per_second": 114.024, + "eval_steps_per_second": 14.253, + "step": 560 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018051282051282052, + "loss": 0.4667, + "step": 570 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018017094017094018, + "loss": 0.4685, + "step": 580 + }, + { + "epoch": 1.51, + "learning_rate": 0.00017982905982905984, + "loss": 0.5028, + "step": 590 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001794871794871795, + "loss": 0.6071, + "step": 600 + }, + { + "epoch": 1.54, + "eval_accuracy": 0.7676767676767676, + "eval_loss": 0.6666165590286255, + "eval_runtime": 32.2041, + "eval_samples_per_second": 110.793, + "eval_steps_per_second": 13.849, + "step": 600 + }, + { + "epoch": 1.56, + "learning_rate": 0.00017914529914529916, + "loss": 0.5134, + "step": 610 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001788034188034188, + "loss": 0.658, + "step": 620 + }, + { + "epoch": 1.62, + "learning_rate": 0.00017846153846153847, + "loss": 0.606, + "step": 630 + }, + { + "epoch": 1.64, + "learning_rate": 0.00017811965811965813, + "loss": 0.5004, + "step": 640 + }, + { + "epoch": 1.64, + "eval_accuracy": 0.7373737373737373, + "eval_loss": 0.7380097508430481, + "eval_runtime": 32.5063, + "eval_samples_per_second": 109.763, + "eval_steps_per_second": 13.72, + "step": 640 + }, + { + "epoch": 1.67, + "learning_rate": 0.00017777777777777779, + "loss": 0.5404, + "step": 650 + }, + { + "epoch": 1.69, + "learning_rate": 0.00017743589743589744, + "loss": 0.5775, + "step": 660 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001770940170940171, + "loss": 0.5013, + "step": 670 + }, + { + "epoch": 1.74, + "learning_rate": 0.00017675213675213676, + "loss": 0.6024, + "step": 680 + }, + { + "epoch": 1.74, + "eval_accuracy": 0.7407407407407407, + "eval_loss": 0.7545998692512512, + "eval_runtime": 36.6802, + "eval_samples_per_second": 97.273, + "eval_steps_per_second": 12.159, + "step": 680 + }, + { + "epoch": 1.77, + "learning_rate": 0.00017641025641025642, + "loss": 0.4815, + "step": 690 + }, + { + "epoch": 1.79, + "learning_rate": 0.00017606837606837607, + "loss": 0.4727, + "step": 700 + }, + { + "epoch": 1.82, + "learning_rate": 0.00017572649572649573, + "loss": 0.4698, + "step": 710 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001753846153846154, + "loss": 0.4813, + "step": 720 + }, + { + "epoch": 1.85, + "eval_accuracy": 0.7609427609427609, + "eval_loss": 0.7190226912498474, + "eval_runtime": 32.2305, + "eval_samples_per_second": 110.702, + "eval_steps_per_second": 13.838, + "step": 720 + }, + { + "epoch": 1.87, + "learning_rate": 0.00017504273504273505, + "loss": 0.5407, + "step": 730 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001747008547008547, + "loss": 0.4137, + "step": 740 + }, + { + "epoch": 1.92, + "learning_rate": 0.00017435897435897436, + "loss": 0.5252, + "step": 750 + }, + { + "epoch": 1.95, + "learning_rate": 0.00017401709401709402, + "loss": 0.5608, + "step": 760 + }, + { + "epoch": 1.95, + "eval_accuracy": 0.7283950617283951, + "eval_loss": 0.7672268748283386, + "eval_runtime": 34.3059, + "eval_samples_per_second": 104.006, + "eval_steps_per_second": 13.001, + "step": 760 + }, + { + "epoch": 1.97, + "learning_rate": 0.00017367521367521368, + "loss": 0.4692, + "step": 770 + }, + { + "epoch": 2.0, + "learning_rate": 0.00017333333333333334, + "loss": 0.488, + "step": 780 + }, + { + "epoch": 2.03, + "learning_rate": 0.000172991452991453, + "loss": 0.3798, + "step": 790 + }, + { + "epoch": 2.05, + "learning_rate": 0.00017264957264957268, + "loss": 0.3272, + "step": 800 + }, + { + "epoch": 2.05, + "eval_accuracy": 0.7497194163860831, + "eval_loss": 0.7055391073226929, + "eval_runtime": 34.4041, + "eval_samples_per_second": 103.709, + "eval_steps_per_second": 12.964, + "step": 800 + }, + { + "epoch": 2.08, + "learning_rate": 0.00017230769230769234, + "loss": 0.2257, + "step": 810 + }, + { + "epoch": 2.1, + "learning_rate": 0.000171965811965812, + "loss": 0.238, + "step": 820 + }, + { + "epoch": 2.13, + "learning_rate": 0.00017162393162393162, + "loss": 0.3225, + "step": 830 + }, + { + "epoch": 2.15, + "learning_rate": 0.00017128205128205128, + "loss": 0.2803, + "step": 840 + }, + { + "epoch": 2.15, + "eval_accuracy": 0.7514029180695847, + "eval_loss": 0.7669840455055237, + "eval_runtime": 32.953, + "eval_samples_per_second": 108.275, + "eval_steps_per_second": 13.534, + "step": 840 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017094017094017094, + "loss": 0.234, + "step": 850 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001705982905982906, + "loss": 0.2201, + "step": 860 + }, + { + "epoch": 2.23, + "learning_rate": 0.00017025641025641026, + "loss": 0.4651, + "step": 870 + }, + { + "epoch": 2.26, + "learning_rate": 0.0001699145299145299, + "loss": 0.4051, + "step": 880 + }, + { + "epoch": 2.26, + "eval_accuracy": 0.7620650953984287, + "eval_loss": 0.691470205783844, + "eval_runtime": 37.2444, + "eval_samples_per_second": 95.8, + "eval_steps_per_second": 11.975, + "step": 880 + }, + { + "epoch": 2.28, + "learning_rate": 0.00016957264957264957, + "loss": 0.3069, + "step": 890 + }, + { + "epoch": 2.31, + "learning_rate": 0.00016923076923076923, + "loss": 0.2787, + "step": 900 + }, + { + "epoch": 2.33, + "learning_rate": 0.00016888888888888889, + "loss": 0.2794, + "step": 910 + }, + { + "epoch": 2.36, + "learning_rate": 0.00016854700854700854, + "loss": 0.3436, + "step": 920 + }, + { + "epoch": 2.36, + "eval_accuracy": 0.7631874298540965, + "eval_loss": 0.6948888301849365, + "eval_runtime": 32.1442, + "eval_samples_per_second": 111.0, + "eval_steps_per_second": 13.875, + "step": 920 + }, + { + "epoch": 2.38, + "learning_rate": 0.00016820512820512823, + "loss": 0.2964, + "step": 930 + }, + { + "epoch": 2.41, + "learning_rate": 0.00016786324786324789, + "loss": 0.1973, + "step": 940 + }, + { + "epoch": 2.44, + "learning_rate": 0.00016752136752136754, + "loss": 0.2257, + "step": 950 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001671794871794872, + "loss": 0.2759, + "step": 960 + }, + { + "epoch": 2.46, + "eval_accuracy": 0.7356902356902357, + "eval_loss": 0.8428576588630676, + "eval_runtime": 32.8186, + "eval_samples_per_second": 108.719, + "eval_steps_per_second": 13.59, + "step": 960 + }, + { + "epoch": 2.49, + "learning_rate": 0.00016683760683760686, + "loss": 0.272, + "step": 970 + }, + { + "epoch": 2.51, + "learning_rate": 0.00016649572649572652, + "loss": 0.2721, + "step": 980 + }, + { + "epoch": 2.54, + "learning_rate": 0.00016615384615384617, + "loss": 0.1414, + "step": 990 + }, + { + "epoch": 2.56, + "learning_rate": 0.00016581196581196583, + "loss": 0.5024, + "step": 1000 + }, + { + "epoch": 2.56, + "eval_accuracy": 0.7502805836139169, + "eval_loss": 0.8222711086273193, + "eval_runtime": 32.7463, + "eval_samples_per_second": 108.959, + "eval_steps_per_second": 13.62, + "step": 1000 + }, + { + "epoch": 2.59, + "learning_rate": 0.00016547008547008546, + "loss": 0.3348, + "step": 1010 + }, + { + "epoch": 2.62, + "learning_rate": 0.00016512820512820512, + "loss": 0.3692, + "step": 1020 + }, + { + "epoch": 2.64, + "learning_rate": 0.00016478632478632478, + "loss": 0.3981, + "step": 1030 + }, + { + "epoch": 2.67, + "learning_rate": 0.00016444444444444444, + "loss": 0.3571, + "step": 1040 + }, + { + "epoch": 2.67, + "eval_accuracy": 0.7289562289562289, + "eval_loss": 0.8546615839004517, + "eval_runtime": 33.6376, + "eval_samples_per_second": 106.072, + "eval_steps_per_second": 13.259, + "step": 1040 + }, + { + "epoch": 2.69, + "learning_rate": 0.0001641025641025641, + "loss": 0.3647, + "step": 1050 + }, + { + "epoch": 2.72, + "learning_rate": 0.00016376068376068375, + "loss": 0.2547, + "step": 1060 + }, + { + "epoch": 2.74, + "learning_rate": 0.00016341880341880344, + "loss": 0.3708, + "step": 1070 + }, + { + "epoch": 2.77, + "learning_rate": 0.0001630769230769231, + "loss": 0.3197, + "step": 1080 + }, + { + "epoch": 2.77, + "eval_accuracy": 0.754769921436588, + "eval_loss": 0.7111819982528687, + "eval_runtime": 33.1915, + "eval_samples_per_second": 107.497, + "eval_steps_per_second": 13.437, + "step": 1080 + }, + { + "epoch": 2.79, + "learning_rate": 0.00016273504273504275, + "loss": 0.2805, + "step": 1090 + }, + { + "epoch": 2.82, + "learning_rate": 0.0001623931623931624, + "loss": 0.2792, + "step": 1100 + }, + { + "epoch": 2.85, + "learning_rate": 0.00016205128205128207, + "loss": 0.3781, + "step": 1110 + }, + { + "epoch": 2.87, + "learning_rate": 0.00016170940170940172, + "loss": 0.3872, + "step": 1120 + }, + { + "epoch": 2.87, + "eval_accuracy": 0.7485970819304153, + "eval_loss": 0.7666726112365723, + "eval_runtime": 33.3793, + "eval_samples_per_second": 106.892, + "eval_steps_per_second": 13.362, + "step": 1120 + }, + { + "epoch": 2.9, + "learning_rate": 0.00016136752136752138, + "loss": 0.3489, + "step": 1130 + }, + { + "epoch": 2.92, + "learning_rate": 0.00016102564102564104, + "loss": 0.3436, + "step": 1140 + }, + { + "epoch": 2.95, + "learning_rate": 0.0001606837606837607, + "loss": 0.2608, + "step": 1150 + }, + { + "epoch": 2.97, + "learning_rate": 0.00016034188034188036, + "loss": 0.2699, + "step": 1160 + }, + { + "epoch": 2.97, + "eval_accuracy": 0.7867564534231201, + "eval_loss": 0.6604002714157104, + "eval_runtime": 42.2756, + "eval_samples_per_second": 84.398, + "eval_steps_per_second": 10.55, + "step": 1160 + }, + { + "epoch": 3.0, + "learning_rate": 0.00016, + "loss": 0.1998, + "step": 1170 + }, + { + "epoch": 3.03, + "learning_rate": 0.00015965811965811967, + "loss": 0.1375, + "step": 1180 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001593162393162393, + "loss": 0.1511, + "step": 1190 + }, + { + "epoch": 3.08, + "learning_rate": 0.00015897435897435896, + "loss": 0.237, + "step": 1200 + }, + { + "epoch": 3.08, + "eval_accuracy": 0.7570145903479237, + "eval_loss": 0.8170732855796814, + "eval_runtime": 34.8004, + "eval_samples_per_second": 102.528, + "eval_steps_per_second": 12.816, + "step": 1200 + }, + { + "epoch": 3.1, + "learning_rate": 0.00015863247863247864, + "loss": 0.1256, + "step": 1210 + }, + { + "epoch": 3.13, + "learning_rate": 0.0001582905982905983, + "loss": 0.1109, + "step": 1220 + }, + { + "epoch": 3.15, + "learning_rate": 0.00015794871794871796, + "loss": 0.1218, + "step": 1230 + }, + { + "epoch": 3.18, + "learning_rate": 0.00015760683760683762, + "loss": 0.2952, + "step": 1240 + }, + { + "epoch": 3.18, + "eval_accuracy": 0.7598204264870931, + "eval_loss": 0.8052437901496887, + "eval_runtime": 33.2814, + "eval_samples_per_second": 107.207, + "eval_steps_per_second": 13.401, + "step": 1240 + }, + { + "epoch": 3.21, + "learning_rate": 0.00015726495726495727, + "loss": 0.1483, + "step": 1250 + }, + { + "epoch": 3.23, + "learning_rate": 0.00015692307692307693, + "loss": 0.1639, + "step": 1260 + }, + { + "epoch": 3.26, + "learning_rate": 0.0001565811965811966, + "loss": 0.1832, + "step": 1270 + }, + { + "epoch": 3.28, + "learning_rate": 0.00015623931623931625, + "loss": 0.217, + "step": 1280 + }, + { + "epoch": 3.28, + "eval_accuracy": 0.7822671156004489, + "eval_loss": 0.7558466196060181, + "eval_runtime": 33.6999, + "eval_samples_per_second": 105.876, + "eval_steps_per_second": 13.234, + "step": 1280 + }, + { + "epoch": 3.31, + "learning_rate": 0.0001558974358974359, + "loss": 0.1555, + "step": 1290 + }, + { + "epoch": 3.33, + "learning_rate": 0.00015555555555555556, + "loss": 0.1288, + "step": 1300 + }, + { + "epoch": 3.36, + "learning_rate": 0.00015521367521367522, + "loss": 0.1276, + "step": 1310 + }, + { + "epoch": 3.38, + "learning_rate": 0.00015487179487179488, + "loss": 0.1804, + "step": 1320 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.7631874298540965, + "eval_loss": 0.8506298065185547, + "eval_runtime": 35.5517, + "eval_samples_per_second": 100.361, + "eval_steps_per_second": 12.545, + "step": 1320 + }, + { + "epoch": 3.41, + "learning_rate": 0.00015452991452991454, + "loss": 0.1482, + "step": 1330 + }, + { + "epoch": 3.44, + "learning_rate": 0.0001541880341880342, + "loss": 0.1177, + "step": 1340 + }, + { + "epoch": 3.46, + "learning_rate": 0.00015384615384615385, + "loss": 0.137, + "step": 1350 + }, + { + "epoch": 3.49, + "learning_rate": 0.0001535042735042735, + "loss": 0.234, + "step": 1360 + }, + { + "epoch": 3.49, + "eval_accuracy": 0.7974186307519641, + "eval_loss": 0.7378016114234924, + "eval_runtime": 33.7299, + "eval_samples_per_second": 105.782, + "eval_steps_per_second": 13.223, + "step": 1360 + }, + { + "epoch": 3.51, + "learning_rate": 0.0001531623931623932, + "loss": 0.2669, + "step": 1370 + }, + { + "epoch": 3.54, + "learning_rate": 0.00015282051282051282, + "loss": 0.1098, + "step": 1380 + }, + { + "epoch": 3.56, + "learning_rate": 0.00015247863247863248, + "loss": 0.1492, + "step": 1390 + }, + { + "epoch": 3.59, + "learning_rate": 0.00015213675213675214, + "loss": 0.1799, + "step": 1400 + }, + { + "epoch": 3.59, + "eval_accuracy": 0.7749719416386083, + "eval_loss": 0.7870209813117981, + "eval_runtime": 33.1, + "eval_samples_per_second": 107.795, + "eval_steps_per_second": 13.474, + "step": 1400 + }, + { + "epoch": 3.62, + "learning_rate": 0.0001517948717948718, + "loss": 0.2032, + "step": 1410 + }, + { + "epoch": 3.64, + "learning_rate": 0.00015145299145299146, + "loss": 0.3042, + "step": 1420 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001511111111111111, + "loss": 0.2242, + "step": 1430 + }, + { + "epoch": 3.69, + "learning_rate": 0.00015076923076923077, + "loss": 0.2508, + "step": 1440 + }, + { + "epoch": 3.69, + "eval_accuracy": 0.7283950617283951, + "eval_loss": 1.016114354133606, + "eval_runtime": 33.0909, + "eval_samples_per_second": 107.824, + "eval_steps_per_second": 13.478, + "step": 1440 + }, + { + "epoch": 3.72, + "learning_rate": 0.00015042735042735043, + "loss": 0.3502, + "step": 1450 + }, + { + "epoch": 3.74, + "learning_rate": 0.00015008547008547009, + "loss": 0.1808, + "step": 1460 + }, + { + "epoch": 3.77, + "learning_rate": 0.00014974358974358974, + "loss": 0.1187, + "step": 1470 + }, + { + "epoch": 3.79, + "learning_rate": 0.0001494017094017094, + "loss": 0.2714, + "step": 1480 + }, + { + "epoch": 3.79, + "eval_accuracy": 0.7721661054994389, + "eval_loss": 0.8105702996253967, + "eval_runtime": 33.8257, + "eval_samples_per_second": 105.482, + "eval_steps_per_second": 13.185, + "step": 1480 + }, + { + "epoch": 3.82, + "learning_rate": 0.00014905982905982906, + "loss": 0.1947, + "step": 1490 + }, + { + "epoch": 3.85, + "learning_rate": 0.00014871794871794872, + "loss": 0.2269, + "step": 1500 + }, + { + "epoch": 3.87, + "learning_rate": 0.0001483760683760684, + "loss": 0.2676, + "step": 1510 + }, + { + "epoch": 3.9, + "learning_rate": 0.00014803418803418806, + "loss": 0.1342, + "step": 1520 + }, + { + "epoch": 3.9, + "eval_accuracy": 0.7665544332210998, + "eval_loss": 0.8097100853919983, + "eval_runtime": 32.9851, + "eval_samples_per_second": 108.17, + "eval_steps_per_second": 13.521, + "step": 1520 + }, + { + "epoch": 3.92, + "learning_rate": 0.00014769230769230772, + "loss": 0.1616, + "step": 1530 + }, + { + "epoch": 3.95, + "learning_rate": 0.00014735042735042737, + "loss": 0.1682, + "step": 1540 + }, + { + "epoch": 3.97, + "learning_rate": 0.00014700854700854703, + "loss": 0.1335, + "step": 1550 + }, + { + "epoch": 4.0, + "learning_rate": 0.00014666666666666666, + "loss": 0.2337, + "step": 1560 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.7805836139169473, + "eval_loss": 0.7859733700752258, + "eval_runtime": 32.3155, + "eval_samples_per_second": 110.411, + "eval_steps_per_second": 13.801, + "step": 1560 + }, + { + "epoch": 4.03, + "learning_rate": 0.00014632478632478632, + "loss": 0.0954, + "step": 1570 + }, + { + "epoch": 4.05, + "learning_rate": 0.00014598290598290598, + "loss": 0.1129, + "step": 1580 + }, + { + "epoch": 4.08, + "learning_rate": 0.00014564102564102564, + "loss": 0.085, + "step": 1590 + }, + { + "epoch": 4.1, + "learning_rate": 0.0001452991452991453, + "loss": 0.052, + "step": 1600 + }, + { + "epoch": 4.1, + "eval_accuracy": 0.7654320987654321, + "eval_loss": 0.8651421666145325, + "eval_runtime": 37.0818, + "eval_samples_per_second": 96.22, + "eval_steps_per_second": 12.027, + "step": 1600 + }, + { + "epoch": 4.13, + "learning_rate": 0.00014495726495726495, + "loss": 0.0705, + "step": 1610 + }, + { + "epoch": 4.15, + "learning_rate": 0.0001446153846153846, + "loss": 0.0467, + "step": 1620 + }, + { + "epoch": 4.18, + "learning_rate": 0.00014427350427350427, + "loss": 0.0585, + "step": 1630 + }, + { + "epoch": 4.21, + "learning_rate": 0.00014393162393162392, + "loss": 0.2216, + "step": 1640 + }, + { + "epoch": 4.21, + "eval_accuracy": 0.7800224466891134, + "eval_loss": 0.8762761950492859, + "eval_runtime": 32.976, + "eval_samples_per_second": 108.2, + "eval_steps_per_second": 13.525, + "step": 1640 + }, + { + "epoch": 4.23, + "learning_rate": 0.0001435897435897436, + "loss": 0.0832, + "step": 1650 + }, + { + "epoch": 4.26, + "learning_rate": 0.00014324786324786327, + "loss": 0.0812, + "step": 1660 + }, + { + "epoch": 4.28, + "learning_rate": 0.00014290598290598292, + "loss": 0.1584, + "step": 1670 + }, + { + "epoch": 4.31, + "learning_rate": 0.00014256410256410258, + "loss": 0.1194, + "step": 1680 + }, + { + "epoch": 4.31, + "eval_accuracy": 0.7822671156004489, + "eval_loss": 0.848014771938324, + "eval_runtime": 33.7757, + "eval_samples_per_second": 105.638, + "eval_steps_per_second": 13.205, + "step": 1680 + }, + { + "epoch": 4.33, + "learning_rate": 0.00014222222222222224, + "loss": 0.1615, + "step": 1690 + }, + { + "epoch": 4.36, + "learning_rate": 0.0001418803418803419, + "loss": 0.1423, + "step": 1700 + }, + { + "epoch": 4.38, + "learning_rate": 0.00014153846153846156, + "loss": 0.0727, + "step": 1710 + }, + { + "epoch": 4.41, + "learning_rate": 0.0001411965811965812, + "loss": 0.1959, + "step": 1720 + }, + { + "epoch": 4.41, + "eval_accuracy": 0.7581369248035915, + "eval_loss": 0.9282656908035278, + "eval_runtime": 33.2614, + "eval_samples_per_second": 107.272, + "eval_steps_per_second": 13.409, + "step": 1720 + }, + { + "epoch": 4.44, + "learning_rate": 0.00014085470085470087, + "loss": 0.1193, + "step": 1730 + }, + { + "epoch": 4.46, + "learning_rate": 0.0001405128205128205, + "loss": 0.1106, + "step": 1740 + }, + { + "epoch": 4.49, + "learning_rate": 0.00014017094017094016, + "loss": 0.0624, + "step": 1750 + }, + { + "epoch": 4.51, + "learning_rate": 0.00013982905982905982, + "loss": 0.095, + "step": 1760 + }, + { + "epoch": 4.51, + "eval_accuracy": 0.7648709315375982, + "eval_loss": 0.9624485969543457, + "eval_runtime": 32.5961, + "eval_samples_per_second": 109.461, + "eval_steps_per_second": 13.683, + "step": 1760 + }, + { + "epoch": 4.54, + "learning_rate": 0.00013948717948717947, + "loss": 0.0659, + "step": 1770 + }, + { + "epoch": 4.56, + "learning_rate": 0.00013914529914529916, + "loss": 0.1178, + "step": 1780 + }, + { + "epoch": 4.59, + "learning_rate": 0.00013880341880341882, + "loss": 0.0864, + "step": 1790 + }, + { + "epoch": 4.62, + "learning_rate": 0.00013846153846153847, + "loss": 0.0727, + "step": 1800 + }, + { + "epoch": 4.62, + "eval_accuracy": 0.7615039281705949, + "eval_loss": 0.9454267621040344, + "eval_runtime": 33.1697, + "eval_samples_per_second": 107.568, + "eval_steps_per_second": 13.446, + "step": 1800 + }, + { + "epoch": 4.64, + "learning_rate": 0.00013811965811965813, + "loss": 0.0658, + "step": 1810 + }, + { + "epoch": 4.67, + "learning_rate": 0.0001377777777777778, + "loss": 0.1133, + "step": 1820 + }, + { + "epoch": 4.69, + "learning_rate": 0.00013743589743589745, + "loss": 0.0978, + "step": 1830 + }, + { + "epoch": 4.72, + "learning_rate": 0.0001370940170940171, + "loss": 0.0989, + "step": 1840 + }, + { + "epoch": 4.72, + "eval_accuracy": 0.7878787878787878, + "eval_loss": 0.8532549142837524, + "eval_runtime": 33.6969, + "eval_samples_per_second": 105.885, + "eval_steps_per_second": 13.236, + "step": 1840 + }, + { + "epoch": 4.74, + "learning_rate": 0.00013675213675213676, + "loss": 0.1147, + "step": 1850 + }, + { + "epoch": 4.77, + "learning_rate": 0.00013641025641025642, + "loss": 0.187, + "step": 1860 + }, + { + "epoch": 4.79, + "learning_rate": 0.00013606837606837608, + "loss": 0.1036, + "step": 1870 + }, + { + "epoch": 4.82, + "learning_rate": 0.00013572649572649574, + "loss": 0.0785, + "step": 1880 + }, + { + "epoch": 4.82, + "eval_accuracy": 0.7890011223344556, + "eval_loss": 0.8057104349136353, + "eval_runtime": 31.6741, + "eval_samples_per_second": 112.647, + "eval_steps_per_second": 14.081, + "step": 1880 + }, + { + "epoch": 4.85, + "learning_rate": 0.0001353846153846154, + "loss": 0.0838, + "step": 1890 + }, + { + "epoch": 4.87, + "learning_rate": 0.00013504273504273505, + "loss": 0.0703, + "step": 1900 + }, + { + "epoch": 4.9, + "learning_rate": 0.0001347008547008547, + "loss": 0.0543, + "step": 1910 + }, + { + "epoch": 4.92, + "learning_rate": 0.00013435897435897437, + "loss": 0.1162, + "step": 1920 + }, + { + "epoch": 4.92, + "eval_accuracy": 0.7631874298540965, + "eval_loss": 0.9254993200302124, + "eval_runtime": 32.3255, + "eval_samples_per_second": 110.377, + "eval_steps_per_second": 13.797, + "step": 1920 + }, + { + "epoch": 4.95, + "learning_rate": 0.00013401709401709402, + "loss": 0.2486, + "step": 1930 + }, + { + "epoch": 4.97, + "learning_rate": 0.00013367521367521368, + "loss": 0.1092, + "step": 1940 + }, + { + "epoch": 5.0, + "learning_rate": 0.00013333333333333334, + "loss": 0.1394, + "step": 1950 + }, + { + "epoch": 5.03, + "learning_rate": 0.000132991452991453, + "loss": 0.0824, + "step": 1960 + }, + { + "epoch": 5.03, + "eval_accuracy": 0.7665544332210998, + "eval_loss": 0.9655769467353821, + "eval_runtime": 33.4626, + "eval_samples_per_second": 106.627, + "eval_steps_per_second": 13.328, + "step": 1960 + }, + { + "epoch": 5.05, + "learning_rate": 0.00013264957264957266, + "loss": 0.0459, + "step": 1970 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001323076923076923, + "loss": 0.0497, + "step": 1980 + }, + { + "epoch": 5.1, + "learning_rate": 0.00013196581196581197, + "loss": 0.1127, + "step": 1990 + }, + { + "epoch": 5.13, + "learning_rate": 0.00013162393162393163, + "loss": 0.117, + "step": 2000 + }, + { + "epoch": 5.13, + "eval_accuracy": 0.7850729517396184, + "eval_loss": 0.9316745400428772, + "eval_runtime": 33.9606, + "eval_samples_per_second": 105.063, + "eval_steps_per_second": 13.133, + "step": 2000 + }, + { + "epoch": 5.15, + "learning_rate": 0.00013128205128205129, + "loss": 0.0828, + "step": 2010 + }, + { + "epoch": 5.18, + "learning_rate": 0.00013094017094017094, + "loss": 0.1093, + "step": 2020 + }, + { + "epoch": 5.21, + "learning_rate": 0.0001305982905982906, + "loss": 0.0873, + "step": 2030 + }, + { + "epoch": 5.23, + "learning_rate": 0.00013025641025641026, + "loss": 0.0714, + "step": 2040 + }, + { + "epoch": 5.23, + "eval_accuracy": 0.7811447811447811, + "eval_loss": 0.881272554397583, + "eval_runtime": 32.119, + "eval_samples_per_second": 111.087, + "eval_steps_per_second": 13.886, + "step": 2040 + }, + { + "epoch": 5.26, + "learning_rate": 0.00012991452991452992, + "loss": 0.0285, + "step": 2050 + }, + { + "epoch": 5.28, + "learning_rate": 0.00012957264957264957, + "loss": 0.0122, + "step": 2060 + }, + { + "epoch": 5.31, + "learning_rate": 0.00012923076923076923, + "loss": 0.0407, + "step": 2070 + }, + { + "epoch": 5.33, + "learning_rate": 0.00012888888888888892, + "loss": 0.086, + "step": 2080 + }, + { + "epoch": 5.33, + "eval_accuracy": 0.7901234567901234, + "eval_loss": 0.9098324775695801, + "eval_runtime": 31.6914, + "eval_samples_per_second": 112.586, + "eval_steps_per_second": 14.073, + "step": 2080 + }, + { + "epoch": 5.36, + "learning_rate": 0.00012854700854700857, + "loss": 0.0616, + "step": 2090 + }, + { + "epoch": 5.38, + "learning_rate": 0.00012820512820512823, + "loss": 0.1591, + "step": 2100 + }, + { + "epoch": 5.41, + "learning_rate": 0.00012786324786324786, + "loss": 0.1219, + "step": 2110 + }, + { + "epoch": 5.44, + "learning_rate": 0.00012752136752136752, + "loss": 0.1538, + "step": 2120 + }, + { + "epoch": 5.44, + "eval_accuracy": 0.7934904601571269, + "eval_loss": 0.8712666630744934, + "eval_runtime": 31.735, + "eval_samples_per_second": 112.431, + "eval_steps_per_second": 14.054, + "step": 2120 + }, + { + "epoch": 5.46, + "learning_rate": 0.00012717948717948718, + "loss": 0.0529, + "step": 2130 + }, + { + "epoch": 5.49, + "learning_rate": 0.00012683760683760684, + "loss": 0.1755, + "step": 2140 + }, + { + "epoch": 5.51, + "learning_rate": 0.0001264957264957265, + "loss": 0.0202, + "step": 2150 + }, + { + "epoch": 5.54, + "learning_rate": 0.00012615384615384615, + "loss": 0.0522, + "step": 2160 + }, + { + "epoch": 5.54, + "eval_accuracy": 0.7744107744107744, + "eval_loss": 0.9605388641357422, + "eval_runtime": 31.6059, + "eval_samples_per_second": 112.89, + "eval_steps_per_second": 14.111, + "step": 2160 + }, + { + "epoch": 5.56, + "learning_rate": 0.0001258119658119658, + "loss": 0.0517, + "step": 2170 + }, + { + "epoch": 5.59, + "learning_rate": 0.00012547008547008547, + "loss": 0.0873, + "step": 2180 + }, + { + "epoch": 5.62, + "learning_rate": 0.00012512820512820512, + "loss": 0.1013, + "step": 2190 + }, + { + "epoch": 5.64, + "learning_rate": 0.00012478632478632478, + "loss": 0.0485, + "step": 2200 + }, + { + "epoch": 5.64, + "eval_accuracy": 0.7856341189674523, + "eval_loss": 0.881409227848053, + "eval_runtime": 36.6436, + "eval_samples_per_second": 97.37, + "eval_steps_per_second": 12.171, + "step": 2200 + }, + { + "epoch": 5.67, + "learning_rate": 0.00012444444444444444, + "loss": 0.0432, + "step": 2210 + }, + { + "epoch": 5.69, + "learning_rate": 0.00012410256410256412, + "loss": 0.0601, + "step": 2220 + }, + { + "epoch": 5.72, + "learning_rate": 0.00012376068376068378, + "loss": 0.0593, + "step": 2230 + }, + { + "epoch": 5.74, + "learning_rate": 0.00012341880341880344, + "loss": 0.0374, + "step": 2240 + }, + { + "epoch": 5.74, + "eval_accuracy": 0.7946127946127947, + "eval_loss": 0.8851107358932495, + "eval_runtime": 31.8935, + "eval_samples_per_second": 111.872, + "eval_steps_per_second": 13.984, + "step": 2240 + }, + { + "epoch": 5.77, + "learning_rate": 0.0001230769230769231, + "loss": 0.106, + "step": 2250 + }, + { + "epoch": 5.79, + "learning_rate": 0.00012273504273504276, + "loss": 0.1051, + "step": 2260 + }, + { + "epoch": 5.82, + "learning_rate": 0.0001223931623931624, + "loss": 0.0107, + "step": 2270 + }, + { + "epoch": 5.85, + "learning_rate": 0.00012205128205128207, + "loss": 0.019, + "step": 2280 + }, + { + "epoch": 5.85, + "eval_accuracy": 0.7850729517396184, + "eval_loss": 0.9623528122901917, + "eval_runtime": 32.6648, + "eval_samples_per_second": 109.231, + "eval_steps_per_second": 13.654, + "step": 2280 + }, + { + "epoch": 5.87, + "learning_rate": 0.0001217094017094017, + "loss": 0.0304, + "step": 2290 + }, + { + "epoch": 5.9, + "learning_rate": 0.00012136752136752136, + "loss": 0.0269, + "step": 2300 + }, + { + "epoch": 5.92, + "learning_rate": 0.00012102564102564103, + "loss": 0.1185, + "step": 2310 + }, + { + "epoch": 5.95, + "learning_rate": 0.00012068376068376069, + "loss": 0.1115, + "step": 2320 + }, + { + "epoch": 5.95, + "eval_accuracy": 0.7811447811447811, + "eval_loss": 0.9703367948532104, + "eval_runtime": 40.8069, + "eval_samples_per_second": 87.436, + "eval_steps_per_second": 10.93, + "step": 2320 + }, + { + "epoch": 5.97, + "learning_rate": 0.00012034188034188035, + "loss": 0.0199, + "step": 2330 + }, + { + "epoch": 6.0, + "learning_rate": 0.00012, + "loss": 0.0796, + "step": 2340 + }, + { + "epoch": 6.03, + "learning_rate": 0.00011965811965811966, + "loss": 0.0881, + "step": 2350 + }, + { + "epoch": 6.05, + "learning_rate": 0.00011931623931623932, + "loss": 0.0579, + "step": 2360 + }, + { + "epoch": 6.05, + "eval_accuracy": 0.7901234567901234, + "eval_loss": 0.9741098880767822, + "eval_runtime": 34.3499, + "eval_samples_per_second": 103.872, + "eval_steps_per_second": 12.984, + "step": 2360 + }, + { + "epoch": 6.08, + "learning_rate": 0.00011897435897435898, + "loss": 0.0237, + "step": 2370 + }, + { + "epoch": 6.1, + "learning_rate": 0.00011863247863247863, + "loss": 0.0536, + "step": 2380 + }, + { + "epoch": 6.13, + "learning_rate": 0.00011829059829059829, + "loss": 0.0947, + "step": 2390 + }, + { + "epoch": 6.15, + "learning_rate": 0.00011794871794871796, + "loss": 0.027, + "step": 2400 + }, + { + "epoch": 6.15, + "eval_accuracy": 0.77665544332211, + "eval_loss": 1.0114481449127197, + "eval_runtime": 32.0411, + "eval_samples_per_second": 111.357, + "eval_steps_per_second": 13.92, + "step": 2400 + }, + { + "epoch": 6.18, + "learning_rate": 0.00011760683760683762, + "loss": 0.0638, + "step": 2410 + }, + { + "epoch": 6.21, + "learning_rate": 0.00011726495726495728, + "loss": 0.015, + "step": 2420 + }, + { + "epoch": 6.23, + "learning_rate": 0.00011692307692307694, + "loss": 0.0663, + "step": 2430 + }, + { + "epoch": 6.26, + "learning_rate": 0.0001165811965811966, + "loss": 0.0202, + "step": 2440 + }, + { + "epoch": 6.26, + "eval_accuracy": 0.7929292929292929, + "eval_loss": 0.9184174537658691, + "eval_runtime": 32.1376, + "eval_samples_per_second": 111.022, + "eval_steps_per_second": 13.878, + "step": 2440 + }, + { + "epoch": 6.28, + "learning_rate": 0.00011623931623931625, + "loss": 0.0905, + "step": 2450 + }, + { + "epoch": 6.31, + "learning_rate": 0.00011589743589743591, + "loss": 0.0114, + "step": 2460 + }, + { + "epoch": 6.33, + "learning_rate": 0.00011555555555555555, + "loss": 0.0427, + "step": 2470 + }, + { + "epoch": 6.36, + "learning_rate": 0.00011521367521367521, + "loss": 0.0023, + "step": 2480 + }, + { + "epoch": 6.36, + "eval_accuracy": 0.7755331088664422, + "eval_loss": 1.037878155708313, + "eval_runtime": 32.6535, + "eval_samples_per_second": 109.269, + "eval_steps_per_second": 13.659, + "step": 2480 + }, + { + "epoch": 6.38, + "learning_rate": 0.00011487179487179487, + "loss": 0.0371, + "step": 2490 + }, + { + "epoch": 6.41, + "learning_rate": 0.00011452991452991453, + "loss": 0.0473, + "step": 2500 + }, + { + "epoch": 6.44, + "learning_rate": 0.00011418803418803418, + "loss": 0.0203, + "step": 2510 + }, + { + "epoch": 6.46, + "learning_rate": 0.00011384615384615384, + "loss": 0.1106, + "step": 2520 + }, + { + "epoch": 6.46, + "eval_accuracy": 0.7839506172839507, + "eval_loss": 0.9894450902938843, + "eval_runtime": 34.0314, + "eval_samples_per_second": 104.844, + "eval_steps_per_second": 13.106, + "step": 2520 + }, + { + "epoch": 6.49, + "learning_rate": 0.00011350427350427351, + "loss": 0.048, + "step": 2530 + }, + { + "epoch": 6.51, + "learning_rate": 0.00011316239316239317, + "loss": 0.0615, + "step": 2540 + }, + { + "epoch": 6.54, + "learning_rate": 0.00011282051282051283, + "loss": 0.0127, + "step": 2550 + }, + { + "epoch": 6.56, + "learning_rate": 0.00011247863247863249, + "loss": 0.0032, + "step": 2560 + }, + { + "epoch": 6.56, + "eval_accuracy": 0.8063973063973064, + "eval_loss": 0.912738025188446, + "eval_runtime": 36.8685, + "eval_samples_per_second": 96.776, + "eval_steps_per_second": 12.097, + "step": 2560 + }, + { + "epoch": 6.59, + "learning_rate": 0.00011213675213675214, + "loss": 0.0161, + "step": 2570 + }, + { + "epoch": 6.62, + "learning_rate": 0.0001117948717948718, + "loss": 0.0356, + "step": 2580 + }, + { + "epoch": 6.64, + "learning_rate": 0.00011145299145299146, + "loss": 0.0041, + "step": 2590 + }, + { + "epoch": 6.67, + "learning_rate": 0.00011111111111111112, + "loss": 0.0305, + "step": 2600 + }, + { + "epoch": 6.67, + "eval_accuracy": 0.803030303030303, + "eval_loss": 0.920154333114624, + "eval_runtime": 34.2244, + "eval_samples_per_second": 104.253, + "eval_steps_per_second": 13.032, + "step": 2600 + }, + { + "epoch": 6.69, + "learning_rate": 0.00011076923076923077, + "loss": 0.0022, + "step": 2610 + }, + { + "epoch": 6.72, + "learning_rate": 0.00011042735042735045, + "loss": 0.0211, + "step": 2620 + }, + { + "epoch": 6.74, + "learning_rate": 0.0001100854700854701, + "loss": 0.0031, + "step": 2630 + }, + { + "epoch": 6.77, + "learning_rate": 0.00010974358974358976, + "loss": 0.021, + "step": 2640 + }, + { + "epoch": 6.77, + "eval_accuracy": 0.7985409652076318, + "eval_loss": 0.9700907468795776, + "eval_runtime": 34.5087, + "eval_samples_per_second": 103.394, + "eval_steps_per_second": 12.924, + "step": 2640 + }, + { + "epoch": 6.79, + "learning_rate": 0.00010940170940170942, + "loss": 0.0434, + "step": 2650 + }, + { + "epoch": 6.82, + "learning_rate": 0.00010905982905982905, + "loss": 0.0163, + "step": 2660 + }, + { + "epoch": 6.85, + "learning_rate": 0.00010871794871794872, + "loss": 0.0366, + "step": 2670 + }, + { + "epoch": 6.87, + "learning_rate": 0.00010837606837606838, + "loss": 0.044, + "step": 2680 + }, + { + "epoch": 6.87, + "eval_accuracy": 0.792368125701459, + "eval_loss": 1.1099859476089478, + "eval_runtime": 34.3976, + "eval_samples_per_second": 103.728, + "eval_steps_per_second": 12.966, + "step": 2680 + }, + { + "epoch": 6.9, + "learning_rate": 0.00010803418803418804, + "loss": 0.0224, + "step": 2690 + }, + { + "epoch": 6.92, + "learning_rate": 0.0001076923076923077, + "loss": 0.0068, + "step": 2700 + }, + { + "epoch": 6.95, + "learning_rate": 0.00010735042735042735, + "loss": 0.0419, + "step": 2710 + }, + { + "epoch": 6.97, + "learning_rate": 0.00010700854700854701, + "loss": 0.0097, + "step": 2720 + }, + { + "epoch": 6.97, + "eval_accuracy": 0.7631874298540965, + "eval_loss": 1.2310634851455688, + "eval_runtime": 37.3556, + "eval_samples_per_second": 95.514, + "eval_steps_per_second": 11.939, + "step": 2720 + }, + { + "epoch": 7.0, + "learning_rate": 0.00010666666666666667, + "loss": 0.0381, + "step": 2730 + }, + { + "epoch": 7.03, + "learning_rate": 0.00010632478632478632, + "loss": 0.0017, + "step": 2740 + }, + { + "epoch": 7.05, + "learning_rate": 0.000105982905982906, + "loss": 0.0077, + "step": 2750 + }, + { + "epoch": 7.08, + "learning_rate": 0.00010564102564102565, + "loss": 0.0015, + "step": 2760 + }, + { + "epoch": 7.08, + "eval_accuracy": 0.8052749719416387, + "eval_loss": 0.9980599880218506, + "eval_runtime": 38.1071, + "eval_samples_per_second": 93.631, + "eval_steps_per_second": 11.704, + "step": 2760 + }, + { + "epoch": 7.1, + "learning_rate": 0.00010529914529914531, + "loss": 0.0017, + "step": 2770 + }, + { + "epoch": 7.13, + "learning_rate": 0.00010495726495726497, + "loss": 0.0041, + "step": 2780 + }, + { + "epoch": 7.15, + "learning_rate": 0.00010461538461538463, + "loss": 0.0348, + "step": 2790 + }, + { + "epoch": 7.18, + "learning_rate": 0.00010427350427350428, + "loss": 0.0034, + "step": 2800 + }, + { + "epoch": 7.18, + "eval_accuracy": 0.7929292929292929, + "eval_loss": 1.0722278356552124, + "eval_runtime": 34.0426, + "eval_samples_per_second": 104.81, + "eval_steps_per_second": 13.101, + "step": 2800 + }, + { + "epoch": 7.21, + "learning_rate": 0.00010393162393162394, + "loss": 0.0296, + "step": 2810 + }, + { + "epoch": 7.23, + "learning_rate": 0.0001035897435897436, + "loss": 0.0614, + "step": 2820 + }, + { + "epoch": 7.26, + "learning_rate": 0.00010324786324786326, + "loss": 0.0014, + "step": 2830 + }, + { + "epoch": 7.28, + "learning_rate": 0.0001029059829059829, + "loss": 0.0317, + "step": 2840 + }, + { + "epoch": 7.28, + "eval_accuracy": 0.7968574635241302, + "eval_loss": 1.049239158630371, + "eval_runtime": 34.4577, + "eval_samples_per_second": 103.547, + "eval_steps_per_second": 12.943, + "step": 2840 + }, + { + "epoch": 7.31, + "learning_rate": 0.00010256410256410256, + "loss": 0.0122, + "step": 2850 + }, + { + "epoch": 7.33, + "learning_rate": 0.00010222222222222222, + "loss": 0.0019, + "step": 2860 + }, + { + "epoch": 7.36, + "learning_rate": 0.00010188034188034187, + "loss": 0.0055, + "step": 2870 + }, + { + "epoch": 7.38, + "learning_rate": 0.00010153846153846153, + "loss": 0.0013, + "step": 2880 + }, + { + "epoch": 7.38, + "eval_accuracy": 0.792368125701459, + "eval_loss": 1.101838231086731, + "eval_runtime": 38.1047, + "eval_samples_per_second": 93.637, + "eval_steps_per_second": 11.705, + "step": 2880 + }, + { + "epoch": 7.41, + "learning_rate": 0.0001011965811965812, + "loss": 0.0027, + "step": 2890 + }, + { + "epoch": 7.44, + "learning_rate": 0.00010085470085470086, + "loss": 0.1045, + "step": 2900 + }, + { + "epoch": 7.46, + "learning_rate": 0.00010051282051282052, + "loss": 0.003, + "step": 2910 + }, + { + "epoch": 7.49, + "learning_rate": 0.00010017094017094018, + "loss": 0.0145, + "step": 2920 + }, + { + "epoch": 7.49, + "eval_accuracy": 0.7890011223344556, + "eval_loss": 1.1084059476852417, + "eval_runtime": 32.7356, + "eval_samples_per_second": 108.995, + "eval_steps_per_second": 13.624, + "step": 2920 + }, + { + "epoch": 7.51, + "learning_rate": 9.982905982905983e-05, + "loss": 0.0023, + "step": 2930 + }, + { + "epoch": 7.54, + "learning_rate": 9.948717948717949e-05, + "loss": 0.003, + "step": 2940 + }, + { + "epoch": 7.56, + "learning_rate": 9.914529914529915e-05, + "loss": 0.0292, + "step": 2950 + }, + { + "epoch": 7.59, + "learning_rate": 9.88034188034188e-05, + "loss": 0.0171, + "step": 2960 + }, + { + "epoch": 7.59, + "eval_accuracy": 0.8069584736251403, + "eval_loss": 1.0281527042388916, + "eval_runtime": 32.3307, + "eval_samples_per_second": 110.359, + "eval_steps_per_second": 13.795, + "step": 2960 + }, + { + "epoch": 7.62, + "learning_rate": 9.846153846153848e-05, + "loss": 0.0433, + "step": 2970 + }, + { + "epoch": 7.64, + "learning_rate": 9.811965811965812e-05, + "loss": 0.0178, + "step": 2980 + }, + { + "epoch": 7.67, + "learning_rate": 9.777777777777778e-05, + "loss": 0.0218, + "step": 2990 + }, + { + "epoch": 7.69, + "learning_rate": 9.743589743589744e-05, + "loss": 0.0429, + "step": 3000 + }, + { + "epoch": 7.69, + "eval_accuracy": 0.7901234567901234, + "eval_loss": 1.107517123222351, + "eval_runtime": 32.4213, + "eval_samples_per_second": 110.051, + "eval_steps_per_second": 13.756, + "step": 3000 + }, + { + "epoch": 7.72, + "learning_rate": 9.70940170940171e-05, + "loss": 0.0068, + "step": 3010 + }, + { + "epoch": 7.74, + "learning_rate": 9.675213675213675e-05, + "loss": 0.0084, + "step": 3020 + }, + { + "epoch": 7.77, + "learning_rate": 9.641025641025641e-05, + "loss": 0.0019, + "step": 3030 + }, + { + "epoch": 7.79, + "learning_rate": 9.606837606837608e-05, + "loss": 0.0012, + "step": 3040 + }, + { + "epoch": 7.79, + "eval_accuracy": 0.7951739618406285, + "eval_loss": 1.1173745393753052, + "eval_runtime": 33.3697, + "eval_samples_per_second": 106.923, + "eval_steps_per_second": 13.365, + "step": 3040 + }, + { + "epoch": 7.82, + "learning_rate": 9.572649572649574e-05, + "loss": 0.0012, + "step": 3050 + }, + { + "epoch": 7.85, + "learning_rate": 9.53846153846154e-05, + "loss": 0.0148, + "step": 3060 + }, + { + "epoch": 7.87, + "learning_rate": 9.504273504273504e-05, + "loss": 0.0396, + "step": 3070 + }, + { + "epoch": 7.9, + "learning_rate": 9.47008547008547e-05, + "loss": 0.0267, + "step": 3080 + }, + { + "epoch": 7.9, + "eval_accuracy": 0.7895622895622896, + "eval_loss": 1.1118639707565308, + "eval_runtime": 39.7858, + "eval_samples_per_second": 89.68, + "eval_steps_per_second": 11.21, + "step": 3080 + }, + { + "epoch": 7.92, + "learning_rate": 9.435897435897436e-05, + "loss": 0.0531, + "step": 3090 + }, + { + "epoch": 7.95, + "learning_rate": 9.401709401709401e-05, + "loss": 0.0025, + "step": 3100 + }, + { + "epoch": 7.97, + "learning_rate": 9.367521367521369e-05, + "loss": 0.0388, + "step": 3110 + }, + { + "epoch": 8.0, + "learning_rate": 9.333333333333334e-05, + "loss": 0.0313, + "step": 3120 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.7951739618406285, + "eval_loss": 1.1246873140335083, + "eval_runtime": 33.2457, + "eval_samples_per_second": 107.322, + "eval_steps_per_second": 13.415, + "step": 3120 + }, + { + "epoch": 8.03, + "learning_rate": 9.2991452991453e-05, + "loss": 0.0015, + "step": 3130 + }, + { + "epoch": 8.05, + "learning_rate": 9.264957264957266e-05, + "loss": 0.001, + "step": 3140 + }, + { + "epoch": 8.08, + "learning_rate": 9.230769230769232e-05, + "loss": 0.0048, + "step": 3150 + }, + { + "epoch": 8.1, + "learning_rate": 9.196581196581196e-05, + "loss": 0.001, + "step": 3160 + }, + { + "epoch": 8.1, + "eval_accuracy": 0.7861952861952862, + "eval_loss": 1.1706517934799194, + "eval_runtime": 34.9683, + "eval_samples_per_second": 102.035, + "eval_steps_per_second": 12.754, + "step": 3160 + }, + { + "epoch": 8.13, + "learning_rate": 9.162393162393162e-05, + "loss": 0.0203, + "step": 3170 + }, + { + "epoch": 8.15, + "learning_rate": 9.128205128205129e-05, + "loss": 0.001, + "step": 3180 + }, + { + "epoch": 8.18, + "learning_rate": 9.094017094017095e-05, + "loss": 0.0019, + "step": 3190 + }, + { + "epoch": 8.21, + "learning_rate": 9.05982905982906e-05, + "loss": 0.0372, + "step": 3200 + }, + { + "epoch": 8.21, + "eval_accuracy": 0.8024691358024691, + "eval_loss": 1.1111818552017212, + "eval_runtime": 33.7848, + "eval_samples_per_second": 105.61, + "eval_steps_per_second": 13.201, + "step": 3200 + }, + { + "epoch": 8.23, + "learning_rate": 9.025641025641026e-05, + "loss": 0.0031, + "step": 3210 + }, + { + "epoch": 8.26, + "learning_rate": 8.991452991452992e-05, + "loss": 0.002, + "step": 3220 + }, + { + "epoch": 8.28, + "learning_rate": 8.957264957264958e-05, + "loss": 0.0009, + "step": 3230 + }, + { + "epoch": 8.31, + "learning_rate": 8.923076923076924e-05, + "loss": 0.0088, + "step": 3240 + }, + { + "epoch": 8.31, + "eval_accuracy": 0.8092031425364759, + "eval_loss": 1.0615540742874146, + "eval_runtime": 32.6537, + "eval_samples_per_second": 109.268, + "eval_steps_per_second": 13.658, + "step": 3240 + }, + { + "epoch": 8.33, + "learning_rate": 8.888888888888889e-05, + "loss": 0.0009, + "step": 3250 + }, + { + "epoch": 8.36, + "learning_rate": 8.854700854700855e-05, + "loss": 0.0389, + "step": 3260 + }, + { + "epoch": 8.38, + "learning_rate": 8.820512820512821e-05, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 8.41, + "learning_rate": 8.786324786324787e-05, + "loss": 0.0155, + "step": 3280 + }, + { + "epoch": 8.41, + "eval_accuracy": 0.7968574635241302, + "eval_loss": 1.1376060247421265, + "eval_runtime": 34.6264, + "eval_samples_per_second": 103.043, + "eval_steps_per_second": 12.88, + "step": 3280 + }, + { + "epoch": 8.44, + "learning_rate": 8.752136752136752e-05, + "loss": 0.0011, + "step": 3290 + }, + { + "epoch": 8.46, + "learning_rate": 8.717948717948718e-05, + "loss": 0.0013, + "step": 3300 + }, + { + "epoch": 8.49, + "learning_rate": 8.683760683760684e-05, + "loss": 0.0017, + "step": 3310 + }, + { + "epoch": 8.51, + "learning_rate": 8.64957264957265e-05, + "loss": 0.0095, + "step": 3320 + }, + { + "epoch": 8.51, + "eval_accuracy": 0.7794612794612794, + "eval_loss": 1.210555911064148, + "eval_runtime": 33.508, + "eval_samples_per_second": 106.482, + "eval_steps_per_second": 13.31, + "step": 3320 + }, + { + "epoch": 8.54, + "learning_rate": 8.615384615384617e-05, + "loss": 0.0133, + "step": 3330 + }, + { + "epoch": 8.56, + "learning_rate": 8.581196581196581e-05, + "loss": 0.0007, + "step": 3340 + }, + { + "epoch": 8.59, + "learning_rate": 8.547008547008547e-05, + "loss": 0.0008, + "step": 3350 + }, + { + "epoch": 8.62, + "learning_rate": 8.512820512820513e-05, + "loss": 0.0008, + "step": 3360 + }, + { + "epoch": 8.62, + "eval_accuracy": 0.797979797979798, + "eval_loss": 1.1278263330459595, + "eval_runtime": 33.3484, + "eval_samples_per_second": 106.992, + "eval_steps_per_second": 13.374, + "step": 3360 + }, + { + "epoch": 8.64, + "learning_rate": 8.478632478632479e-05, + "loss": 0.0062, + "step": 3370 + }, + { + "epoch": 8.67, + "learning_rate": 8.444444444444444e-05, + "loss": 0.0464, + "step": 3380 + }, + { + "epoch": 8.69, + "learning_rate": 8.410256410256411e-05, + "loss": 0.0314, + "step": 3390 + }, + { + "epoch": 8.72, + "learning_rate": 8.376068376068377e-05, + "loss": 0.0068, + "step": 3400 + }, + { + "epoch": 8.72, + "eval_accuracy": 0.8108866442199776, + "eval_loss": 1.0497138500213623, + "eval_runtime": 34.5617, + "eval_samples_per_second": 103.236, + "eval_steps_per_second": 12.904, + "step": 3400 + }, + { + "epoch": 8.74, + "learning_rate": 8.341880341880343e-05, + "loss": 0.0007, + "step": 3410 + }, + { + "epoch": 8.77, + "learning_rate": 8.307692307692309e-05, + "loss": 0.0009, + "step": 3420 + }, + { + "epoch": 8.79, + "learning_rate": 8.273504273504273e-05, + "loss": 0.0235, + "step": 3430 + }, + { + "epoch": 8.82, + "learning_rate": 8.239316239316239e-05, + "loss": 0.0446, + "step": 3440 + }, + { + "epoch": 8.82, + "eval_accuracy": 0.8024691358024691, + "eval_loss": 1.1239783763885498, + "eval_runtime": 35.6118, + "eval_samples_per_second": 100.192, + "eval_steps_per_second": 12.524, + "step": 3440 + }, + { + "epoch": 8.85, + "learning_rate": 8.205128205128205e-05, + "loss": 0.0454, + "step": 3450 + }, + { + "epoch": 8.87, + "learning_rate": 8.170940170940172e-05, + "loss": 0.0012, + "step": 3460 + }, + { + "epoch": 8.9, + "learning_rate": 8.136752136752138e-05, + "loss": 0.0222, + "step": 3470 + }, + { + "epoch": 8.92, + "learning_rate": 8.102564102564103e-05, + "loss": 0.0026, + "step": 3480 + }, + { + "epoch": 8.92, + "eval_accuracy": 0.8120089786756454, + "eval_loss": 1.0241341590881348, + "eval_runtime": 44.9153, + "eval_samples_per_second": 79.438, + "eval_steps_per_second": 9.93, + "step": 3480 + }, + { + "epoch": 8.95, + "learning_rate": 8.068376068376069e-05, + "loss": 0.0112, + "step": 3490 + }, + { + "epoch": 8.97, + "learning_rate": 8.034188034188035e-05, + "loss": 0.0044, + "step": 3500 + }, + { + "epoch": 9.0, + "learning_rate": 8e-05, + "loss": 0.0125, + "step": 3510 + }, + { + "epoch": 9.03, + "learning_rate": 7.965811965811965e-05, + "loss": 0.0315, + "step": 3520 + }, + { + "epoch": 9.03, + "eval_accuracy": 0.8024691358024691, + "eval_loss": 1.0902154445648193, + "eval_runtime": 33.6468, + "eval_samples_per_second": 106.043, + "eval_steps_per_second": 13.255, + "step": 3520 + }, + { + "epoch": 9.05, + "learning_rate": 7.931623931623932e-05, + "loss": 0.0312, + "step": 3530 + }, + { + "epoch": 9.08, + "learning_rate": 7.897435897435898e-05, + "loss": 0.0011, + "step": 3540 + }, + { + "epoch": 9.1, + "learning_rate": 7.863247863247864e-05, + "loss": 0.0101, + "step": 3550 + }, + { + "epoch": 9.13, + "learning_rate": 7.82905982905983e-05, + "loss": 0.0262, + "step": 3560 + }, + { + "epoch": 9.13, + "eval_accuracy": 0.8035914702581369, + "eval_loss": 1.0600807666778564, + "eval_runtime": 34.7033, + "eval_samples_per_second": 102.814, + "eval_steps_per_second": 12.852, + "step": 3560 + }, + { + "epoch": 9.15, + "learning_rate": 7.794871794871795e-05, + "loss": 0.0007, + "step": 3570 + }, + { + "epoch": 9.18, + "learning_rate": 7.760683760683761e-05, + "loss": 0.0008, + "step": 3580 + }, + { + "epoch": 9.21, + "learning_rate": 7.726495726495727e-05, + "loss": 0.0008, + "step": 3590 + }, + { + "epoch": 9.23, + "learning_rate": 7.692307692307693e-05, + "loss": 0.0014, + "step": 3600 + }, + { + "epoch": 9.23, + "eval_accuracy": 0.7946127946127947, + "eval_loss": 1.1410149335861206, + "eval_runtime": 36.6565, + "eval_samples_per_second": 97.336, + "eval_steps_per_second": 12.167, + "step": 3600 + }, + { + "epoch": 9.26, + "learning_rate": 7.65811965811966e-05, + "loss": 0.021, + "step": 3610 + }, + { + "epoch": 9.28, + "learning_rate": 7.623931623931624e-05, + "loss": 0.0006, + "step": 3620 + }, + { + "epoch": 9.31, + "learning_rate": 7.58974358974359e-05, + "loss": 0.0007, + "step": 3630 + }, + { + "epoch": 9.33, + "learning_rate": 7.555555555555556e-05, + "loss": 0.0006, + "step": 3640 + }, + { + "epoch": 9.33, + "eval_accuracy": 0.8103254769921436, + "eval_loss": 1.0399866104125977, + "eval_runtime": 33.4528, + "eval_samples_per_second": 106.658, + "eval_steps_per_second": 13.332, + "step": 3640 + }, + { + "epoch": 9.36, + "learning_rate": 7.521367521367521e-05, + "loss": 0.0006, + "step": 3650 + }, + { + "epoch": 9.38, + "learning_rate": 7.487179487179487e-05, + "loss": 0.0009, + "step": 3660 + }, + { + "epoch": 9.41, + "learning_rate": 7.452991452991453e-05, + "loss": 0.0368, + "step": 3670 + }, + { + "epoch": 9.44, + "learning_rate": 7.41880341880342e-05, + "loss": 0.0063, + "step": 3680 + }, + { + "epoch": 9.44, + "eval_accuracy": 0.8058361391694725, + "eval_loss": 1.107193112373352, + "eval_runtime": 41.6725, + "eval_samples_per_second": 85.62, + "eval_steps_per_second": 10.702, + "step": 3680 + }, + { + "epoch": 9.46, + "learning_rate": 7.384615384615386e-05, + "loss": 0.0207, + "step": 3690 + }, + { + "epoch": 9.49, + "learning_rate": 7.350427350427352e-05, + "loss": 0.0006, + "step": 3700 + }, + { + "epoch": 9.51, + "learning_rate": 7.316239316239316e-05, + "loss": 0.0025, + "step": 3710 + }, + { + "epoch": 9.54, + "learning_rate": 7.282051282051282e-05, + "loss": 0.0006, + "step": 3720 + }, + { + "epoch": 9.54, + "eval_accuracy": 0.8013468013468014, + "eval_loss": 1.1391396522521973, + "eval_runtime": 32.0452, + "eval_samples_per_second": 111.343, + "eval_steps_per_second": 13.918, + "step": 3720 + }, + { + "epoch": 9.56, + "learning_rate": 7.247863247863248e-05, + "loss": 0.0031, + "step": 3730 + }, + { + "epoch": 9.59, + "learning_rate": 7.213675213675213e-05, + "loss": 0.001, + "step": 3740 + }, + { + "epoch": 9.62, + "learning_rate": 7.17948717948718e-05, + "loss": 0.0297, + "step": 3750 + }, + { + "epoch": 9.64, + "learning_rate": 7.145299145299146e-05, + "loss": 0.0007, + "step": 3760 + }, + { + "epoch": 9.64, + "eval_accuracy": 0.8013468013468014, + "eval_loss": 1.124064326286316, + "eval_runtime": 32.9836, + "eval_samples_per_second": 108.175, + "eval_steps_per_second": 13.522, + "step": 3760 + }, + { + "epoch": 9.67, + "learning_rate": 7.111111111111112e-05, + "loss": 0.0006, + "step": 3770 + }, + { + "epoch": 9.69, + "learning_rate": 7.076923076923078e-05, + "loss": 0.0006, + "step": 3780 + }, + { + "epoch": 9.72, + "learning_rate": 7.042735042735044e-05, + "loss": 0.0216, + "step": 3790 + }, + { + "epoch": 9.74, + "learning_rate": 7.008547008547008e-05, + "loss": 0.0006, + "step": 3800 + }, + { + "epoch": 9.74, + "eval_accuracy": 0.7789001122334456, + "eval_loss": 1.2622841596603394, + "eval_runtime": 34.3515, + "eval_samples_per_second": 103.867, + "eval_steps_per_second": 12.983, + "step": 3800 + }, + { + "epoch": 9.77, + "learning_rate": 6.974358974358974e-05, + "loss": 0.0007, + "step": 3810 + }, + { + "epoch": 9.79, + "learning_rate": 6.940170940170941e-05, + "loss": 0.0015, + "step": 3820 + }, + { + "epoch": 9.82, + "learning_rate": 6.905982905982907e-05, + "loss": 0.0009, + "step": 3830 + }, + { + "epoch": 9.85, + "learning_rate": 6.871794871794872e-05, + "loss": 0.0006, + "step": 3840 + }, + { + "epoch": 9.85, + "eval_accuracy": 0.7991021324354658, + "eval_loss": 1.1737140417099, + "eval_runtime": 43.7608, + "eval_samples_per_second": 81.534, + "eval_steps_per_second": 10.192, + "step": 3840 + }, + { + "epoch": 9.87, + "learning_rate": 6.837606837606838e-05, + "loss": 0.0005, + "step": 3850 + }, + { + "epoch": 9.9, + "learning_rate": 6.803418803418804e-05, + "loss": 0.0014, + "step": 3860 + }, + { + "epoch": 9.92, + "learning_rate": 6.76923076923077e-05, + "loss": 0.0006, + "step": 3870 + }, + { + "epoch": 9.95, + "learning_rate": 6.735042735042735e-05, + "loss": 0.0011, + "step": 3880 + }, + { + "epoch": 9.95, + "eval_accuracy": 0.7884399551066218, + "eval_loss": 1.2243778705596924, + "eval_runtime": 36.7265, + "eval_samples_per_second": 97.15, + "eval_steps_per_second": 12.144, + "step": 3880 + }, + { + "epoch": 9.97, + "learning_rate": 6.700854700854701e-05, + "loss": 0.0015, + "step": 3890 + }, + { + "epoch": 10.0, + "learning_rate": 6.666666666666667e-05, + "loss": 0.0117, + "step": 3900 + }, + { + "epoch": 10.03, + "learning_rate": 6.632478632478633e-05, + "loss": 0.0007, + "step": 3910 + }, + { + "epoch": 10.05, + "learning_rate": 6.598290598290599e-05, + "loss": 0.0047, + "step": 3920 + }, + { + "epoch": 10.05, + "eval_accuracy": 0.8002244668911336, + "eval_loss": 1.185262680053711, + "eval_runtime": 32.7462, + "eval_samples_per_second": 108.959, + "eval_steps_per_second": 13.62, + "step": 3920 + }, + { + "epoch": 10.08, + "learning_rate": 6.564102564102564e-05, + "loss": 0.0006, + "step": 3930 + }, + { + "epoch": 10.1, + "learning_rate": 6.52991452991453e-05, + "loss": 0.0288, + "step": 3940 + }, + { + "epoch": 10.13, + "learning_rate": 6.495726495726496e-05, + "loss": 0.0005, + "step": 3950 + }, + { + "epoch": 10.15, + "learning_rate": 6.461538461538462e-05, + "loss": 0.0018, + "step": 3960 + }, + { + "epoch": 10.15, + "eval_accuracy": 0.8092031425364759, + "eval_loss": 1.1372754573822021, + "eval_runtime": 34.7934, + "eval_samples_per_second": 102.548, + "eval_steps_per_second": 12.819, + "step": 3960 + }, + { + "epoch": 10.18, + "learning_rate": 6.427350427350429e-05, + "loss": 0.0005, + "step": 3970 + }, + { + "epoch": 10.21, + "learning_rate": 6.393162393162393e-05, + "loss": 0.0005, + "step": 3980 + }, + { + "epoch": 10.23, + "learning_rate": 6.358974358974359e-05, + "loss": 0.0005, + "step": 3990 + }, + { + "epoch": 10.26, + "learning_rate": 6.324786324786325e-05, + "loss": 0.0006, + "step": 4000 + }, + { + "epoch": 10.26, + "eval_accuracy": 0.8069584736251403, + "eval_loss": 1.1416363716125488, + "eval_runtime": 33.7666, + "eval_samples_per_second": 105.667, + "eval_steps_per_second": 13.208, + "step": 4000 + }, + { + "epoch": 10.28, + "learning_rate": 6.29059829059829e-05, + "loss": 0.0005, + "step": 4010 + }, + { + "epoch": 10.31, + "learning_rate": 6.256410256410256e-05, + "loss": 0.012, + "step": 4020 + }, + { + "epoch": 10.33, + "learning_rate": 6.222222222222222e-05, + "loss": 0.0013, + "step": 4030 + }, + { + "epoch": 10.36, + "learning_rate": 6.188034188034189e-05, + "loss": 0.007, + "step": 4040 + }, + { + "epoch": 10.36, + "eval_accuracy": 0.8047138047138047, + "eval_loss": 1.205945372581482, + "eval_runtime": 32.5065, + "eval_samples_per_second": 109.763, + "eval_steps_per_second": 13.72, + "step": 4040 + }, + { + "epoch": 10.38, + "learning_rate": 6.153846153846155e-05, + "loss": 0.0005, + "step": 4050 + }, + { + "epoch": 10.41, + "learning_rate": 6.11965811965812e-05, + "loss": 0.0096, + "step": 4060 + }, + { + "epoch": 10.44, + "learning_rate": 6.085470085470085e-05, + "loss": 0.0008, + "step": 4070 + }, + { + "epoch": 10.46, + "learning_rate": 6.0512820512820515e-05, + "loss": 0.0005, + "step": 4080 + }, + { + "epoch": 10.46, + "eval_accuracy": 0.8170594837261503, + "eval_loss": 1.131420612335205, + "eval_runtime": 36.6164, + "eval_samples_per_second": 97.443, + "eval_steps_per_second": 12.18, + "step": 4080 + }, + { + "epoch": 10.49, + "learning_rate": 6.017094017094017e-05, + "loss": 0.0005, + "step": 4090 + }, + { + "epoch": 10.51, + "learning_rate": 5.982905982905983e-05, + "loss": 0.0006, + "step": 4100 + }, + { + "epoch": 10.54, + "learning_rate": 5.948717948717949e-05, + "loss": 0.0005, + "step": 4110 + }, + { + "epoch": 10.56, + "learning_rate": 5.9145299145299146e-05, + "loss": 0.0005, + "step": 4120 + }, + { + "epoch": 10.56, + "eval_accuracy": 0.8164983164983165, + "eval_loss": 1.1375317573547363, + "eval_runtime": 32.4301, + "eval_samples_per_second": 110.021, + "eval_steps_per_second": 13.753, + "step": 4120 + }, + { + "epoch": 10.59, + "learning_rate": 5.880341880341881e-05, + "loss": 0.0004, + "step": 4130 + }, + { + "epoch": 10.62, + "learning_rate": 5.846153846153847e-05, + "loss": 0.0005, + "step": 4140 + }, + { + "epoch": 10.64, + "learning_rate": 5.8119658119658126e-05, + "loss": 0.0005, + "step": 4150 + }, + { + "epoch": 10.67, + "learning_rate": 5.7777777777777776e-05, + "loss": 0.0004, + "step": 4160 + }, + { + "epoch": 10.67, + "eval_accuracy": 0.8164983164983165, + "eval_loss": 1.136337161064148, + "eval_runtime": 31.869, + "eval_samples_per_second": 111.959, + "eval_steps_per_second": 13.995, + "step": 4160 + }, + { + "epoch": 10.69, + "learning_rate": 5.7435897435897434e-05, + "loss": 0.0004, + "step": 4170 + }, + { + "epoch": 10.72, + "learning_rate": 5.709401709401709e-05, + "loss": 0.0004, + "step": 4180 + }, + { + "epoch": 10.74, + "learning_rate": 5.6752136752136756e-05, + "loss": 0.0005, + "step": 4190 + }, + { + "epoch": 10.77, + "learning_rate": 5.6410256410256414e-05, + "loss": 0.0004, + "step": 4200 + }, + { + "epoch": 10.77, + "eval_accuracy": 0.819304152637486, + "eval_loss": 1.1374398469924927, + "eval_runtime": 34.7543, + "eval_samples_per_second": 102.664, + "eval_steps_per_second": 12.833, + "step": 4200 + }, + { + "epoch": 10.79, + "learning_rate": 5.606837606837607e-05, + "loss": 0.0007, + "step": 4210 + }, + { + "epoch": 10.82, + "learning_rate": 5.572649572649573e-05, + "loss": 0.0004, + "step": 4220 + }, + { + "epoch": 10.85, + "learning_rate": 5.538461538461539e-05, + "loss": 0.0005, + "step": 4230 + }, + { + "epoch": 10.87, + "learning_rate": 5.504273504273505e-05, + "loss": 0.0009, + "step": 4240 + }, + { + "epoch": 10.87, + "eval_accuracy": 0.8069584736251403, + "eval_loss": 1.1739455461502075, + "eval_runtime": 33.5639, + "eval_samples_per_second": 106.305, + "eval_steps_per_second": 13.288, + "step": 4240 + }, + { + "epoch": 10.9, + "learning_rate": 5.470085470085471e-05, + "loss": 0.0005, + "step": 4250 + }, + { + "epoch": 10.92, + "learning_rate": 5.435897435897436e-05, + "loss": 0.0005, + "step": 4260 + }, + { + "epoch": 10.95, + "learning_rate": 5.401709401709402e-05, + "loss": 0.0004, + "step": 4270 + }, + { + "epoch": 10.97, + "learning_rate": 5.3675213675213675e-05, + "loss": 0.0004, + "step": 4280 + }, + { + "epoch": 10.97, + "eval_accuracy": 0.8131313131313131, + "eval_loss": 1.145166277885437, + "eval_runtime": 32.5186, + "eval_samples_per_second": 109.722, + "eval_steps_per_second": 13.715, + "step": 4280 + }, + { + "epoch": 11.0, + "learning_rate": 5.333333333333333e-05, + "loss": 0.0004, + "step": 4290 + }, + { + "epoch": 11.03, + "learning_rate": 5.2991452991453e-05, + "loss": 0.0004, + "step": 4300 + }, + { + "epoch": 11.05, + "learning_rate": 5.2649572649572655e-05, + "loss": 0.0011, + "step": 4310 + }, + { + "epoch": 11.08, + "learning_rate": 5.230769230769231e-05, + "loss": 0.0004, + "step": 4320 + }, + { + "epoch": 11.08, + "eval_accuracy": 0.797979797979798, + "eval_loss": 1.2409276962280273, + "eval_runtime": 32.9205, + "eval_samples_per_second": 108.382, + "eval_steps_per_second": 13.548, + "step": 4320 + }, + { + "epoch": 11.1, + "learning_rate": 5.196581196581197e-05, + "loss": 0.0005, + "step": 4330 + }, + { + "epoch": 11.13, + "learning_rate": 5.162393162393163e-05, + "loss": 0.0004, + "step": 4340 + }, + { + "epoch": 11.15, + "learning_rate": 5.128205128205128e-05, + "loss": 0.0074, + "step": 4350 + }, + { + "epoch": 11.18, + "learning_rate": 5.094017094017094e-05, + "loss": 0.0004, + "step": 4360 + }, + { + "epoch": 11.18, + "eval_accuracy": 0.8063973063973064, + "eval_loss": 1.1721081733703613, + "eval_runtime": 33.0524, + "eval_samples_per_second": 107.95, + "eval_steps_per_second": 13.494, + "step": 4360 + }, + { + "epoch": 11.21, + "learning_rate": 5.05982905982906e-05, + "loss": 0.0004, + "step": 4370 + }, + { + "epoch": 11.23, + "learning_rate": 5.025641025641026e-05, + "loss": 0.0004, + "step": 4380 + }, + { + "epoch": 11.26, + "learning_rate": 4.991452991452992e-05, + "loss": 0.0021, + "step": 4390 + }, + { + "epoch": 11.28, + "learning_rate": 4.9572649572649575e-05, + "loss": 0.0004, + "step": 4400 + }, + { + "epoch": 11.28, + "eval_accuracy": 0.8041526374859708, + "eval_loss": 1.195190191268921, + "eval_runtime": 34.4526, + "eval_samples_per_second": 103.563, + "eval_steps_per_second": 12.945, + "step": 4400 + }, + { + "epoch": 11.31, + "learning_rate": 4.923076923076924e-05, + "loss": 0.0004, + "step": 4410 + }, + { + "epoch": 11.33, + "learning_rate": 4.888888888888889e-05, + "loss": 0.0004, + "step": 4420 + }, + { + "epoch": 11.36, + "learning_rate": 4.854700854700855e-05, + "loss": 0.0004, + "step": 4430 + }, + { + "epoch": 11.38, + "learning_rate": 4.8205128205128205e-05, + "loss": 0.0004, + "step": 4440 + }, + { + "epoch": 11.38, + "eval_accuracy": 0.8024691358024691, + "eval_loss": 1.2087780237197876, + "eval_runtime": 37.4802, + "eval_samples_per_second": 95.197, + "eval_steps_per_second": 11.9, + "step": 4440 + }, + { + "epoch": 11.41, + "learning_rate": 4.786324786324787e-05, + "loss": 0.0004, + "step": 4450 + }, + { + "epoch": 11.44, + "learning_rate": 4.752136752136752e-05, + "loss": 0.0023, + "step": 4460 + }, + { + "epoch": 11.46, + "learning_rate": 4.717948717948718e-05, + "loss": 0.0004, + "step": 4470 + }, + { + "epoch": 11.49, + "learning_rate": 4.683760683760684e-05, + "loss": 0.0004, + "step": 4480 + }, + { + "epoch": 11.49, + "eval_accuracy": 0.8170594837261503, + "eval_loss": 1.1391326189041138, + "eval_runtime": 33.734, + "eval_samples_per_second": 105.769, + "eval_steps_per_second": 13.221, + "step": 4480 + }, + { + "epoch": 11.51, + "learning_rate": 4.64957264957265e-05, + "loss": 0.0004, + "step": 4490 + }, + { + "epoch": 11.54, + "learning_rate": 4.615384615384616e-05, + "loss": 0.0004, + "step": 4500 + }, + { + "epoch": 11.56, + "learning_rate": 4.581196581196581e-05, + "loss": 0.0007, + "step": 4510 + }, + { + "epoch": 11.59, + "learning_rate": 4.5470085470085474e-05, + "loss": 0.0004, + "step": 4520 + }, + { + "epoch": 11.59, + "eval_accuracy": 0.8148148148148148, + "eval_loss": 1.1593190431594849, + "eval_runtime": 32.1112, + "eval_samples_per_second": 111.114, + "eval_steps_per_second": 13.889, + "step": 4520 + }, + { + "epoch": 11.62, + "learning_rate": 4.512820512820513e-05, + "loss": 0.0004, + "step": 4530 + }, + { + "epoch": 11.64, + "learning_rate": 4.478632478632479e-05, + "loss": 0.0004, + "step": 4540 + }, + { + "epoch": 11.67, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.0003, + "step": 4550 + }, + { + "epoch": 11.69, + "learning_rate": 4.4102564102564104e-05, + "loss": 0.0004, + "step": 4560 + }, + { + "epoch": 11.69, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.1428256034851074, + "eval_runtime": 34.6547, + "eval_samples_per_second": 102.959, + "eval_steps_per_second": 12.87, + "step": 4560 + }, + { + "epoch": 11.72, + "learning_rate": 4.376068376068376e-05, + "loss": 0.0004, + "step": 4570 + }, + { + "epoch": 11.74, + "learning_rate": 4.341880341880342e-05, + "loss": 0.0003, + "step": 4580 + }, + { + "epoch": 11.77, + "learning_rate": 4.3076923076923084e-05, + "loss": 0.0003, + "step": 4590 + }, + { + "epoch": 11.79, + "learning_rate": 4.2735042735042735e-05, + "loss": 0.0003, + "step": 4600 + }, + { + "epoch": 11.79, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.1417738199234009, + "eval_runtime": 33.7453, + "eval_samples_per_second": 105.733, + "eval_steps_per_second": 13.217, + "step": 4600 + }, + { + "epoch": 11.82, + "learning_rate": 4.239316239316239e-05, + "loss": 0.0003, + "step": 4610 + }, + { + "epoch": 11.85, + "learning_rate": 4.205128205128206e-05, + "loss": 0.0003, + "step": 4620 + }, + { + "epoch": 11.87, + "learning_rate": 4.1709401709401715e-05, + "loss": 0.0003, + "step": 4630 + }, + { + "epoch": 11.9, + "learning_rate": 4.1367521367521366e-05, + "loss": 0.0003, + "step": 4640 + }, + { + "epoch": 11.9, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.1414417028427124, + "eval_runtime": 31.736, + "eval_samples_per_second": 112.427, + "eval_steps_per_second": 14.053, + "step": 4640 + }, + { + "epoch": 11.92, + "learning_rate": 4.1025641025641023e-05, + "loss": 0.0003, + "step": 4650 + }, + { + "epoch": 11.95, + "learning_rate": 4.068376068376069e-05, + "loss": 0.0003, + "step": 4660 + }, + { + "epoch": 11.97, + "learning_rate": 4.0341880341880346e-05, + "loss": 0.0005, + "step": 4670 + }, + { + "epoch": 12.0, + "learning_rate": 4e-05, + "loss": 0.0006, + "step": 4680 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.8215488215488216, + "eval_loss": 1.160527229309082, + "eval_runtime": 32.9127, + "eval_samples_per_second": 108.408, + "eval_steps_per_second": 13.551, + "step": 4680 + }, + { + "epoch": 12.03, + "learning_rate": 3.965811965811966e-05, + "loss": 0.0003, + "step": 4690 + }, + { + "epoch": 12.05, + "learning_rate": 3.931623931623932e-05, + "loss": 0.0004, + "step": 4700 + }, + { + "epoch": 12.08, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0003, + "step": 4710 + }, + { + "epoch": 12.1, + "learning_rate": 3.8632478632478634e-05, + "loss": 0.0003, + "step": 4720 + }, + { + "epoch": 12.1, + "eval_accuracy": 0.8080808080808081, + "eval_loss": 1.209052562713623, + "eval_runtime": 32.6946, + "eval_samples_per_second": 109.131, + "eval_steps_per_second": 13.641, + "step": 4720 + }, + { + "epoch": 12.13, + "learning_rate": 3.82905982905983e-05, + "loss": 0.0004, + "step": 4730 + }, + { + "epoch": 12.15, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0003, + "step": 4740 + }, + { + "epoch": 12.18, + "learning_rate": 3.760683760683761e-05, + "loss": 0.0004, + "step": 4750 + }, + { + "epoch": 12.21, + "learning_rate": 3.7264957264957265e-05, + "loss": 0.0005, + "step": 4760 + }, + { + "epoch": 12.21, + "eval_accuracy": 0.8232323232323232, + "eval_loss": 1.1692516803741455, + "eval_runtime": 33.4085, + "eval_samples_per_second": 106.799, + "eval_steps_per_second": 13.35, + "step": 4760 + }, + { + "epoch": 12.23, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0003, + "step": 4770 + }, + { + "epoch": 12.26, + "learning_rate": 3.658119658119658e-05, + "loss": 0.0003, + "step": 4780 + }, + { + "epoch": 12.28, + "learning_rate": 3.623931623931624e-05, + "loss": 0.0004, + "step": 4790 + }, + { + "epoch": 12.31, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0003, + "step": 4800 + }, + { + "epoch": 12.31, + "eval_accuracy": 0.8215488215488216, + "eval_loss": 1.152122974395752, + "eval_runtime": 34.5292, + "eval_samples_per_second": 103.333, + "eval_steps_per_second": 12.917, + "step": 4800 + }, + { + "epoch": 12.33, + "learning_rate": 3.555555555555556e-05, + "loss": 0.0005, + "step": 4810 + }, + { + "epoch": 12.36, + "learning_rate": 3.521367521367522e-05, + "loss": 0.0003, + "step": 4820 + }, + { + "epoch": 12.38, + "learning_rate": 3.487179487179487e-05, + "loss": 0.0003, + "step": 4830 + }, + { + "epoch": 12.41, + "learning_rate": 3.452991452991453e-05, + "loss": 0.0003, + "step": 4840 + }, + { + "epoch": 12.41, + "eval_accuracy": 0.8125701459034792, + "eval_loss": 1.1672531366348267, + "eval_runtime": 32.9525, + "eval_samples_per_second": 108.277, + "eval_steps_per_second": 13.535, + "step": 4840 + }, + { + "epoch": 12.44, + "learning_rate": 3.418803418803419e-05, + "loss": 0.0003, + "step": 4850 + }, + { + "epoch": 12.46, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0003, + "step": 4860 + }, + { + "epoch": 12.49, + "learning_rate": 3.3504273504273506e-05, + "loss": 0.0003, + "step": 4870 + }, + { + "epoch": 12.51, + "learning_rate": 3.3162393162393164e-05, + "loss": 0.0003, + "step": 4880 + }, + { + "epoch": 12.51, + "eval_accuracy": 0.8125701459034792, + "eval_loss": 1.1674937009811401, + "eval_runtime": 34.1043, + "eval_samples_per_second": 104.62, + "eval_steps_per_second": 13.078, + "step": 4880 + }, + { + "epoch": 12.54, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0003, + "step": 4890 + }, + { + "epoch": 12.56, + "learning_rate": 3.247863247863248e-05, + "loss": 0.0003, + "step": 4900 + }, + { + "epoch": 12.59, + "learning_rate": 3.2136752136752144e-05, + "loss": 0.0003, + "step": 4910 + }, + { + "epoch": 12.62, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0003, + "step": 4920 + }, + { + "epoch": 12.62, + "eval_accuracy": 0.8148148148148148, + "eval_loss": 1.1658987998962402, + "eval_runtime": 34.5805, + "eval_samples_per_second": 103.179, + "eval_steps_per_second": 12.897, + "step": 4920 + }, + { + "epoch": 12.64, + "learning_rate": 3.145299145299145e-05, + "loss": 0.0003, + "step": 4930 + }, + { + "epoch": 12.67, + "learning_rate": 3.111111111111111e-05, + "loss": 0.0003, + "step": 4940 + }, + { + "epoch": 12.69, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0003, + "step": 4950 + }, + { + "epoch": 12.72, + "learning_rate": 3.0427350427350425e-05, + "loss": 0.0003, + "step": 4960 + }, + { + "epoch": 12.72, + "eval_accuracy": 0.8142536475869809, + "eval_loss": 1.1660560369491577, + "eval_runtime": 33.7463, + "eval_samples_per_second": 105.73, + "eval_steps_per_second": 13.216, + "step": 4960 + }, + { + "epoch": 12.74, + "learning_rate": 3.0085470085470086e-05, + "loss": 0.0003, + "step": 4970 + }, + { + "epoch": 12.77, + "learning_rate": 2.9743589743589744e-05, + "loss": 0.0003, + "step": 4980 + }, + { + "epoch": 12.79, + "learning_rate": 2.9401709401709405e-05, + "loss": 0.0003, + "step": 4990 + }, + { + "epoch": 12.82, + "learning_rate": 2.9059829059829063e-05, + "loss": 0.0003, + "step": 5000 + }, + { + "epoch": 12.82, + "eval_accuracy": 0.8159371492704826, + "eval_loss": 1.1644172668457031, + "eval_runtime": 32.5332, + "eval_samples_per_second": 109.672, + "eval_steps_per_second": 13.709, + "step": 5000 + }, + { + "epoch": 12.85, + "learning_rate": 2.8717948717948717e-05, + "loss": 0.0003, + "step": 5010 + }, + { + "epoch": 12.87, + "learning_rate": 2.8376068376068378e-05, + "loss": 0.0003, + "step": 5020 + }, + { + "epoch": 12.9, + "learning_rate": 2.8034188034188036e-05, + "loss": 0.0003, + "step": 5030 + }, + { + "epoch": 12.92, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0003, + "step": 5040 + }, + { + "epoch": 12.92, + "eval_accuracy": 0.8148148148148148, + "eval_loss": 1.1629841327667236, + "eval_runtime": 34.2224, + "eval_samples_per_second": 104.259, + "eval_steps_per_second": 13.032, + "step": 5040 + }, + { + "epoch": 12.95, + "learning_rate": 2.7350427350427355e-05, + "loss": 0.0003, + "step": 5050 + }, + { + "epoch": 12.97, + "learning_rate": 2.700854700854701e-05, + "loss": 0.0003, + "step": 5060 + }, + { + "epoch": 13.0, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.0003, + "step": 5070 + }, + { + "epoch": 13.03, + "learning_rate": 2.6324786324786328e-05, + "loss": 0.0003, + "step": 5080 + }, + { + "epoch": 13.03, + "eval_accuracy": 0.8159371492704826, + "eval_loss": 1.1632208824157715, + "eval_runtime": 41.2446, + "eval_samples_per_second": 86.508, + "eval_steps_per_second": 10.814, + "step": 5080 + }, + { + "epoch": 13.05, + "learning_rate": 2.5982905982905985e-05, + "loss": 0.0003, + "step": 5090 + }, + { + "epoch": 13.08, + "learning_rate": 2.564102564102564e-05, + "loss": 0.0003, + "step": 5100 + }, + { + "epoch": 13.1, + "learning_rate": 2.52991452991453e-05, + "loss": 0.0003, + "step": 5110 + }, + { + "epoch": 13.13, + "learning_rate": 2.495726495726496e-05, + "loss": 0.0003, + "step": 5120 + }, + { + "epoch": 13.13, + "eval_accuracy": 0.8164983164983165, + "eval_loss": 1.1635642051696777, + "eval_runtime": 33.779, + "eval_samples_per_second": 105.628, + "eval_steps_per_second": 13.203, + "step": 5120 + }, + { + "epoch": 13.15, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0003, + "step": 5130 + }, + { + "epoch": 13.18, + "learning_rate": 2.4273504273504274e-05, + "loss": 0.0003, + "step": 5140 + }, + { + "epoch": 13.21, + "learning_rate": 2.3931623931623935e-05, + "loss": 0.0003, + "step": 5150 + }, + { + "epoch": 13.23, + "learning_rate": 2.358974358974359e-05, + "loss": 0.0003, + "step": 5160 + }, + { + "epoch": 13.23, + "eval_accuracy": 0.8170594837261503, + "eval_loss": 1.1645357608795166, + "eval_runtime": 34.4551, + "eval_samples_per_second": 103.555, + "eval_steps_per_second": 12.944, + "step": 5160 + }, + { + "epoch": 13.26, + "learning_rate": 2.324786324786325e-05, + "loss": 0.0003, + "step": 5170 + }, + { + "epoch": 13.28, + "learning_rate": 2.2905982905982905e-05, + "loss": 0.0003, + "step": 5180 + }, + { + "epoch": 13.31, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0003, + "step": 5190 + }, + { + "epoch": 13.33, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0003, + "step": 5200 + }, + { + "epoch": 13.33, + "eval_accuracy": 0.8164983164983165, + "eval_loss": 1.1655912399291992, + "eval_runtime": 33.1731, + "eval_samples_per_second": 107.557, + "eval_steps_per_second": 13.445, + "step": 5200 + }, + { + "epoch": 13.36, + "learning_rate": 2.188034188034188e-05, + "loss": 0.0003, + "step": 5210 + }, + { + "epoch": 13.38, + "learning_rate": 2.1538461538461542e-05, + "loss": 0.0003, + "step": 5220 + }, + { + "epoch": 13.41, + "learning_rate": 2.1196581196581196e-05, + "loss": 0.0003, + "step": 5230 + }, + { + "epoch": 13.44, + "learning_rate": 2.0854700854700857e-05, + "loss": 0.0003, + "step": 5240 + }, + { + "epoch": 13.44, + "eval_accuracy": 0.8164983164983165, + "eval_loss": 1.1652870178222656, + "eval_runtime": 32.1186, + "eval_samples_per_second": 111.088, + "eval_steps_per_second": 13.886, + "step": 5240 + }, + { + "epoch": 13.46, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0003, + "step": 5250 + }, + { + "epoch": 13.49, + "learning_rate": 2.0170940170940173e-05, + "loss": 0.0003, + "step": 5260 + }, + { + "epoch": 13.51, + "learning_rate": 1.982905982905983e-05, + "loss": 0.0003, + "step": 5270 + }, + { + "epoch": 13.54, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0003, + "step": 5280 + }, + { + "epoch": 13.54, + "eval_accuracy": 0.8164983164983165, + "eval_loss": 1.1658546924591064, + "eval_runtime": 35.35, + "eval_samples_per_second": 100.933, + "eval_steps_per_second": 12.617, + "step": 5280 + }, + { + "epoch": 13.56, + "learning_rate": 1.914529914529915e-05, + "loss": 0.0003, + "step": 5290 + }, + { + "epoch": 13.59, + "learning_rate": 1.8803418803418804e-05, + "loss": 0.0003, + "step": 5300 + }, + { + "epoch": 13.62, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0003, + "step": 5310 + }, + { + "epoch": 13.64, + "learning_rate": 1.811965811965812e-05, + "loss": 0.0003, + "step": 5320 + }, + { + "epoch": 13.64, + "eval_accuracy": 0.8170594837261503, + "eval_loss": 1.166174054145813, + "eval_runtime": 33.5318, + "eval_samples_per_second": 106.406, + "eval_steps_per_second": 13.301, + "step": 5320 + }, + { + "epoch": 13.67, + "learning_rate": 1.777777777777778e-05, + "loss": 0.0003, + "step": 5330 + }, + { + "epoch": 13.69, + "learning_rate": 1.7435897435897434e-05, + "loss": 0.0003, + "step": 5340 + }, + { + "epoch": 13.72, + "learning_rate": 1.7094017094017095e-05, + "loss": 0.0003, + "step": 5350 + }, + { + "epoch": 13.74, + "learning_rate": 1.6752136752136753e-05, + "loss": 0.0003, + "step": 5360 + }, + { + "epoch": 13.74, + "eval_accuracy": 0.8176206509539843, + "eval_loss": 1.1661781072616577, + "eval_runtime": 32.3616, + "eval_samples_per_second": 110.254, + "eval_steps_per_second": 13.782, + "step": 5360 + }, + { + "epoch": 13.77, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0003, + "step": 5370 + }, + { + "epoch": 13.79, + "learning_rate": 1.6068376068376072e-05, + "loss": 0.0003, + "step": 5380 + }, + { + "epoch": 13.82, + "learning_rate": 1.5726495726495726e-05, + "loss": 0.0003, + "step": 5390 + }, + { + "epoch": 13.85, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0003, + "step": 5400 + }, + { + "epoch": 13.85, + "eval_accuracy": 0.8176206509539843, + "eval_loss": 1.1663531064987183, + "eval_runtime": 35.1534, + "eval_samples_per_second": 101.498, + "eval_steps_per_second": 12.687, + "step": 5400 + }, + { + "epoch": 13.87, + "learning_rate": 1.5042735042735043e-05, + "loss": 0.0003, + "step": 5410 + }, + { + "epoch": 13.9, + "learning_rate": 1.4700854700854703e-05, + "loss": 0.0003, + "step": 5420 + }, + { + "epoch": 13.92, + "learning_rate": 1.4358974358974359e-05, + "loss": 0.0003, + "step": 5430 + }, + { + "epoch": 13.95, + "learning_rate": 1.4017094017094018e-05, + "loss": 0.0002, + "step": 5440 + }, + { + "epoch": 13.95, + "eval_accuracy": 0.8181818181818182, + "eval_loss": 1.1670769453048706, + "eval_runtime": 33.5689, + "eval_samples_per_second": 106.289, + "eval_steps_per_second": 13.286, + "step": 5440 + }, + { + "epoch": 13.97, + "learning_rate": 1.3675213675213677e-05, + "loss": 0.0003, + "step": 5450 + }, + { + "epoch": 14.0, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0003, + "step": 5460 + }, + { + "epoch": 14.03, + "learning_rate": 1.2991452991452993e-05, + "loss": 0.0003, + "step": 5470 + }, + { + "epoch": 14.05, + "learning_rate": 1.264957264957265e-05, + "loss": 0.0003, + "step": 5480 + }, + { + "epoch": 14.05, + "eval_accuracy": 0.8181818181818182, + "eval_loss": 1.1676580905914307, + "eval_runtime": 42.4697, + "eval_samples_per_second": 84.013, + "eval_steps_per_second": 10.502, + "step": 5480 + }, + { + "epoch": 14.08, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0003, + "step": 5490 + }, + { + "epoch": 14.1, + "learning_rate": 1.1965811965811967e-05, + "loss": 0.0003, + "step": 5500 + }, + { + "epoch": 14.13, + "learning_rate": 1.1623931623931625e-05, + "loss": 0.0003, + "step": 5510 + }, + { + "epoch": 14.15, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0003, + "step": 5520 + }, + { + "epoch": 14.15, + "eval_accuracy": 0.8181818181818182, + "eval_loss": 1.1681442260742188, + "eval_runtime": 33.5678, + "eval_samples_per_second": 106.292, + "eval_steps_per_second": 13.287, + "step": 5520 + }, + { + "epoch": 14.18, + "learning_rate": 1.094017094017094e-05, + "loss": 0.0003, + "step": 5530 + }, + { + "epoch": 14.21, + "learning_rate": 1.0598290598290598e-05, + "loss": 0.0003, + "step": 5540 + }, + { + "epoch": 14.23, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0003, + "step": 5550 + }, + { + "epoch": 14.26, + "learning_rate": 9.914529914529915e-06, + "loss": 0.0003, + "step": 5560 + }, + { + "epoch": 14.26, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.1684342622756958, + "eval_runtime": 32.4759, + "eval_samples_per_second": 109.866, + "eval_steps_per_second": 13.733, + "step": 5560 + }, + { + "epoch": 14.28, + "learning_rate": 9.572649572649575e-06, + "loss": 0.0003, + "step": 5570 + }, + { + "epoch": 14.31, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0003, + "step": 5580 + }, + { + "epoch": 14.33, + "learning_rate": 8.88888888888889e-06, + "loss": 0.0003, + "step": 5590 + }, + { + "epoch": 14.36, + "learning_rate": 8.547008547008548e-06, + "loss": 0.0003, + "step": 5600 + }, + { + "epoch": 14.36, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.1688289642333984, + "eval_runtime": 32.3951, + "eval_samples_per_second": 110.14, + "eval_steps_per_second": 13.768, + "step": 5600 + }, + { + "epoch": 14.38, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0002, + "step": 5610 + }, + { + "epoch": 14.41, + "learning_rate": 7.863247863247863e-06, + "loss": 0.0002, + "step": 5620 + }, + { + "epoch": 14.44, + "learning_rate": 7.521367521367522e-06, + "loss": 0.0003, + "step": 5630 + }, + { + "epoch": 14.46, + "learning_rate": 7.179487179487179e-06, + "loss": 0.0003, + "step": 5640 + }, + { + "epoch": 14.46, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.168882131576538, + "eval_runtime": 31.9999, + "eval_samples_per_second": 111.5, + "eval_steps_per_second": 13.938, + "step": 5640 + }, + { + "epoch": 14.49, + "learning_rate": 6.837606837606839e-06, + "loss": 0.0003, + "step": 5650 + }, + { + "epoch": 14.51, + "learning_rate": 6.495726495726496e-06, + "loss": 0.0003, + "step": 5660 + }, + { + "epoch": 14.54, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0003, + "step": 5670 + }, + { + "epoch": 14.56, + "learning_rate": 5.8119658119658126e-06, + "loss": 0.0002, + "step": 5680 + }, + { + "epoch": 14.56, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.1692728996276855, + "eval_runtime": 33.4951, + "eval_samples_per_second": 106.523, + "eval_steps_per_second": 13.315, + "step": 5680 + }, + { + "epoch": 14.59, + "learning_rate": 5.47008547008547e-06, + "loss": 0.0002, + "step": 5690 + }, + { + "epoch": 14.62, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0003, + "step": 5700 + }, + { + "epoch": 14.64, + "learning_rate": 4.786324786324787e-06, + "loss": 0.0002, + "step": 5710 + }, + { + "epoch": 14.67, + "learning_rate": 4.444444444444445e-06, + "loss": 0.0002, + "step": 5720 + }, + { + "epoch": 14.67, + "eval_accuracy": 0.8187429854096521, + "eval_loss": 1.169299602508545, + "eval_runtime": 32.5001, + "eval_samples_per_second": 109.784, + "eval_steps_per_second": 13.723, + "step": 5720 + }, + { + "epoch": 14.69, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0003, + "step": 5730 + }, + { + "epoch": 14.72, + "learning_rate": 3.760683760683761e-06, + "loss": 0.0003, + "step": 5740 + }, + { + "epoch": 14.74, + "learning_rate": 3.4188034188034193e-06, + "loss": 0.0003, + "step": 5750 + }, + { + "epoch": 14.77, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0002, + "step": 5760 + }, + { + "epoch": 14.77, + "eval_accuracy": 0.819304152637486, + "eval_loss": 1.1695406436920166, + "eval_runtime": 33.5775, + "eval_samples_per_second": 106.262, + "eval_steps_per_second": 13.283, + "step": 5760 + }, + { + "epoch": 14.79, + "learning_rate": 2.735042735042735e-06, + "loss": 0.0003, + "step": 5770 + }, + { + "epoch": 14.82, + "learning_rate": 2.3931623931623937e-06, + "loss": 0.0002, + "step": 5780 + }, + { + "epoch": 14.85, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.0003, + "step": 5790 + }, + { + "epoch": 14.87, + "learning_rate": 1.7094017094017097e-06, + "loss": 0.0003, + "step": 5800 + }, + { + "epoch": 14.87, + "eval_accuracy": 0.819304152637486, + "eval_loss": 1.1696120500564575, + "eval_runtime": 34.2314, + "eval_samples_per_second": 104.232, + "eval_steps_per_second": 13.029, + "step": 5800 + }, + { + "epoch": 14.9, + "learning_rate": 1.3675213675213676e-06, + "loss": 0.0003, + "step": 5810 + }, + { + "epoch": 14.92, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0003, + "step": 5820 + }, + { + "epoch": 14.95, + "learning_rate": 6.837606837606838e-07, + "loss": 0.0002, + "step": 5830 + }, + { + "epoch": 14.97, + "learning_rate": 3.418803418803419e-07, + "loss": 0.0002, + "step": 5840 + }, + { + "epoch": 14.97, + "eval_accuracy": 0.819304152637486, + "eval_loss": 1.1696137189865112, + "eval_runtime": 43.2941, + "eval_samples_per_second": 82.413, + "eval_steps_per_second": 10.302, + "step": 5840 + } + ], + "max_steps": 5850, + "num_train_epochs": 15, + "total_flos": 7.231346758696919e+18, + "trial_name": null, + "trial_params": null +}