{ "best_metric": 2.513946056365967, "best_model_checkpoint": "checkpoints-finetuning/checkpoint-1080", "epoch": 193.14128943758573, "eval_steps": 40, "global_step": 2200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.51, "learning_rate": 2.0000000000000002e-07, "loss": 3.595, "step": 40 }, { "epoch": 3.51, "eval_loss": 3.5299072265625, "eval_runtime": 5.0148, "eval_samples_per_second": 62.416, "eval_steps_per_second": 15.753, "step": 40 }, { "epoch": 7.02, "learning_rate": 4.0000000000000003e-07, "loss": 3.4769, "step": 80 }, { "epoch": 7.02, "eval_loss": 3.3721721172332764, "eval_runtime": 4.4435, "eval_samples_per_second": 70.441, "eval_steps_per_second": 17.779, "step": 80 }, { "epoch": 10.53, "learning_rate": 6.000000000000001e-07, "loss": 3.3037, "step": 120 }, { "epoch": 10.53, "eval_loss": 3.1870808601379395, "eval_runtime": 4.6407, "eval_samples_per_second": 67.446, "eval_steps_per_second": 17.023, "step": 120 }, { "epoch": 14.05, "learning_rate": 8.000000000000001e-07, "loss": 3.1255, "step": 160 }, { "epoch": 14.05, "eval_loss": 3.0087945461273193, "eval_runtime": 4.7026, "eval_samples_per_second": 66.559, "eval_steps_per_second": 16.799, "step": 160 }, { "epoch": 17.56, "learning_rate": 1.0000000000000002e-06, "loss": 2.9615, "step": 200 }, { "epoch": 17.56, "eval_loss": 2.8684051036834717, "eval_runtime": 4.6401, "eval_samples_per_second": 67.455, "eval_steps_per_second": 17.026, "step": 200 }, { "epoch": 21.07, "learning_rate": 1.2000000000000002e-06, "loss": 2.8468, "step": 240 }, { "epoch": 21.07, "eval_loss": 2.780834436416626, "eval_runtime": 4.4221, "eval_samples_per_second": 70.78, "eval_steps_per_second": 17.865, "step": 240 }, { "epoch": 24.58, "learning_rate": 1.4000000000000001e-06, "loss": 2.7699, "step": 280 }, { "epoch": 24.58, "eval_loss": 2.720453977584839, "eval_runtime": 4.5663, "eval_samples_per_second": 68.546, "eval_steps_per_second": 17.301, "step": 280 }, { "epoch": 28.09, "learning_rate": 1.6000000000000001e-06, "loss": 2.7139, "step": 320 }, { "epoch": 28.09, "eval_loss": 2.679349422454834, "eval_runtime": 4.7784, "eval_samples_per_second": 65.504, "eval_steps_per_second": 16.533, "step": 320 }, { "epoch": 31.6, "learning_rate": 1.8000000000000001e-06, "loss": 2.6712, "step": 360 }, { "epoch": 31.6, "eval_loss": 2.650853395462036, "eval_runtime": 4.6713, "eval_samples_per_second": 67.005, "eval_steps_per_second": 16.912, "step": 360 }, { "epoch": 35.12, "learning_rate": 2.0000000000000003e-06, "loss": 2.6356, "step": 400 }, { "epoch": 35.12, "eval_loss": 2.6293749809265137, "eval_runtime": 4.6364, "eval_samples_per_second": 67.51, "eval_steps_per_second": 17.039, "step": 400 }, { "epoch": 38.63, "learning_rate": 2.2e-06, "loss": 2.6048, "step": 440 }, { "epoch": 38.63, "eval_loss": 2.611950635910034, "eval_runtime": 4.5116, "eval_samples_per_second": 69.377, "eval_steps_per_second": 17.511, "step": 440 }, { "epoch": 42.14, "learning_rate": 2.4000000000000003e-06, "loss": 2.5823, "step": 480 }, { "epoch": 42.14, "eval_loss": 2.597449541091919, "eval_runtime": 4.5213, "eval_samples_per_second": 69.228, "eval_steps_per_second": 17.473, "step": 480 }, { "epoch": 45.65, "learning_rate": 2.6e-06, "loss": 2.5536, "step": 520 }, { "epoch": 45.65, "eval_loss": 2.5848779678344727, "eval_runtime": 4.4398, "eval_samples_per_second": 70.498, "eval_steps_per_second": 17.793, "step": 520 }, { "epoch": 49.16, "learning_rate": 2.8000000000000003e-06, "loss": 2.5293, "step": 560 }, { "epoch": 49.16, "eval_loss": 2.574049472808838, "eval_runtime": 4.6572, "eval_samples_per_second": 67.208, "eval_steps_per_second": 16.963, "step": 560 }, { "epoch": 52.67, "learning_rate": 3e-06, "loss": 2.5058, "step": 600 }, { "epoch": 52.67, "eval_loss": 2.5643808841705322, "eval_runtime": 4.6849, "eval_samples_per_second": 66.81, "eval_steps_per_second": 16.863, "step": 600 }, { "epoch": 56.19, "learning_rate": 3.2000000000000003e-06, "loss": 2.482, "step": 640 }, { "epoch": 56.19, "eval_loss": 2.555607557296753, "eval_runtime": 4.491, "eval_samples_per_second": 69.695, "eval_steps_per_second": 17.591, "step": 640 }, { "epoch": 59.7, "learning_rate": 3.4000000000000005e-06, "loss": 2.4575, "step": 680 }, { "epoch": 59.7, "eval_loss": 2.547734260559082, "eval_runtime": 4.6182, "eval_samples_per_second": 67.776, "eval_steps_per_second": 17.106, "step": 680 }, { "epoch": 63.21, "learning_rate": 3.6000000000000003e-06, "loss": 2.4339, "step": 720 }, { "epoch": 63.21, "eval_loss": 2.5405359268188477, "eval_runtime": 4.5137, "eval_samples_per_second": 69.345, "eval_steps_per_second": 17.502, "step": 720 }, { "epoch": 66.72, "learning_rate": 3.8000000000000005e-06, "loss": 2.4073, "step": 760 }, { "epoch": 66.72, "eval_loss": 2.5350451469421387, "eval_runtime": 4.6034, "eval_samples_per_second": 67.993, "eval_steps_per_second": 17.161, "step": 760 }, { "epoch": 70.23, "learning_rate": 4.000000000000001e-06, "loss": 2.3845, "step": 800 }, { "epoch": 70.23, "eval_loss": 2.530299186706543, "eval_runtime": 4.6325, "eval_samples_per_second": 67.566, "eval_steps_per_second": 17.053, "step": 800 }, { "epoch": 73.74, "learning_rate": 4.2000000000000004e-06, "loss": 2.3606, "step": 840 }, { "epoch": 73.74, "eval_loss": 2.525312662124634, "eval_runtime": 4.4668, "eval_samples_per_second": 70.072, "eval_steps_per_second": 17.686, "step": 840 }, { "epoch": 77.26, "learning_rate": 4.4e-06, "loss": 2.329, "step": 880 }, { "epoch": 77.26, "eval_loss": 2.5215225219726562, "eval_runtime": 4.4699, "eval_samples_per_second": 70.023, "eval_steps_per_second": 17.674, "step": 880 }, { "epoch": 80.77, "learning_rate": 4.600000000000001e-06, "loss": 2.3071, "step": 920 }, { "epoch": 80.77, "eval_loss": 2.5184576511383057, "eval_runtime": 4.3807, "eval_samples_per_second": 71.45, "eval_steps_per_second": 18.034, "step": 920 }, { "epoch": 84.28, "learning_rate": 4.800000000000001e-06, "loss": 2.2768, "step": 960 }, { "epoch": 84.28, "eval_loss": 2.515460729598999, "eval_runtime": 4.6634, "eval_samples_per_second": 67.119, "eval_steps_per_second": 16.941, "step": 960 }, { "epoch": 87.79, "learning_rate": 5e-06, "loss": 2.2479, "step": 1000 }, { "epoch": 87.79, "eval_loss": 2.514392852783203, "eval_runtime": 4.5583, "eval_samples_per_second": 68.665, "eval_steps_per_second": 17.331, "step": 1000 }, { "epoch": 91.3, "learning_rate": 4.986304738420684e-06, "loss": 2.2181, "step": 1040 }, { "epoch": 91.3, "eval_loss": 2.515076160430908, "eval_runtime": 4.6324, "eval_samples_per_second": 67.568, "eval_steps_per_second": 17.054, "step": 1040 }, { "epoch": 94.81, "learning_rate": 4.9453690018345144e-06, "loss": 2.1901, "step": 1080 }, { "epoch": 94.81, "eval_loss": 2.513946056365967, "eval_runtime": 4.635, "eval_samples_per_second": 67.53, "eval_steps_per_second": 17.044, "step": 1080 }, { "epoch": 98.33, "learning_rate": 4.8776412907378845e-06, "loss": 2.1571, "step": 1120 }, { "epoch": 98.33, "eval_loss": 2.514775037765503, "eval_runtime": 4.7132, "eval_samples_per_second": 66.41, "eval_steps_per_second": 16.762, "step": 1120 }, { "epoch": 101.84, "learning_rate": 4.783863644106502e-06, "loss": 2.1308, "step": 1160 }, { "epoch": 101.84, "eval_loss": 2.5165762901306152, "eval_runtime": 4.6347, "eval_samples_per_second": 67.535, "eval_steps_per_second": 17.046, "step": 1160 }, { "epoch": 105.35, "learning_rate": 4.665063509461098e-06, "loss": 2.1032, "step": 1200 }, { "epoch": 105.35, "eval_loss": 2.5192971229553223, "eval_runtime": 4.6292, "eval_samples_per_second": 67.614, "eval_steps_per_second": 17.066, "step": 1200 }, { "epoch": 108.86, "learning_rate": 4.522542485937369e-06, "loss": 2.0761, "step": 1240 }, { "epoch": 108.86, "eval_loss": 2.5203866958618164, "eval_runtime": 4.6638, "eval_samples_per_second": 67.113, "eval_steps_per_second": 16.939, "step": 1240 }, { "epoch": 112.37, "learning_rate": 4.357862063693486e-06, "loss": 2.0495, "step": 1280 }, { "epoch": 112.37, "eval_loss": 2.5268709659576416, "eval_runtime": 4.6504, "eval_samples_per_second": 67.306, "eval_steps_per_second": 16.988, "step": 1280 }, { "epoch": 115.88, "learning_rate": 4.172826515897146e-06, "loss": 2.0231, "step": 1320 }, { "epoch": 115.88, "eval_loss": 2.5284526348114014, "eval_runtime": 4.6029, "eval_samples_per_second": 68.0, "eval_steps_per_second": 17.163, "step": 1320 }, { "epoch": 119.4, "learning_rate": 3.969463130731183e-06, "loss": 2.0021, "step": 1360 }, { "epoch": 119.4, "eval_loss": 2.5327632427215576, "eval_runtime": 4.7118, "eval_samples_per_second": 66.429, "eval_steps_per_second": 16.767, "step": 1360 }, { "epoch": 122.91, "learning_rate": 3.7500000000000005e-06, "loss": 1.9793, "step": 1400 }, { "epoch": 122.91, "eval_loss": 2.5382816791534424, "eval_runtime": 4.6299, "eval_samples_per_second": 67.603, "eval_steps_per_second": 17.063, "step": 1400 }, { "epoch": 126.42, "learning_rate": 3.516841607689501e-06, "loss": 1.9575, "step": 1440 }, { "epoch": 126.42, "eval_loss": 2.5441536903381348, "eval_runtime": 4.6442, "eval_samples_per_second": 67.396, "eval_steps_per_second": 17.01, "step": 1440 }, { "epoch": 129.93, "learning_rate": 3.272542485937369e-06, "loss": 1.9368, "step": 1480 }, { "epoch": 129.93, "eval_loss": 2.5487852096557617, "eval_runtime": 4.6396, "eval_samples_per_second": 67.462, "eval_steps_per_second": 17.027, "step": 1480 }, { "epoch": 133.44, "learning_rate": 3.019779227044398e-06, "loss": 1.9216, "step": 1520 }, { "epoch": 133.44, "eval_loss": 2.5533745288848877, "eval_runtime": 4.6038, "eval_samples_per_second": 67.987, "eval_steps_per_second": 17.16, "step": 1520 }, { "epoch": 136.95, "learning_rate": 2.761321158169134e-06, "loss": 1.902, "step": 1560 }, { "epoch": 136.95, "eval_loss": 2.558429479598999, "eval_runtime": 4.605, "eval_samples_per_second": 67.969, "eval_steps_per_second": 17.155, "step": 1560 }, { "epoch": 140.47, "learning_rate": 2.5e-06, "loss": 1.8885, "step": 1600 }, { "epoch": 140.47, "eval_loss": 2.560931444168091, "eval_runtime": 4.6137, "eval_samples_per_second": 67.842, "eval_steps_per_second": 17.123, "step": 1600 }, { "epoch": 143.98, "learning_rate": 2.238678841830867e-06, "loss": 1.8728, "step": 1640 }, { "epoch": 143.98, "eval_loss": 2.565746307373047, "eval_runtime": 4.6085, "eval_samples_per_second": 67.918, "eval_steps_per_second": 17.142, "step": 1640 }, { "epoch": 147.49, "learning_rate": 1.9802207729556023e-06, "loss": 1.8605, "step": 1680 }, { "epoch": 147.49, "eval_loss": 2.569748640060425, "eval_runtime": 4.6652, "eval_samples_per_second": 67.092, "eval_steps_per_second": 16.934, "step": 1680 }, { "epoch": 151.0, "learning_rate": 1.7274575140626318e-06, "loss": 1.8476, "step": 1720 }, { "epoch": 151.0, "eval_loss": 2.5741446018218994, "eval_runtime": 4.7429, "eval_samples_per_second": 65.994, "eval_steps_per_second": 16.657, "step": 1720 }, { "epoch": 154.51, "learning_rate": 1.4831583923105e-06, "loss": 1.8402, "step": 1760 }, { "epoch": 154.51, "eval_loss": 2.5770394802093506, "eval_runtime": 4.6184, "eval_samples_per_second": 67.772, "eval_steps_per_second": 17.105, "step": 1760 }, { "epoch": 158.02, "learning_rate": 1.2500000000000007e-06, "loss": 1.8274, "step": 1800 }, { "epoch": 158.02, "eval_loss": 2.580260992050171, "eval_runtime": 4.5687, "eval_samples_per_second": 68.509, "eval_steps_per_second": 17.291, "step": 1800 }, { "epoch": 161.54, "learning_rate": 1.0305368692688175e-06, "loss": 1.8218, "step": 1840 }, { "epoch": 161.54, "eval_loss": 2.582859992980957, "eval_runtime": 4.6266, "eval_samples_per_second": 67.653, "eval_steps_per_second": 17.075, "step": 1840 }, { "epoch": 165.05, "learning_rate": 8.271734841028553e-07, "loss": 1.8144, "step": 1880 }, { "epoch": 165.05, "eval_loss": 2.5846669673919678, "eval_runtime": 4.601, "eval_samples_per_second": 68.029, "eval_steps_per_second": 17.17, "step": 1880 }, { "epoch": 168.56, "learning_rate": 6.421379363065142e-07, "loss": 1.8097, "step": 1920 }, { "epoch": 168.56, "eval_loss": 2.5867464542388916, "eval_runtime": 4.593, "eval_samples_per_second": 68.148, "eval_steps_per_second": 17.2, "step": 1920 }, { "epoch": 172.07, "learning_rate": 4.774575140626317e-07, "loss": 1.8076, "step": 1960 }, { "epoch": 172.07, "eval_loss": 2.5882575511932373, "eval_runtime": 4.601, "eval_samples_per_second": 68.028, "eval_steps_per_second": 17.17, "step": 1960 }, { "epoch": 175.58, "learning_rate": 3.3493649053890325e-07, "loss": 1.8014, "step": 2000 }, { "epoch": 175.58, "eval_loss": 2.589245080947876, "eval_runtime": 4.5976, "eval_samples_per_second": 68.079, "eval_steps_per_second": 17.183, "step": 2000 }, { "epoch": 179.09, "learning_rate": 2.1613635589349756e-07, "loss": 1.8001, "step": 2040 }, { "epoch": 179.09, "eval_loss": 2.589866876602173, "eval_runtime": 4.5824, "eval_samples_per_second": 68.305, "eval_steps_per_second": 17.24, "step": 2040 }, { "epoch": 182.61, "learning_rate": 1.223587092621162e-07, "loss": 1.7987, "step": 2080 }, { "epoch": 182.61, "eval_loss": 2.5903093814849854, "eval_runtime": 4.6146, "eval_samples_per_second": 67.829, "eval_steps_per_second": 17.12, "step": 2080 }, { "epoch": 186.12, "learning_rate": 5.463099816548578e-08, "loss": 1.7971, "step": 2120 }, { "epoch": 186.12, "eval_loss": 2.590583562850952, "eval_runtime": 4.609, "eval_samples_per_second": 67.911, "eval_steps_per_second": 17.141, "step": 2120 }, { "epoch": 189.63, "learning_rate": 1.3695261579316776e-08, "loss": 1.7979, "step": 2160 }, { "epoch": 189.63, "eval_loss": 2.5907208919525146, "eval_runtime": 4.6125, "eval_samples_per_second": 67.859, "eval_steps_per_second": 17.127, "step": 2160 }, { "epoch": 193.14, "learning_rate": 0.0, "loss": 1.7975, "step": 2200 }, { "epoch": 193.14, "eval_loss": 2.590698719024658, "eval_runtime": 4.6213, "eval_samples_per_second": 67.729, "eval_steps_per_second": 17.095, "step": 2200 }, { "epoch": 193.14, "step": 2200, "total_flos": 1.0517861659312128e+18, "train_loss": 2.2616969472711737, "train_runtime": 20093.6832, "train_samples_per_second": 29.024, "train_steps_per_second": 0.109 } ], "logging_steps": 40, "max_steps": 2200, "num_train_epochs": 200, "save_steps": 40, "total_flos": 1.0517861659312128e+18, "trial_name": null, "trial_params": null }