{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9960768928991763, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 9.997799572243123e-06, "loss": 0.4974, "step": 10 }, { "epoch": 0.06, "learning_rate": 9.990195641770761e-06, "loss": 0.4841, "step": 20 }, { "epoch": 0.06, "eval_loss": 0.4602764844894409, "eval_runtime": 62.0223, "eval_samples_per_second": 13.431, "eval_steps_per_second": 0.855, "step": 20 }, { "epoch": 0.09, "learning_rate": 9.9771693033643e-06, "loss": 0.4748, "step": 30 }, { "epoch": 0.13, "learning_rate": 9.958734711603195e-06, "loss": 0.4678, "step": 40 }, { "epoch": 0.13, "eval_loss": 0.45144811272621155, "eval_runtime": 61.8568, "eval_samples_per_second": 13.467, "eval_steps_per_second": 0.857, "step": 40 }, { "epoch": 0.16, "learning_rate": 9.934911897741493e-06, "loss": 0.4687, "step": 50 }, { "epoch": 0.19, "learning_rate": 9.905726747941616e-06, "loss": 0.4687, "step": 60 }, { "epoch": 0.19, "eval_loss": 0.44708773493766785, "eval_runtime": 62.0907, "eval_samples_per_second": 13.416, "eval_steps_per_second": 0.854, "step": 60 }, { "epoch": 0.22, "learning_rate": 9.871210975146135e-06, "loss": 0.473, "step": 70 }, { "epoch": 0.25, "learning_rate": 9.831402084618113e-06, "loss": 0.4608, "step": 80 }, { "epoch": 0.25, "eval_loss": 0.44395506381988525, "eval_runtime": 61.9484, "eval_samples_per_second": 13.447, "eval_steps_per_second": 0.856, "step": 80 }, { "epoch": 0.28, "learning_rate": 9.786343333187412e-06, "loss": 0.4542, "step": 90 }, { "epoch": 0.31, "learning_rate": 9.736083682247287e-06, "loss": 0.4593, "step": 100 }, { "epoch": 0.31, "eval_loss": 0.43974635004997253, "eval_runtime": 61.9084, "eval_samples_per_second": 13.455, "eval_steps_per_second": 0.856, "step": 100 }, { "epoch": 0.35, "learning_rate": 9.680677744552346e-06, "loss": 0.4681, "step": 110 }, { "epoch": 0.38, "learning_rate": 9.620185724875652e-06, "loss": 0.4488, "step": 120 }, { "epoch": 0.38, "eval_loss": 0.435758113861084, "eval_runtime": 61.8043, "eval_samples_per_second": 13.478, "eval_steps_per_second": 0.858, "step": 120 }, { "epoch": 0.41, "learning_rate": 9.55467335458948e-06, "loss": 0.4478, "step": 130 }, { "epoch": 0.44, "learning_rate": 9.484211820240797e-06, "loss": 0.4494, "step": 140 }, { "epoch": 0.44, "eval_loss": 0.434115469455719, "eval_runtime": 61.841, "eval_samples_per_second": 13.47, "eval_steps_per_second": 0.857, "step": 140 }, { "epoch": 0.47, "learning_rate": 9.408877686199078e-06, "loss": 0.44, "step": 150 }, { "epoch": 0.5, "learning_rate": 9.328752811460542e-06, "loss": 0.4477, "step": 160 }, { "epoch": 0.5, "eval_loss": 0.4319334030151367, "eval_runtime": 61.9956, "eval_samples_per_second": 13.436, "eval_steps_per_second": 0.855, "step": 160 }, { "epoch": 0.53, "learning_rate": 9.243924260699133e-06, "loss": 0.4465, "step": 170 }, { "epoch": 0.56, "learning_rate": 9.154484209661002e-06, "loss": 0.445, "step": 180 }, { "epoch": 0.56, "eval_loss": 0.42997926473617554, "eval_runtime": 62.0149, "eval_samples_per_second": 13.432, "eval_steps_per_second": 0.855, "step": 180 }, { "epoch": 0.6, "learning_rate": 9.060529845005184e-06, "loss": 0.4332, "step": 190 }, { "epoch": 0.63, "learning_rate": 8.962163258699397e-06, "loss": 0.4366, "step": 200 }, { "epoch": 0.63, "eval_loss": 0.4274918735027313, "eval_runtime": 61.8797, "eval_samples_per_second": 13.462, "eval_steps_per_second": 0.857, "step": 200 }, { "epoch": 0.66, "learning_rate": 8.859491337085643e-06, "loss": 0.428, "step": 210 }, { "epoch": 0.69, "learning_rate": 8.752625644736204e-06, "loss": 0.442, "step": 220 }, { "epoch": 0.69, "eval_loss": 0.42491650581359863, "eval_runtime": 62.1183, "eval_samples_per_second": 13.41, "eval_steps_per_second": 0.853, "step": 220 }, { "epoch": 0.72, "learning_rate": 8.641682303226197e-06, "loss": 0.4442, "step": 230 }, { "epoch": 0.75, "learning_rate": 8.526781864954453e-06, "loss": 0.4424, "step": 240 }, { "epoch": 0.75, "eval_loss": 0.4228505492210388, "eval_runtime": 62.0338, "eval_samples_per_second": 13.428, "eval_steps_per_second": 0.854, "step": 240 }, { "epoch": 0.78, "learning_rate": 8.40804918214979e-06, "loss": 0.4301, "step": 250 }, { "epoch": 0.82, "learning_rate": 8.28561327120505e-06, "loss": 0.4427, "step": 260 }, { "epoch": 0.82, "eval_loss": 0.41998758912086487, "eval_runtime": 61.9344, "eval_samples_per_second": 13.45, "eval_steps_per_second": 0.856, "step": 260 }, { "epoch": 0.85, "learning_rate": 8.159607172486301e-06, "loss": 0.4316, "step": 270 }, { "epoch": 0.88, "learning_rate": 8.030167805769537e-06, "loss": 0.4372, "step": 280 }, { "epoch": 0.88, "eval_loss": 0.4175536036491394, "eval_runtime": 61.971, "eval_samples_per_second": 13.442, "eval_steps_per_second": 0.855, "step": 280 }, { "epoch": 0.91, "learning_rate": 7.897435821461964e-06, "loss": 0.4398, "step": 290 }, { "epoch": 0.94, "learning_rate": 7.761555447769548e-06, "loss": 0.4335, "step": 300 }, { "epoch": 0.94, "eval_loss": 0.4155929982662201, "eval_runtime": 61.9696, "eval_samples_per_second": 13.442, "eval_steps_per_second": 0.855, "step": 300 }, { "epoch": 0.97, "learning_rate": 7.622674333976863e-06, "loss": 0.4371, "step": 310 }, { "epoch": 1.0, "learning_rate": 7.4809433900095705e-06, "loss": 0.4088, "step": 320 }, { "epoch": 1.0, "eval_loss": 0.41465404629707336, "eval_runtime": 61.9822, "eval_samples_per_second": 13.439, "eval_steps_per_second": 0.855, "step": 320 }, { "epoch": 1.04, "learning_rate": 7.336516622453833e-06, "loss": 0.3166, "step": 330 }, { "epoch": 1.07, "learning_rate": 7.1895509672108674e-06, "loss": 0.3145, "step": 340 }, { "epoch": 1.07, "eval_loss": 0.42051395773887634, "eval_runtime": 62.5118, "eval_samples_per_second": 13.325, "eval_steps_per_second": 0.848, "step": 340 }, { "epoch": 1.1, "learning_rate": 7.040206118968466e-06, "loss": 0.3136, "step": 350 }, { "epoch": 1.13, "learning_rate": 6.88864435767478e-06, "loss": 0.3151, "step": 360 }, { "epoch": 1.13, "eval_loss": 0.4205115735530853, "eval_runtime": 62.5792, "eval_samples_per_second": 13.311, "eval_steps_per_second": 0.847, "step": 360 }, { "epoch": 1.16, "learning_rate": 6.735030372202942e-06, "loss": 0.3137, "step": 370 }, { "epoch": 1.19, "learning_rate": 6.579531081398105e-06, "loss": 0.3019, "step": 380 }, { "epoch": 1.19, "eval_loss": 0.4216003119945526, "eval_runtime": 62.3646, "eval_samples_per_second": 13.357, "eval_steps_per_second": 0.85, "step": 380 }, { "epoch": 1.22, "learning_rate": 6.4223154527013755e-06, "loss": 0.3044, "step": 390 }, { "epoch": 1.26, "learning_rate": 6.263554318547713e-06, "loss": 0.3044, "step": 400 }, { "epoch": 1.26, "eval_loss": 0.4185173809528351, "eval_runtime": 61.9058, "eval_samples_per_second": 13.456, "eval_steps_per_second": 0.856, "step": 400 }, { "epoch": 1.29, "learning_rate": 6.1034201907373045e-06, "loss": 0.305, "step": 410 }, { "epoch": 1.32, "learning_rate": 5.942087072982131e-06, "loss": 0.3034, "step": 420 }, { "epoch": 1.32, "eval_loss": 0.41815003752708435, "eval_runtime": 62.0238, "eval_samples_per_second": 13.43, "eval_steps_per_second": 0.855, "step": 420 }, { "epoch": 1.35, "learning_rate": 5.779730271831384e-06, "loss": 0.3115, "step": 430 }, { "epoch": 1.38, "learning_rate": 5.616526206181215e-06, "loss": 0.3026, "step": 440 }, { "epoch": 1.38, "eval_loss": 0.41711267828941345, "eval_runtime": 62.5134, "eval_samples_per_second": 13.325, "eval_steps_per_second": 0.848, "step": 440 }, { "epoch": 1.41, "learning_rate": 5.4526522155758015e-06, "loss": 0.3077, "step": 450 }, { "epoch": 1.45, "learning_rate": 5.288286367508009e-06, "loss": 0.3062, "step": 460 }, { "epoch": 1.45, "eval_loss": 0.41751572489738464, "eval_runtime": 62.561, "eval_samples_per_second": 13.315, "eval_steps_per_second": 0.847, "step": 460 }, { "epoch": 1.48, "learning_rate": 5.123607263929075e-06, "loss": 0.3076, "step": 470 }, { "epoch": 1.51, "learning_rate": 4.958793847177518e-06, "loss": 0.315, "step": 480 }, { "epoch": 1.51, "eval_loss": 0.41455498337745667, "eval_runtime": 62.0669, "eval_samples_per_second": 13.421, "eval_steps_per_second": 0.854, "step": 480 }, { "epoch": 1.54, "learning_rate": 4.7940252055382115e-06, "loss": 0.3024, "step": 490 }, { "epoch": 1.57, "learning_rate": 4.629480378642832e-06, "loss": 0.3124, "step": 500 }, { "epoch": 1.57, "eval_loss": 0.41453319787979126, "eval_runtime": 61.9067, "eval_samples_per_second": 13.456, "eval_steps_per_second": 0.856, "step": 500 }, { "epoch": 1.6, "learning_rate": 4.46533816292321e-06, "loss": 0.31, "step": 510 }, { "epoch": 1.63, "learning_rate": 4.301776917328918e-06, "loss": 0.3096, "step": 520 }, { "epoch": 1.63, "eval_loss": 0.41273748874664307, "eval_runtime": 62.1417, "eval_samples_per_second": 13.405, "eval_steps_per_second": 0.853, "step": 520 }, { "epoch": 1.67, "learning_rate": 4.138974369520252e-06, "loss": 0.3044, "step": 530 }, { "epoch": 1.7, "learning_rate": 3.977107422747163e-06, "loss": 0.3178, "step": 540 }, { "epoch": 1.7, "eval_loss": 0.4111482501029968, "eval_runtime": 62.5639, "eval_samples_per_second": 13.314, "eval_steps_per_second": 0.847, "step": 540 }, { "epoch": 1.73, "learning_rate": 3.816351963624017e-06, "loss": 0.3102, "step": 550 }, { "epoch": 1.76, "learning_rate": 3.6568826710090353e-06, "loss": 0.3044, "step": 560 }, { "epoch": 1.76, "eval_loss": 0.4110707640647888, "eval_runtime": 62.5689, "eval_samples_per_second": 13.313, "eval_steps_per_second": 0.847, "step": 560 }, { "epoch": 1.79, "learning_rate": 3.4988728261960957e-06, "loss": 0.306, "step": 570 }, { "epoch": 1.82, "learning_rate": 3.3424941246251574e-06, "loss": 0.3078, "step": 580 }, { "epoch": 1.82, "eval_loss": 0.4091060757637024, "eval_runtime": 62.0676, "eval_samples_per_second": 13.421, "eval_steps_per_second": 0.854, "step": 580 }, { "epoch": 1.85, "learning_rate": 3.1879164893158713e-06, "loss": 0.2977, "step": 590 }, { "epoch": 1.89, "learning_rate": 3.035307886227156e-06, "loss": 0.2967, "step": 600 }, { "epoch": 1.89, "eval_loss": 0.4094270169734955, "eval_runtime": 62.0655, "eval_samples_per_second": 13.421, "eval_steps_per_second": 0.854, "step": 600 }, { "epoch": 1.92, "learning_rate": 2.8848341417433036e-06, "loss": 0.3069, "step": 610 }, { "epoch": 1.95, "learning_rate": 2.736658762485005e-06, "loss": 0.3068, "step": 620 }, { "epoch": 1.95, "eval_loss": 0.4080323278903961, "eval_runtime": 62.0098, "eval_samples_per_second": 13.433, "eval_steps_per_second": 0.855, "step": 620 }, { "epoch": 1.98, "learning_rate": 2.590942757641035e-06, "loss": 0.3037, "step": 630 }, { "epoch": 2.01, "learning_rate": 2.447844464013703e-06, "loss": 0.276, "step": 640 }, { "epoch": 2.01, "eval_loss": 0.4152510464191437, "eval_runtime": 61.9609, "eval_samples_per_second": 13.444, "eval_steps_per_second": 0.855, "step": 640 }, { "epoch": 2.04, "learning_rate": 2.3075193739681182e-06, "loss": 0.2327, "step": 650 }, { "epoch": 2.07, "learning_rate": 2.170119966472293e-06, "loss": 0.2288, "step": 660 }, { "epoch": 2.07, "eval_loss": 0.43204566836357117, "eval_runtime": 62.0792, "eval_samples_per_second": 13.418, "eval_steps_per_second": 0.854, "step": 660 }, { "epoch": 2.1, "learning_rate": 2.0357955414116075e-06, "loss": 0.2267, "step": 670 }, { "epoch": 2.14, "learning_rate": 1.9046920573577239e-06, "loss": 0.2244, "step": 680 }, { "epoch": 2.14, "eval_loss": 0.4292474389076233, "eval_runtime": 62.0223, "eval_samples_per_second": 13.431, "eval_steps_per_second": 0.855, "step": 680 }, { "epoch": 2.17, "learning_rate": 1.7769519729682105e-06, "loss": 0.2327, "step": 690 }, { "epoch": 2.2, "learning_rate": 1.6527140921892066e-06, "loss": 0.2336, "step": 700 }, { "epoch": 2.2, "eval_loss": 0.427610844373703, "eval_runtime": 62.1107, "eval_samples_per_second": 13.412, "eval_steps_per_second": 0.853, "step": 700 }, { "epoch": 2.23, "learning_rate": 1.532113413429357e-06, "loss": 0.2386, "step": 710 }, { "epoch": 2.26, "learning_rate": 1.4152809828688708e-06, "loss": 0.2266, "step": 720 }, { "epoch": 2.26, "eval_loss": 0.4290391206741333, "eval_runtime": 62.2449, "eval_samples_per_second": 13.383, "eval_steps_per_second": 0.851, "step": 720 }, { "epoch": 2.29, "learning_rate": 1.3023437520631426e-06, "loss": 0.2328, "step": 730 }, { "epoch": 2.32, "learning_rate": 1.1934244399956206e-06, "loss": 0.2312, "step": 740 }, { "epoch": 2.32, "eval_loss": 0.42950907349586487, "eval_runtime": 62.2915, "eval_samples_per_second": 13.373, "eval_steps_per_second": 0.851, "step": 740 }, { "epoch": 2.36, "learning_rate": 1.0886413997298595e-06, "loss": 0.2338, "step": 750 }, { "epoch": 2.39, "learning_rate": 9.881084898056197e-07, "loss": 0.2277, "step": 760 }, { "epoch": 2.39, "eval_loss": 0.4284292161464691, "eval_runtime": 62.4097, "eval_samples_per_second": 13.347, "eval_steps_per_second": 0.849, "step": 760 }, { "epoch": 2.42, "learning_rate": 8.919349505187813e-07, "loss": 0.2333, "step": 770 }, { "epoch": 2.45, "learning_rate": 8.002252852194992e-07, "loss": 0.2332, "step": 780 }, { "epoch": 2.45, "eval_loss": 0.42790091037750244, "eval_runtime": 62.4651, "eval_samples_per_second": 13.335, "eval_steps_per_second": 0.848, "step": 780 }, { "epoch": 2.48, "learning_rate": 7.130791467575676e-07, "loss": 0.2257, "step": 790 }, { "epoch": 2.51, "learning_rate": 6.305912291984229e-07, "loss": 0.2289, "step": 800 }, { "epoch": 2.51, "eval_loss": 0.42792582511901855, "eval_runtime": 62.5296, "eval_samples_per_second": 13.322, "eval_steps_per_second": 0.848, "step": 800 }, { "epoch": 2.54, "learning_rate": 5.528511649273932e-07, "loss": 0.2303, "step": 810 }, { "epoch": 2.58, "learning_rate": 4.799434272540576e-07, "loss": 0.2279, "step": 820 }, { "epoch": 2.58, "eval_loss": 0.4278266131877899, "eval_runtime": 62.5218, "eval_samples_per_second": 13.323, "eval_steps_per_second": 0.848, "step": 820 }, { "epoch": 2.61, "learning_rate": 4.1194723862250317e-07, "loss": 0.2267, "step": 830 }, { "epoch": 2.64, "learning_rate": 3.4893648452724636e-07, "loss": 0.2312, "step": 840 }, { "epoch": 2.64, "eval_loss": 0.4273243546485901, "eval_runtime": 62.5591, "eval_samples_per_second": 13.315, "eval_steps_per_second": 0.847, "step": 840 }, { "epoch": 2.67, "learning_rate": 2.9097963322834597e-07, "loss": 0.2306, "step": 850 }, { "epoch": 2.7, "learning_rate": 2.3813966135294574e-07, "loss": 0.2334, "step": 860 }, { "epoch": 2.7, "eval_loss": 0.42646506428718567, "eval_runtime": 62.6378, "eval_samples_per_second": 13.299, "eval_steps_per_second": 0.846, "step": 860 }, { "epoch": 2.73, "learning_rate": 1.9047398546410633e-07, "loss": 0.2306, "step": 870 }, { "epoch": 2.76, "learning_rate": 1.4803439967125022e-07, "loss": 0.2278, "step": 880 }, { "epoch": 2.76, "eval_loss": 0.42754805088043213, "eval_runtime": 62.1476, "eval_samples_per_second": 13.404, "eval_steps_per_second": 0.853, "step": 880 }, { "epoch": 2.8, "learning_rate": 1.1086701935005606e-07, "loss": 0.2296, "step": 890 }, { "epoch": 2.83, "learning_rate": 7.901223103291833e-08, "loss": 0.2295, "step": 900 }, { "epoch": 2.83, "eval_loss": 0.4276488721370697, "eval_runtime": 61.9957, "eval_samples_per_second": 13.436, "eval_steps_per_second": 0.855, "step": 900 }, { "epoch": 2.86, "learning_rate": 5.250464852444792e-08, "loss": 0.2334, "step": 910 }, { "epoch": 2.89, "learning_rate": 3.137307528968292e-08, "loss": 0.2292, "step": 920 }, { "epoch": 2.89, "eval_loss": 0.4273829162120819, "eval_runtime": 62.0452, "eval_samples_per_second": 13.426, "eval_steps_per_second": 0.854, "step": 920 }, { "epoch": 2.92, "learning_rate": 1.5640473155894566e-08, "loss": 0.2284, "step": 930 }, { "epoch": 2.95, "learning_rate": 5.323937361977338e-09, "loss": 0.2291, "step": 940 }, { "epoch": 2.95, "eval_loss": 0.42734310030937195, "eval_runtime": 61.9825, "eval_samples_per_second": 13.439, "eval_steps_per_second": 0.855, "step": 940 }, { "epoch": 2.98, "learning_rate": 4.346779825575853e-10, "loss": 0.2288, "step": 950 }, { "epoch": 3.0, "step": 954, "total_flos": 6.195687991759864e+18, "train_loss": 0.1527079766776327, "train_runtime": 29559.5606, "train_samples_per_second": 4.138, "train_steps_per_second": 0.032 } ], "max_steps": 954, "num_train_epochs": 3, "total_flos": 6.195687991759864e+18, "trial_name": null, "trial_params": null }