{ "best_metric": 0.28431499004364014, "best_model_checkpoint": "./new_models/gpt2/checkpoint-25000", "epoch": 168.83116883116884, "global_step": 39000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.46, "learning_rate": 4.000000000000001e-06, "loss": 9.4041, "step": 100 }, { "epoch": 6.93, "learning_rate": 8.000000000000001e-06, "loss": 7.6702, "step": 200 }, { "epoch": 10.39, "learning_rate": 1.2e-05, "loss": 6.7042, "step": 300 }, { "epoch": 13.85, "learning_rate": 1.6000000000000003e-05, "loss": 5.8391, "step": 400 }, { "epoch": 17.32, "learning_rate": 2e-05, "loss": 5.1775, "step": 500 }, { "epoch": 20.78, "learning_rate": 1.9963963963963965e-05, "loss": 4.7103, "step": 600 }, { "epoch": 24.24, "learning_rate": 1.992792792792793e-05, "loss": 4.353, "step": 700 }, { "epoch": 27.71, "learning_rate": 1.9891891891891894e-05, "loss": 4.04, "step": 800 }, { "epoch": 31.17, "learning_rate": 1.9855855855855857e-05, "loss": 3.7865, "step": 900 }, { "epoch": 34.63, "learning_rate": 1.981981981981982e-05, "loss": 3.5376, "step": 1000 }, { "epoch": 34.63, "eval_loss": 3.2091352939605713, "eval_runtime": 3.6439, "eval_samples_per_second": 14.27, "eval_steps_per_second": 1.921, "step": 1000 }, { "epoch": 38.1, "learning_rate": 1.9783783783783786e-05, "loss": 3.3258, "step": 1100 }, { "epoch": 41.56, "learning_rate": 1.974774774774775e-05, "loss": 3.1155, "step": 1200 }, { "epoch": 45.02, "learning_rate": 1.9711711711711716e-05, "loss": 2.9341, "step": 1300 }, { "epoch": 48.48, "learning_rate": 1.967567567567568e-05, "loss": 2.7419, "step": 1400 }, { "epoch": 51.95, "learning_rate": 1.963963963963964e-05, "loss": 2.5793, "step": 1500 }, { "epoch": 55.41, "learning_rate": 1.9603603603603604e-05, "loss": 2.4091, "step": 1600 }, { "epoch": 58.87, "learning_rate": 1.956756756756757e-05, "loss": 2.2517, "step": 1700 }, { "epoch": 62.34, "learning_rate": 1.9531531531531534e-05, "loss": 2.0899, "step": 1800 }, { "epoch": 65.8, "learning_rate": 1.9495495495495497e-05, "loss": 1.9464, "step": 1900 }, { "epoch": 69.26, "learning_rate": 1.9459459459459463e-05, "loss": 1.803, "step": 2000 }, { "epoch": 69.26, "eval_loss": 1.7681734561920166, "eval_runtime": 3.5208, "eval_samples_per_second": 14.769, "eval_steps_per_second": 1.988, "step": 2000 }, { "epoch": 72.73, "learning_rate": 1.9423423423423423e-05, "loss": 1.6706, "step": 2100 }, { "epoch": 76.19, "learning_rate": 1.938738738738739e-05, "loss": 1.5401, "step": 2200 }, { "epoch": 79.65, "learning_rate": 1.9351351351351352e-05, "loss": 1.4045, "step": 2300 }, { "epoch": 83.12, "learning_rate": 1.931531531531532e-05, "loss": 1.2934, "step": 2400 }, { "epoch": 86.58, "learning_rate": 1.927927927927928e-05, "loss": 1.1735, "step": 2500 }, { "epoch": 90.04, "learning_rate": 1.9243243243243244e-05, "loss": 1.0624, "step": 2600 }, { "epoch": 93.51, "learning_rate": 1.9207207207207207e-05, "loss": 0.9525, "step": 2700 }, { "epoch": 96.97, "learning_rate": 1.9171171171171174e-05, "loss": 0.8541, "step": 2800 }, { "epoch": 100.43, "learning_rate": 1.9135135135135137e-05, "loss": 0.7571, "step": 2900 }, { "epoch": 103.9, "learning_rate": 1.90990990990991e-05, "loss": 0.6733, "step": 3000 }, { "epoch": 103.9, "eval_loss": 0.9859427213668823, "eval_runtime": 3.5218, "eval_samples_per_second": 14.765, "eval_steps_per_second": 1.988, "step": 3000 }, { "epoch": 107.36, "learning_rate": 1.9063063063063066e-05, "loss": 0.5883, "step": 3100 }, { "epoch": 110.82, "learning_rate": 1.902702702702703e-05, "loss": 0.5167, "step": 3200 }, { "epoch": 114.29, "learning_rate": 1.8990990990990992e-05, "loss": 0.4459, "step": 3300 }, { "epoch": 117.75, "learning_rate": 1.8954954954954955e-05, "loss": 0.385, "step": 3400 }, { "epoch": 121.21, "learning_rate": 1.891891891891892e-05, "loss": 0.3311, "step": 3500 }, { "epoch": 124.68, "learning_rate": 1.8882882882882884e-05, "loss": 0.2853, "step": 3600 }, { "epoch": 128.14, "learning_rate": 1.884684684684685e-05, "loss": 0.2442, "step": 3700 }, { "epoch": 131.6, "learning_rate": 1.8810810810810813e-05, "loss": 0.2097, "step": 3800 }, { "epoch": 135.06, "learning_rate": 1.8774774774774776e-05, "loss": 0.1802, "step": 3900 }, { "epoch": 138.53, "learning_rate": 1.873873873873874e-05, "loss": 0.1561, "step": 4000 }, { "epoch": 138.53, "eval_loss": 0.8047342300415039, "eval_runtime": 3.5244, "eval_samples_per_second": 14.754, "eval_steps_per_second": 1.986, "step": 4000 }, { "epoch": 141.99, "learning_rate": 1.8702702702702706e-05, "loss": 0.1359, "step": 4100 }, { "epoch": 145.45, "learning_rate": 1.866666666666667e-05, "loss": 0.12, "step": 4200 }, { "epoch": 148.92, "learning_rate": 1.863063063063063e-05, "loss": 0.1066, "step": 4300 }, { "epoch": 152.38, "learning_rate": 1.8594594594594598e-05, "loss": 0.0952, "step": 4400 }, { "epoch": 155.84, "learning_rate": 1.855855855855856e-05, "loss": 0.0866, "step": 4500 }, { "epoch": 159.31, "learning_rate": 1.8522522522522524e-05, "loss": 0.0791, "step": 4600 }, { "epoch": 162.77, "learning_rate": 1.8486486486486487e-05, "loss": 0.072, "step": 4700 }, { "epoch": 166.23, "learning_rate": 1.8450450450450453e-05, "loss": 0.0658, "step": 4800 }, { "epoch": 169.7, "learning_rate": 1.8414414414414416e-05, "loss": 0.0622, "step": 4900 }, { "epoch": 173.16, "learning_rate": 1.8378378378378383e-05, "loss": 0.058, "step": 5000 }, { "epoch": 173.16, "eval_loss": 0.8171238303184509, "eval_runtime": 3.5228, "eval_samples_per_second": 14.761, "eval_steps_per_second": 1.987, "step": 5000 }, { "epoch": 176.62, "learning_rate": 1.8342342342342342e-05, "loss": 0.0531, "step": 5100 }, { "epoch": 180.09, "learning_rate": 1.830630630630631e-05, "loss": 0.0504, "step": 5200 }, { "epoch": 183.55, "learning_rate": 1.827027027027027e-05, "loss": 0.046, "step": 5300 }, { "epoch": 187.01, "learning_rate": 1.8234234234234234e-05, "loss": 0.0447, "step": 5400 }, { "epoch": 190.48, "learning_rate": 1.81981981981982e-05, "loss": 0.0543, "step": 5500 }, { "epoch": 193.94, "learning_rate": 1.8162162162162164e-05, "loss": 0.0492, "step": 5600 }, { "epoch": 197.4, "learning_rate": 1.8126126126126127e-05, "loss": 0.0438, "step": 5700 }, { "epoch": 200.87, "learning_rate": 1.809009009009009e-05, "loss": 0.0547, "step": 5800 }, { "epoch": 204.33, "learning_rate": 1.8054054054054056e-05, "loss": 0.0615, "step": 5900 }, { "epoch": 207.79, "learning_rate": 1.801801801801802e-05, "loss": 0.072, "step": 6000 }, { "epoch": 207.79, "eval_loss": 0.8289902210235596, "eval_runtime": 3.5216, "eval_samples_per_second": 14.766, "eval_steps_per_second": 1.988, "step": 6000 }, { "epoch": 211.26, "learning_rate": 1.7981981981981985e-05, "loss": 0.1157, "step": 6100 }, { "epoch": 214.72, "learning_rate": 1.7945945945945948e-05, "loss": 0.0869, "step": 6200 }, { "epoch": 218.18, "learning_rate": 1.790990990990991e-05, "loss": 1.0166, "step": 6300 }, { "epoch": 221.65, "learning_rate": 1.7873873873873874e-05, "loss": 0.0771, "step": 6400 }, { "epoch": 225.11, "learning_rate": 1.783783783783784e-05, "loss": 0.0953, "step": 6500 }, { "epoch": 228.57, "learning_rate": 1.7801801801801804e-05, "loss": 0.6189, "step": 6600 }, { "epoch": 232.03, "learning_rate": 1.7765765765765767e-05, "loss": 0.5593, "step": 6700 }, { "epoch": 235.5, "learning_rate": 1.7729729729729733e-05, "loss": 0.376, "step": 6800 }, { "epoch": 238.96, "learning_rate": 1.7693693693693696e-05, "loss": 0.4129, "step": 6900 }, { "epoch": 242.42, "learning_rate": 1.765765765765766e-05, "loss": 2.2984, "step": 7000 }, { "epoch": 242.42, "eval_loss": 4.4349541664123535, "eval_runtime": 3.5205, "eval_samples_per_second": 14.77, "eval_steps_per_second": 1.988, "step": 7000 }, { "epoch": 245.89, "learning_rate": 1.7621621621621622e-05, "loss": 3.4028, "step": 7100 }, { "epoch": 249.35, "learning_rate": 1.7585585585585588e-05, "loss": 0.7196, "step": 7200 }, { "epoch": 252.81, "learning_rate": 1.754954954954955e-05, "loss": 1.162, "step": 7300 }, { "epoch": 256.28, "learning_rate": 1.7513513513513517e-05, "loss": 0.7413, "step": 7400 }, { "epoch": 259.74, "learning_rate": 1.7477477477477477e-05, "loss": 1.1918, "step": 7500 }, { "epoch": 263.2, "learning_rate": 1.7441441441441443e-05, "loss": 0.8564, "step": 7600 }, { "epoch": 266.67, "learning_rate": 1.7405405405405406e-05, "loss": 0.2815, "step": 7700 }, { "epoch": 270.13, "learning_rate": 1.7369369369369373e-05, "loss": 0.5848, "step": 7800 }, { "epoch": 273.59, "learning_rate": 1.7333333333333336e-05, "loss": 0.6489, "step": 7900 }, { "epoch": 277.06, "learning_rate": 1.72972972972973e-05, "loss": 1.0025, "step": 8000 }, { "epoch": 277.06, "eval_loss": 1.2763237953186035, "eval_runtime": 3.5102, "eval_samples_per_second": 14.814, "eval_steps_per_second": 1.994, "step": 8000 }, { "epoch": 280.52, "learning_rate": 1.726126126126126e-05, "loss": 0.7947, "step": 8100 }, { "epoch": 283.98, "learning_rate": 1.7225225225225225e-05, "loss": 0.558, "step": 8200 }, { "epoch": 287.45, "learning_rate": 1.718918918918919e-05, "loss": 0.6356, "step": 8300 }, { "epoch": 290.91, "learning_rate": 1.7153153153153154e-05, "loss": 0.5268, "step": 8400 }, { "epoch": 294.37, "learning_rate": 1.711711711711712e-05, "loss": 0.2633, "step": 8500 }, { "epoch": 297.84, "learning_rate": 1.7081081081081083e-05, "loss": 0.2457, "step": 8600 }, { "epoch": 301.3, "learning_rate": 1.7045045045045046e-05, "loss": 0.5308, "step": 8700 }, { "epoch": 304.76, "learning_rate": 1.700900900900901e-05, "loss": 0.369, "step": 8800 }, { "epoch": 308.23, "learning_rate": 1.6972972972972975e-05, "loss": 0.3203, "step": 8900 }, { "epoch": 311.69, "learning_rate": 1.693693693693694e-05, "loss": 2.5307, "step": 9000 }, { "epoch": 311.69, "eval_loss": 1.3849806785583496, "eval_runtime": 3.5124, "eval_samples_per_second": 14.805, "eval_steps_per_second": 1.993, "step": 9000 }, { "epoch": 39.39, "learning_rate": 1.96273022751896e-05, "loss": 3.0696, "step": 9100 }, { "epoch": 39.83, "learning_rate": 1.962296858071506e-05, "loss": 3.0068, "step": 9200 }, { "epoch": 40.26, "learning_rate": 1.9618634886240522e-05, "loss": 2.7896, "step": 9300 }, { "epoch": 40.69, "learning_rate": 1.9614301191765985e-05, "loss": 2.5042, "step": 9400 }, { "epoch": 41.13, "learning_rate": 1.960996749729144e-05, "loss": 2.8704, "step": 9500 }, { "epoch": 41.56, "learning_rate": 1.9605633802816904e-05, "loss": 3.4878, "step": 9600 }, { "epoch": 41.99, "learning_rate": 1.9601300108342363e-05, "loss": 3.0682, "step": 9700 }, { "epoch": 42.42, "learning_rate": 1.9596966413867822e-05, "loss": 2.9751, "step": 9800 }, { "epoch": 42.86, "learning_rate": 1.9592632719393285e-05, "loss": 3.3576, "step": 9900 }, { "epoch": 43.29, "learning_rate": 1.9588299024918744e-05, "loss": 2.9478, "step": 10000 }, { "epoch": 43.29, "eval_loss": 1.7224024534225464, "eval_runtime": 3.6186, "eval_samples_per_second": 14.37, "eval_steps_per_second": 1.934, "step": 10000 }, { "epoch": 47.62, "learning_rate": 1.954496208017335e-05, "loss": 2.4401, "step": 11000 }, { "epoch": 47.62, "eval_loss": 1.6094621419906616, "eval_runtime": 3.6227, "eval_samples_per_second": 14.354, "eval_steps_per_second": 1.932, "step": 11000 }, { "epoch": 51.95, "learning_rate": 1.9501625135427952e-05, "loss": 2.3021, "step": 12000 }, { "epoch": 51.95, "eval_loss": 1.9848077297210693, "eval_runtime": 3.511, "eval_samples_per_second": 14.81, "eval_steps_per_second": 1.994, "step": 12000 }, { "epoch": 56.28, "learning_rate": 1.945828819068256e-05, "loss": 1.8831, "step": 13000 }, { "epoch": 56.28, "eval_loss": 0.5190821290016174, "eval_runtime": 3.5109, "eval_samples_per_second": 14.811, "eval_steps_per_second": 1.994, "step": 13000 }, { "epoch": 60.61, "learning_rate": 1.9414951245937164e-05, "loss": 1.1329, "step": 14000 }, { "epoch": 60.61, "eval_loss": 0.9506992101669312, "eval_runtime": 3.511, "eval_samples_per_second": 14.81, "eval_steps_per_second": 1.994, "step": 14000 }, { "epoch": 64.94, "learning_rate": 1.9371614301191768e-05, "loss": 1.8788, "step": 15000 }, { "epoch": 64.94, "eval_loss": 1.937408685684204, "eval_runtime": 3.5081, "eval_samples_per_second": 14.823, "eval_steps_per_second": 1.995, "step": 15000 }, { "epoch": 69.26, "learning_rate": 1.932827735644637e-05, "loss": 1.6736, "step": 16000 }, { "epoch": 69.26, "eval_loss": 0.5699201226234436, "eval_runtime": 3.5113, "eval_samples_per_second": 14.809, "eval_steps_per_second": 1.994, "step": 16000 }, { "epoch": 73.59, "learning_rate": 1.9284940411700976e-05, "loss": 0.5165, "step": 17000 }, { "epoch": 73.59, "eval_loss": 0.4182128310203552, "eval_runtime": 3.5129, "eval_samples_per_second": 14.803, "eval_steps_per_second": 1.993, "step": 17000 }, { "epoch": 77.92, "learning_rate": 1.924160346695558e-05, "loss": 0.4656, "step": 18000 }, { "epoch": 77.92, "eval_loss": 0.4120073914527893, "eval_runtime": 3.5127, "eval_samples_per_second": 14.803, "eval_steps_per_second": 1.993, "step": 18000 }, { "epoch": 82.25, "learning_rate": 1.9198266522210184e-05, "loss": 0.6133, "step": 19000 }, { "epoch": 82.25, "eval_loss": 0.4980267286300659, "eval_runtime": 3.5108, "eval_samples_per_second": 14.811, "eval_steps_per_second": 1.994, "step": 19000 }, { "epoch": 86.58, "learning_rate": 1.9154929577464788e-05, "loss": 0.8087, "step": 20000 }, { "epoch": 86.58, "eval_loss": 0.5801683068275452, "eval_runtime": 3.5099, "eval_samples_per_second": 14.815, "eval_steps_per_second": 1.994, "step": 20000 }, { "epoch": 90.91, "learning_rate": 1.9111592632719395e-05, "loss": 2.2068, "step": 21000 }, { "epoch": 90.91, "eval_loss": 0.7701263427734375, "eval_runtime": 3.5112, "eval_samples_per_second": 14.81, "eval_steps_per_second": 1.994, "step": 21000 }, { "epoch": 95.24, "learning_rate": 1.9068255687974e-05, "loss": 1.0182, "step": 22000 }, { "epoch": 95.24, "eval_loss": 0.42168232798576355, "eval_runtime": 3.5098, "eval_samples_per_second": 14.816, "eval_steps_per_second": 1.994, "step": 22000 }, { "epoch": 99.57, "learning_rate": 1.9024918743228603e-05, "loss": 0.3515, "step": 23000 }, { "epoch": 99.57, "eval_loss": 0.2897047996520996, "eval_runtime": 3.5082, "eval_samples_per_second": 14.822, "eval_steps_per_second": 1.995, "step": 23000 }, { "epoch": 103.9, "learning_rate": 1.8981581798483207e-05, "loss": 1.007, "step": 24000 }, { "epoch": 103.9, "eval_loss": 0.28924015164375305, "eval_runtime": 3.5076, "eval_samples_per_second": 14.825, "eval_steps_per_second": 1.996, "step": 24000 }, { "epoch": 108.23, "learning_rate": 1.8938244853737814e-05, "loss": 0.1892, "step": 25000 }, { "epoch": 108.23, "eval_loss": 0.28431499004364014, "eval_runtime": 3.5124, "eval_samples_per_second": 14.805, "eval_steps_per_second": 1.993, "step": 25000 }, { "epoch": 112.55, "learning_rate": 1.8894907908992418e-05, "loss": 0.2349, "step": 26000 }, { "epoch": 112.55, "eval_loss": 0.2943420112133026, "eval_runtime": 3.5082, "eval_samples_per_second": 14.822, "eval_steps_per_second": 1.995, "step": 26000 }, { "epoch": 116.88, "learning_rate": 1.8851570964247022e-05, "loss": 0.1959, "step": 27000 }, { "epoch": 116.88, "eval_loss": 0.2937524616718292, "eval_runtime": 3.5084, "eval_samples_per_second": 14.822, "eval_steps_per_second": 1.995, "step": 27000 }, { "epoch": 121.21, "learning_rate": 1.8808234019501626e-05, "loss": 0.5489, "step": 28000 }, { "epoch": 121.21, "eval_loss": 0.3693106770515442, "eval_runtime": 3.5038, "eval_samples_per_second": 14.841, "eval_steps_per_second": 1.998, "step": 28000 }, { "epoch": 125.54, "learning_rate": 1.8764897074756233e-05, "loss": 0.1798, "step": 29000 }, { "epoch": 125.54, "eval_loss": 0.2986227571964264, "eval_runtime": 3.5089, "eval_samples_per_second": 14.819, "eval_steps_per_second": 1.995, "step": 29000 }, { "epoch": 129.87, "learning_rate": 1.8721560130010837e-05, "loss": 0.1638, "step": 30000 }, { "epoch": 129.87, "eval_loss": 0.3518519103527069, "eval_runtime": 3.5068, "eval_samples_per_second": 14.828, "eval_steps_per_second": 1.996, "step": 30000 }, { "epoch": 134.2, "learning_rate": 1.867822318526544e-05, "loss": 0.3161, "step": 31000 }, { "epoch": 134.2, "eval_loss": 0.37139639258384705, "eval_runtime": 3.5102, "eval_samples_per_second": 14.814, "eval_steps_per_second": 1.994, "step": 31000 }, { "epoch": 138.53, "learning_rate": 1.8634886240520045e-05, "loss": 0.4443, "step": 32000 }, { "epoch": 138.53, "eval_loss": 0.4150441288948059, "eval_runtime": 3.5081, "eval_samples_per_second": 14.823, "eval_steps_per_second": 1.995, "step": 32000 }, { "epoch": 142.86, "learning_rate": 1.859154929577465e-05, "loss": 0.6043, "step": 33000 }, { "epoch": 142.86, "eval_loss": 0.6062866449356079, "eval_runtime": 3.5067, "eval_samples_per_second": 14.829, "eval_steps_per_second": 1.996, "step": 33000 }, { "epoch": 147.19, "learning_rate": 1.8548212351029253e-05, "loss": 1.0402, "step": 34000 }, { "epoch": 147.19, "eval_loss": 0.5321042537689209, "eval_runtime": 3.6131, "eval_samples_per_second": 14.392, "eval_steps_per_second": 1.937, "step": 34000 }, { "epoch": 151.52, "learning_rate": 1.8504875406283857e-05, "loss": 0.8064, "step": 35000 }, { "epoch": 151.52, "eval_loss": 0.5623323917388916, "eval_runtime": 3.5113, "eval_samples_per_second": 14.809, "eval_steps_per_second": 1.994, "step": 35000 }, { "epoch": 155.84, "learning_rate": 1.8461538461538465e-05, "loss": 1.0081, "step": 36000 }, { "epoch": 155.84, "eval_loss": 0.8560149669647217, "eval_runtime": 3.5137, "eval_samples_per_second": 14.799, "eval_steps_per_second": 1.992, "step": 36000 }, { "epoch": 160.17, "learning_rate": 1.841820151679307e-05, "loss": 1.4319, "step": 37000 }, { "epoch": 160.17, "eval_loss": 0.7755089998245239, "eval_runtime": 3.5088, "eval_samples_per_second": 14.82, "eval_steps_per_second": 1.995, "step": 37000 }, { "epoch": 164.5, "learning_rate": 1.8374864572047673e-05, "loss": 1.5845, "step": 38000 }, { "epoch": 164.5, "eval_loss": 0.8413295745849609, "eval_runtime": 3.5072, "eval_samples_per_second": 14.827, "eval_steps_per_second": 1.996, "step": 38000 }, { "epoch": 168.83, "learning_rate": 1.8331527627302277e-05, "loss": 1.1751, "step": 39000 }, { "epoch": 168.83, "eval_loss": 1.2155665159225464, "eval_runtime": 3.5106, "eval_samples_per_second": 14.812, "eval_steps_per_second": 1.994, "step": 39000 } ], "max_steps": 462000, "num_train_epochs": 2000, "total_flos": 1.06376689483776e+17, "trial_name": null, "trial_params": null }