{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.921671018276763, "global_step": 57000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "learning_rate": 1.9825935596170586e-05, "loss": 2.4238, "step": 500 }, { "epoch": 0.17, "learning_rate": 1.9651871192341167e-05, "loss": 2.1067, "step": 1000 }, { "epoch": 0.26, "learning_rate": 1.947780678851175e-05, "loss": 1.9034, "step": 1500 }, { "epoch": 0.35, "learning_rate": 1.9303742384682335e-05, "loss": 1.8061, "step": 2000 }, { "epoch": 0.44, "learning_rate": 1.9129677980852916e-05, "loss": 1.7317, "step": 2500 }, { "epoch": 0.52, "learning_rate": 1.89556135770235e-05, "loss": 1.6795, "step": 3000 }, { "epoch": 0.61, "learning_rate": 1.8781549173194084e-05, "loss": 1.6346, "step": 3500 }, { "epoch": 0.7, "learning_rate": 1.8607484769364665e-05, "loss": 1.5838, "step": 4000 }, { "epoch": 0.78, "learning_rate": 1.843342036553525e-05, "loss": 1.5477, "step": 4500 }, { "epoch": 0.87, "learning_rate": 1.8259355961705833e-05, "loss": 1.5084, "step": 5000 }, { "epoch": 0.96, "learning_rate": 1.8085291557876417e-05, "loss": 1.4765, "step": 5500 }, { "epoch": 1.0, "eval_loss": 1.2889001369476318, "eval_runtime": 523.3258, "eval_samples_per_second": 126.487, "eval_steps_per_second": 1.978, "step": 5745 }, { "epoch": 1.04, "learning_rate": 1.7911227154046998e-05, "loss": 1.4277, "step": 6000 }, { "epoch": 1.13, "learning_rate": 1.7737162750217582e-05, "loss": 1.3842, "step": 6500 }, { "epoch": 1.22, "learning_rate": 1.7563098346388167e-05, "loss": 1.3659, "step": 7000 }, { "epoch": 1.31, "learning_rate": 1.7389033942558747e-05, "loss": 1.346, "step": 7500 }, { "epoch": 1.39, "learning_rate": 1.721496953872933e-05, "loss": 1.3252, "step": 8000 }, { "epoch": 1.48, "learning_rate": 1.7040905134899916e-05, "loss": 1.3148, "step": 8500 }, { "epoch": 1.57, "learning_rate": 1.6866840731070497e-05, "loss": 1.3071, "step": 9000 }, { "epoch": 1.65, "learning_rate": 1.669277632724108e-05, "loss": 1.287, "step": 9500 }, { "epoch": 1.74, "learning_rate": 1.6518711923411665e-05, "loss": 1.2802, "step": 10000 }, { "epoch": 1.83, "learning_rate": 1.6344647519582246e-05, "loss": 1.2637, "step": 10500 }, { "epoch": 1.91, "learning_rate": 1.617058311575283e-05, "loss": 1.2451, "step": 11000 }, { "epoch": 2.0, "eval_loss": 1.1364690065383911, "eval_runtime": 527.405, "eval_samples_per_second": 125.509, "eval_steps_per_second": 1.962, "step": 11490 }, { "epoch": 2.0, "learning_rate": 1.599651871192341e-05, "loss": 1.2384, "step": 11500 }, { "epoch": 2.09, "learning_rate": 1.5822454308093995e-05, "loss": 1.1927, "step": 12000 }, { "epoch": 2.18, "learning_rate": 1.564838990426458e-05, "loss": 1.1753, "step": 12500 }, { "epoch": 2.26, "learning_rate": 1.5474325500435163e-05, "loss": 1.1781, "step": 13000 }, { "epoch": 2.35, "learning_rate": 1.5300261096605747e-05, "loss": 1.1705, "step": 13500 }, { "epoch": 2.44, "learning_rate": 1.512619669277633e-05, "loss": 1.1653, "step": 14000 }, { "epoch": 2.52, "learning_rate": 1.4952132288946912e-05, "loss": 1.1541, "step": 14500 }, { "epoch": 2.61, "learning_rate": 1.4778067885117495e-05, "loss": 1.1485, "step": 15000 }, { "epoch": 2.7, "learning_rate": 1.4604003481288079e-05, "loss": 1.1479, "step": 15500 }, { "epoch": 2.79, "learning_rate": 1.4429939077458661e-05, "loss": 1.1399, "step": 16000 }, { "epoch": 2.87, "learning_rate": 1.4255874673629244e-05, "loss": 1.1355, "step": 16500 }, { "epoch": 2.96, "learning_rate": 1.4081810269799826e-05, "loss": 1.1291, "step": 17000 }, { "epoch": 3.0, "eval_loss": 1.0703651905059814, "eval_runtime": 523.7516, "eval_samples_per_second": 126.384, "eval_steps_per_second": 1.976, "step": 17235 }, { "epoch": 3.05, "learning_rate": 1.390774586597041e-05, "loss": 1.1075, "step": 17500 }, { "epoch": 3.13, "learning_rate": 1.3733681462140993e-05, "loss": 1.0931, "step": 18000 }, { "epoch": 3.22, "learning_rate": 1.3559617058311576e-05, "loss": 1.0864, "step": 18500 }, { "epoch": 3.31, "learning_rate": 1.338555265448216e-05, "loss": 1.0822, "step": 19000 }, { "epoch": 3.39, "learning_rate": 1.3211488250652742e-05, "loss": 1.0847, "step": 19500 }, { "epoch": 3.48, "learning_rate": 1.3037423846823325e-05, "loss": 1.0795, "step": 20000 }, { "epoch": 3.57, "learning_rate": 1.2863359442993907e-05, "loss": 1.0704, "step": 20500 }, { "epoch": 3.66, "learning_rate": 1.2689295039164491e-05, "loss": 1.072, "step": 21000 }, { "epoch": 3.74, "learning_rate": 1.2515230635335076e-05, "loss": 1.0704, "step": 21500 }, { "epoch": 3.83, "learning_rate": 1.234116623150566e-05, "loss": 1.0623, "step": 22000 }, { "epoch": 3.92, "learning_rate": 1.2167101827676242e-05, "loss": 1.0657, "step": 22500 }, { "epoch": 4.0, "eval_loss": 1.0373072624206543, "eval_runtime": 524.067, "eval_samples_per_second": 126.308, "eval_steps_per_second": 1.975, "step": 22980 }, { "epoch": 4.0, "learning_rate": 1.1993037423846825e-05, "loss": 1.0612, "step": 23000 }, { "epoch": 4.09, "learning_rate": 1.1818973020017407e-05, "loss": 1.0267, "step": 23500 }, { "epoch": 4.18, "learning_rate": 1.1644908616187991e-05, "loss": 1.035, "step": 24000 }, { "epoch": 4.26, "learning_rate": 1.1470844212358574e-05, "loss": 1.0298, "step": 24500 }, { "epoch": 4.35, "learning_rate": 1.1296779808529156e-05, "loss": 1.0265, "step": 25000 }, { "epoch": 4.44, "learning_rate": 1.112271540469974e-05, "loss": 1.0337, "step": 25500 }, { "epoch": 4.53, "learning_rate": 1.0948651000870323e-05, "loss": 1.0265, "step": 26000 }, { "epoch": 4.61, "learning_rate": 1.0774586597040905e-05, "loss": 1.0264, "step": 26500 }, { "epoch": 4.7, "learning_rate": 1.0600522193211488e-05, "loss": 1.0181, "step": 27000 }, { "epoch": 4.79, "learning_rate": 1.0426457789382072e-05, "loss": 1.021, "step": 27500 }, { "epoch": 4.87, "learning_rate": 1.0252393385552655e-05, "loss": 1.0249, "step": 28000 }, { "epoch": 4.96, "learning_rate": 1.0078328981723237e-05, "loss": 1.0205, "step": 28500 }, { "epoch": 5.0, "eval_loss": 1.0189781188964844, "eval_runtime": 521.0752, "eval_samples_per_second": 127.033, "eval_steps_per_second": 1.986, "step": 28725 }, { "epoch": 5.05, "learning_rate": 9.904264577893821e-06, "loss": 1.0044, "step": 29000 }, { "epoch": 5.13, "learning_rate": 9.730200174064405e-06, "loss": 0.9887, "step": 29500 }, { "epoch": 5.22, "learning_rate": 9.556135770234988e-06, "loss": 0.993, "step": 30000 }, { "epoch": 5.31, "learning_rate": 9.38207136640557e-06, "loss": 0.9946, "step": 30500 }, { "epoch": 5.4, "learning_rate": 9.208006962576153e-06, "loss": 0.9898, "step": 31000 }, { "epoch": 5.48, "learning_rate": 9.033942558746737e-06, "loss": 0.9902, "step": 31500 }, { "epoch": 5.57, "learning_rate": 8.85987815491732e-06, "loss": 0.9899, "step": 32000 }, { "epoch": 5.66, "learning_rate": 8.685813751087904e-06, "loss": 0.9884, "step": 32500 }, { "epoch": 5.74, "learning_rate": 8.511749347258486e-06, "loss": 0.9906, "step": 33000 }, { "epoch": 5.83, "learning_rate": 8.33768494342907e-06, "loss": 0.9856, "step": 33500 }, { "epoch": 5.92, "learning_rate": 8.163620539599653e-06, "loss": 0.9923, "step": 34000 }, { "epoch": 6.0, "eval_loss": 1.0085190534591675, "eval_runtime": 521.5738, "eval_samples_per_second": 126.912, "eval_steps_per_second": 1.984, "step": 34470 }, { "epoch": 6.01, "learning_rate": 7.989556135770235e-06, "loss": 0.9843, "step": 34500 }, { "epoch": 6.09, "learning_rate": 7.815491731940818e-06, "loss": 0.9674, "step": 35000 }, { "epoch": 6.18, "learning_rate": 7.641427328111402e-06, "loss": 0.9647, "step": 35500 }, { "epoch": 6.27, "learning_rate": 7.4673629242819845e-06, "loss": 0.9664, "step": 36000 }, { "epoch": 6.35, "learning_rate": 7.293298520452569e-06, "loss": 0.9689, "step": 36500 }, { "epoch": 6.44, "learning_rate": 7.119234116623151e-06, "loss": 0.9668, "step": 37000 }, { "epoch": 6.53, "learning_rate": 6.9451697127937345e-06, "loss": 0.9617, "step": 37500 }, { "epoch": 6.61, "learning_rate": 6.771105308964318e-06, "loss": 0.962, "step": 38000 }, { "epoch": 6.7, "learning_rate": 6.5970409051349e-06, "loss": 0.9633, "step": 38500 }, { "epoch": 6.79, "learning_rate": 6.422976501305484e-06, "loss": 0.9611, "step": 39000 }, { "epoch": 6.88, "learning_rate": 6.248912097476066e-06, "loss": 0.9617, "step": 39500 }, { "epoch": 6.96, "learning_rate": 6.0748476936466495e-06, "loss": 0.959, "step": 40000 }, { "epoch": 7.0, "eval_loss": 1.0030972957611084, "eval_runtime": 523.5292, "eval_samples_per_second": 126.438, "eval_steps_per_second": 1.977, "step": 40215 }, { "epoch": 7.05, "learning_rate": 5.900783289817232e-06, "loss": 0.9511, "step": 40500 }, { "epoch": 7.14, "learning_rate": 5.726718885987816e-06, "loss": 0.9432, "step": 41000 }, { "epoch": 7.22, "learning_rate": 5.5526544821583995e-06, "loss": 0.942, "step": 41500 }, { "epoch": 7.31, "learning_rate": 5.378590078328983e-06, "loss": 0.9415, "step": 42000 }, { "epoch": 7.4, "learning_rate": 5.204525674499565e-06, "loss": 0.9427, "step": 42500 }, { "epoch": 7.48, "learning_rate": 5.030461270670149e-06, "loss": 0.9477, "step": 43000 }, { "epoch": 7.57, "learning_rate": 4.856396866840731e-06, "loss": 0.9439, "step": 43500 }, { "epoch": 7.66, "learning_rate": 4.682332463011314e-06, "loss": 0.945, "step": 44000 }, { "epoch": 7.75, "learning_rate": 4.508268059181898e-06, "loss": 0.9456, "step": 44500 }, { "epoch": 7.83, "learning_rate": 4.334203655352481e-06, "loss": 0.9436, "step": 45000 }, { "epoch": 7.92, "learning_rate": 4.1601392515230636e-06, "loss": 0.9447, "step": 45500 }, { "epoch": 8.0, "eval_loss": 1.0001778602600098, "eval_runtime": 520.9721, "eval_samples_per_second": 127.059, "eval_steps_per_second": 1.987, "step": 45960 }, { "epoch": 8.01, "learning_rate": 3.986074847693647e-06, "loss": 0.9459, "step": 46000 }, { "epoch": 8.09, "learning_rate": 3.8120104438642302e-06, "loss": 0.9265, "step": 46500 }, { "epoch": 8.18, "learning_rate": 3.637946040034813e-06, "loss": 0.9271, "step": 47000 }, { "epoch": 8.27, "learning_rate": 3.463881636205396e-06, "loss": 0.9286, "step": 47500 }, { "epoch": 8.36, "learning_rate": 3.2898172323759794e-06, "loss": 0.9302, "step": 48000 }, { "epoch": 8.44, "learning_rate": 3.1157528285465627e-06, "loss": 0.933, "step": 48500 }, { "epoch": 8.53, "learning_rate": 2.9416884247171456e-06, "loss": 0.9272, "step": 49000 }, { "epoch": 8.62, "learning_rate": 2.7676240208877285e-06, "loss": 0.9266, "step": 49500 }, { "epoch": 8.7, "learning_rate": 2.5935596170583114e-06, "loss": 0.9294, "step": 50000 }, { "epoch": 8.79, "learning_rate": 2.4194952132288948e-06, "loss": 0.9312, "step": 50500 }, { "epoch": 8.88, "learning_rate": 2.245430809399478e-06, "loss": 0.9306, "step": 51000 }, { "epoch": 8.96, "learning_rate": 2.071366405570061e-06, "loss": 0.9278, "step": 51500 }, { "epoch": 9.0, "eval_loss": 0.9995460510253906, "eval_runtime": 522.976, "eval_samples_per_second": 126.572, "eval_steps_per_second": 1.979, "step": 51705 }, { "epoch": 9.05, "learning_rate": 1.8973020017406443e-06, "loss": 0.92, "step": 52000 }, { "epoch": 9.14, "learning_rate": 1.7232375979112272e-06, "loss": 0.9178, "step": 52500 }, { "epoch": 9.23, "learning_rate": 1.5491731940818103e-06, "loss": 0.9189, "step": 53000 }, { "epoch": 9.31, "learning_rate": 1.3751087902523935e-06, "loss": 0.9164, "step": 53500 }, { "epoch": 9.4, "learning_rate": 1.2010443864229766e-06, "loss": 0.9186, "step": 54000 }, { "epoch": 9.49, "learning_rate": 1.0269799825935597e-06, "loss": 0.9199, "step": 54500 }, { "epoch": 9.57, "learning_rate": 8.529155787641428e-07, "loss": 0.918, "step": 55000 }, { "epoch": 9.66, "learning_rate": 6.788511749347259e-07, "loss": 0.9229, "step": 55500 }, { "epoch": 9.75, "learning_rate": 5.04786771105309e-07, "loss": 0.9203, "step": 56000 }, { "epoch": 9.83, "learning_rate": 3.3072236727589213e-07, "loss": 0.9202, "step": 56500 }, { "epoch": 9.92, "learning_rate": 1.566579634464752e-07, "loss": 0.9168, "step": 57000 } ], "max_steps": 57450, "num_train_epochs": 10, "total_flos": 8.267810851042099e+18, "trial_name": null, "trial_params": null }