{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.99758337361044, "global_step": 516, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 0.0003999073369028083, "loss": 4.4497, "step": 5 }, { "epoch": 0.08, "learning_rate": 0.00039962943347572903, "loss": 3.7433, "step": 10 }, { "epoch": 0.12, "learning_rate": 0.00039916654723268495, "loss": 3.4124, "step": 15 }, { "epoch": 0.15, "learning_rate": 0.0003985191070984053, "loss": 3.3596, "step": 20 }, { "epoch": 0.19, "learning_rate": 0.000397687713010971, "loss": 3.2361, "step": 25 }, { "epoch": 0.23, "learning_rate": 0.00039667313536589325, "loss": 3.2741, "step": 30 }, { "epoch": 0.27, "learning_rate": 0.0003954763143022415, "loss": 3.1754, "step": 35 }, { "epoch": 0.31, "learning_rate": 0.00039409835883148103, "loss": 3.1644, "step": 40 }, { "epoch": 0.35, "learning_rate": 0.00039254054580982917, "loss": 3.158, "step": 45 }, { "epoch": 0.39, "learning_rate": 0.00039080431875508013, "loss": 3.1109, "step": 50 }, { "epoch": 0.43, "learning_rate": 0.00038889128650899707, "loss": 3.1248, "step": 55 }, { "epoch": 0.46, "learning_rate": 0.0003868032217465096, "loss": 3.01, "step": 60 }, { "epoch": 0.5, "learning_rate": 0.00038454205933309793, "loss": 3.1002, "step": 65 }, { "epoch": 0.54, "learning_rate": 0.0003821098945318869, "loss": 3.001, "step": 70 }, { "epoch": 0.58, "learning_rate": 0.0003795089810621101, "loss": 3.0187, "step": 75 }, { "epoch": 0.62, "learning_rate": 0.00037674172901074385, "loss": 2.9759, "step": 80 }, { "epoch": 0.66, "learning_rate": 0.0003738107025992461, "loss": 2.8984, "step": 85 }, { "epoch": 0.7, "learning_rate": 0.0003707186178074693, "loss": 2.9715, "step": 90 }, { "epoch": 0.73, "learning_rate": 0.0003674683398569492, "loss": 3.0181, "step": 95 }, { "epoch": 0.77, "learning_rate": 0.0003640628805559022, "loss": 3.0082, "step": 100 }, { "epoch": 0.81, "learning_rate": 0.00036050539550839015, "loss": 2.9286, "step": 105 }, { "epoch": 0.85, "learning_rate": 0.0003567991811902403, "loss": 2.9238, "step": 110 }, { "epoch": 0.89, "learning_rate": 0.0003529476718944283, "loss": 2.9692, "step": 115 }, { "epoch": 0.93, "learning_rate": 0.0003489544365487564, "loss": 2.9091, "step": 120 }, { "epoch": 0.97, "learning_rate": 0.000344823175408774, "loss": 2.8245, "step": 125 }, { "epoch": 1.0, "eval_loss": 2.8029613494873047, "eval_runtime": 57.7962, "eval_samples_per_second": 33.013, "eval_steps_per_second": 4.135, "step": 129 }, { "epoch": 1.01, "learning_rate": 0.00034055771662900637, "loss": 3.0457, "step": 130 }, { "epoch": 1.05, "learning_rate": 0.00033616201271566845, "loss": 2.5236, "step": 135 }, { "epoch": 1.09, "learning_rate": 0.0003316401368641496, "loss": 2.6179, "step": 140 }, { "epoch": 1.12, "learning_rate": 0.0003269962791846651, "loss": 2.5468, "step": 145 }, { "epoch": 1.16, "learning_rate": 0.00032223474281956987, "loss": 2.6191, "step": 150 }, { "epoch": 1.2, "learning_rate": 0.0003173599399559337, "loss": 2.639, "step": 155 }, { "epoch": 1.24, "learning_rate": 0.0003123763877370721, "loss": 2.6327, "step": 160 }, { "epoch": 1.28, "learning_rate": 0.00030728870407682117, "loss": 2.6121, "step": 165 }, { "epoch": 1.32, "learning_rate": 0.0003021016033804358, "loss": 2.465, "step": 170 }, { "epoch": 1.36, "learning_rate": 0.0002968198921760758, "loss": 2.619, "step": 175 }, { "epoch": 1.39, "learning_rate": 0.0002914484646609277, "loss": 2.5499, "step": 180 }, { "epoch": 1.43, "learning_rate": 0.0002859922981660906, "loss": 2.5248, "step": 185 }, { "epoch": 1.47, "learning_rate": 0.0002804564485444265, "loss": 2.6074, "step": 190 }, { "epoch": 1.51, "learning_rate": 0.00027484604548565065, "loss": 2.5412, "step": 195 }, { "epoch": 1.55, "learning_rate": 0.0002691662877630023, "loss": 2.5677, "step": 200 }, { "epoch": 1.59, "learning_rate": 0.0002634224384159003, "loss": 2.5344, "step": 205 }, { "epoch": 1.63, "learning_rate": 0.00025761981987304755, "loss": 2.5655, "step": 210 }, { "epoch": 1.67, "learning_rate": 0.00025176380902050413, "loss": 2.5598, "step": 215 }, { "epoch": 1.7, "learning_rate": 0.00024585983221929805, "loss": 2.5926, "step": 220 }, { "epoch": 1.74, "learning_rate": 0.00023991336027719059, "loss": 2.4798, "step": 225 }, { "epoch": 1.78, "learning_rate": 0.00023392990337925694, "loss": 2.4976, "step": 230 }, { "epoch": 1.82, "learning_rate": 0.000227915005981978, "loss": 2.5969, "step": 235 }, { "epoch": 1.86, "learning_rate": 0.00022187424167557492, "loss": 2.5677, "step": 240 }, { "epoch": 1.9, "learning_rate": 0.000215813208019348, "loss": 2.6221, "step": 245 }, { "epoch": 1.94, "learning_rate": 0.00020973752135480505, "loss": 2.5241, "step": 250 }, { "epoch": 1.97, "learning_rate": 0.000203652811601385, "loss": 2.521, "step": 255 }, { "epoch": 2.0, "eval_loss": 2.634265899658203, "eval_runtime": 57.8082, "eval_samples_per_second": 33.006, "eval_steps_per_second": 4.134, "step": 258 }, { "epoch": 2.02, "learning_rate": 0.0001975647170396005, "loss": 2.5802, "step": 260 }, { "epoch": 2.05, "learning_rate": 0.00019147887908643253, "loss": 2.2447, "step": 265 }, { "epoch": 2.09, "learning_rate": 0.0001854009370678185, "loss": 2.2409, "step": 270 }, { "epoch": 2.13, "learning_rate": 0.00017933652299307847, "loss": 2.2383, "step": 275 }, { "epoch": 2.17, "learning_rate": 0.00017329125633612044, "loss": 2.2028, "step": 280 }, { "epoch": 2.21, "learning_rate": 0.00016727073882826242, "loss": 2.1655, "step": 285 }, { "epoch": 2.25, "learning_rate": 0.00016128054926749404, "loss": 2.2198, "step": 290 }, { "epoch": 2.29, "learning_rate": 0.00015532623834899002, "loss": 2.2186, "step": 295 }, { "epoch": 2.32, "learning_rate": 0.00014941332352166385, "loss": 2.2225, "step": 300 }, { "epoch": 2.36, "learning_rate": 0.00014354728387552882, "loss": 2.2135, "step": 305 }, { "epoch": 2.4, "learning_rate": 0.00013773355506460367, "loss": 2.1537, "step": 310 }, { "epoch": 2.44, "learning_rate": 0.00013197752427006679, "loss": 2.2074, "step": 315 }, { "epoch": 2.48, "learning_rate": 0.00012628452520832765, "loss": 2.1916, "step": 320 }, { "epoch": 2.52, "learning_rate": 0.00012065983318864011, "loss": 2.2548, "step": 325 }, { "epoch": 2.56, "learning_rate": 0.00011510866022483702, "loss": 2.2211, "step": 330 }, { "epoch": 2.6, "learning_rate": 0.00010963615020571705, "loss": 2.1348, "step": 335 }, { "epoch": 2.63, "learning_rate": 0.00010424737412855825, "loss": 2.2425, "step": 340 }, { "epoch": 2.67, "learning_rate": 9.894732540017415e-05, "loss": 2.207, "step": 345 }, { "epoch": 2.71, "learning_rate": 9.374091520986936e-05, "loss": 2.1436, "step": 350 }, { "epoch": 2.75, "learning_rate": 8.863296797857865e-05, "loss": 2.1318, "step": 355 }, { "epoch": 2.79, "learning_rate": 8.362821688840947e-05, "loss": 2.193, "step": 360 }, { "epoch": 2.83, "learning_rate": 7.873129949672862e-05, "loss": 2.1751, "step": 365 }, { "epoch": 2.87, "learning_rate": 7.394675343885826e-05, "loss": 2.2015, "step": 370 }, { "epoch": 2.9, "learning_rate": 6.927901222336221e-05, "loss": 2.1469, "step": 375 }, { "epoch": 2.94, "learning_rate": 6.473240112381944e-05, "loss": 2.1919, "step": 380 }, { "epoch": 2.98, "learning_rate": 6.0311133170892234e-05, "loss": 2.2074, "step": 385 }, { "epoch": 3.0, "eval_loss": 2.5594911575317383, "eval_runtime": 57.8697, "eval_samples_per_second": 32.971, "eval_steps_per_second": 4.13, "step": 387 }, { "epoch": 3.02, "learning_rate": 5.601930524840086e-05, "loss": 2.1884, "step": 390 }, { "epoch": 3.06, "learning_rate": 5.186089429702436e-05, "loss": 2.0386, "step": 395 }, { "epoch": 3.1, "learning_rate": 4.783975362914439e-05, "loss": 2.0487, "step": 400 }, { "epoch": 3.14, "learning_rate": 4.395960935824621e-05, "loss": 1.8492, "step": 405 }, { "epoch": 3.18, "learning_rate": 4.022405694618658e-05, "loss": 2.0609, "step": 410 }, { "epoch": 3.22, "learning_rate": 3.663655787152791e-05, "loss": 1.9626, "step": 415 }, { "epoch": 3.26, "learning_rate": 3.320043642202444e-05, "loss": 1.9572, "step": 420 }, { "epoch": 3.29, "learning_rate": 2.9918876614234493e-05, "loss": 1.9518, "step": 425 }, { "epoch": 3.33, "learning_rate": 2.679491924311226e-05, "loss": 2.0956, "step": 430 }, { "epoch": 3.37, "learning_rate": 2.3831459064312743e-05, "loss": 1.997, "step": 435 }, { "epoch": 3.41, "learning_rate": 2.1031242111821635e-05, "loss": 1.965, "step": 440 }, { "epoch": 3.45, "learning_rate": 1.8396863153395216e-05, "loss": 2.0042, "step": 445 }, { "epoch": 3.49, "learning_rate": 1.593076328616814e-05, "loss": 2.009, "step": 450 }, { "epoch": 3.53, "learning_rate": 1.3635227674657147e-05, "loss": 1.9708, "step": 455 }, { "epoch": 3.56, "learning_rate": 1.1512383433257112e-05, "loss": 1.9904, "step": 460 }, { "epoch": 3.6, "learning_rate": 9.564197655190609e-06, "loss": 1.9272, "step": 465 }, { "epoch": 3.64, "learning_rate": 7.792475589738679e-06, "loss": 1.9683, "step": 470 }, { "epoch": 3.68, "learning_rate": 6.198858969440747e-06, "loss": 1.9925, "step": 475 }, { "epoch": 3.72, "learning_rate": 4.784824488814588e-06, "loss": 1.969, "step": 480 }, { "epoch": 3.76, "learning_rate": 3.5516824360052546e-06, "loss": 1.9914, "step": 485 }, { "epoch": 3.8, "learning_rate": 2.500575478631717e-06, "loss": 1.867, "step": 490 }, { "epoch": 3.84, "learning_rate": 1.6324776049554401e-06, "loss": 2.0106, "step": 495 }, { "epoch": 3.87, "learning_rate": 9.481932213528443e-07, "loss": 1.9669, "step": 500 }, { "epoch": 3.91, "learning_rate": 4.483564069273527e-07, "loss": 1.9383, "step": 505 }, { "epoch": 3.95, "learning_rate": 1.334303259521219e-07, "loss": 1.9087, "step": 510 }, { "epoch": 3.99, "learning_rate": 3.7067986876637704e-09, "loss": 2.0145, "step": 515 }, { "epoch": 4.0, "eval_loss": 2.555163860321045, "eval_runtime": 57.8174, "eval_samples_per_second": 33.0, "eval_steps_per_second": 4.134, "step": 516 }, { "epoch": 4.0, "step": 516, "total_flos": 3.457207135056691e+16, "train_loss": 2.476019163464391, "train_runtime": 8176.7734, "train_samples_per_second": 8.095, "train_steps_per_second": 0.063 } ], "max_steps": 516, "num_train_epochs": 4, "total_flos": 3.457207135056691e+16, "trial_name": null, "trial_params": null }