{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5104809805533613, "global_step": 570000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.977610483309063e-05, "loss": 0.0854, "step": 5000 }, { "epoch": 0.01, "learning_rate": 4.955220966618126e-05, "loss": 0.0801, "step": 10000 }, { "epoch": 0.01, "learning_rate": 4.9328314499271894e-05, "loss": 0.082, "step": 15000 }, { "epoch": 0.02, "learning_rate": 4.910441933236253e-05, "loss": 0.0851, "step": 20000 }, { "epoch": 0.02, "learning_rate": 4.8880524165453154e-05, "loss": 0.0858, "step": 25000 }, { "epoch": 0.03, "learning_rate": 4.865662899854379e-05, "loss": 0.0777, "step": 30000 }, { "epoch": 0.03, "learning_rate": 4.843273383163442e-05, "loss": 0.0853, "step": 35000 }, { "epoch": 0.04, "learning_rate": 4.8208838664725045e-05, "loss": 0.0838, "step": 40000 }, { "epoch": 0.04, "learning_rate": 4.798494349781568e-05, "loss": 0.084, "step": 45000 }, { "epoch": 0.04, "learning_rate": 4.776104833090631e-05, "loss": 0.0794, "step": 50000 }, { "epoch": 0.05, "learning_rate": 4.753715316399694e-05, "loss": 0.0805, "step": 55000 }, { "epoch": 0.05, "learning_rate": 4.7313257997087574e-05, "loss": 0.0876, "step": 60000 }, { "epoch": 0.06, "learning_rate": 4.708936283017821e-05, "loss": 0.0773, "step": 65000 }, { "epoch": 0.06, "learning_rate": 4.6865467663268834e-05, "loss": 0.0846, "step": 70000 }, { "epoch": 0.07, "learning_rate": 4.6641572496359465e-05, "loss": 0.0867, "step": 75000 }, { "epoch": 0.07, "eval_loss": 0.07421331852674484, "eval_runtime": 1684.9579, "eval_samples_per_second": 36.816, "eval_steps_per_second": 36.816, "step": 75000 }, { "epoch": 0.07, "learning_rate": 4.64176773294501e-05, "loss": 0.0779, "step": 80000 }, { "epoch": 0.08, "learning_rate": 4.6193782162540726e-05, "loss": 0.0795, "step": 85000 }, { "epoch": 0.08, "learning_rate": 4.596988699563136e-05, "loss": 0.082, "step": 90000 }, { "epoch": 0.09, "learning_rate": 4.574599182872199e-05, "loss": 0.0821, "step": 95000 }, { "epoch": 0.09, "learning_rate": 4.5522096661812623e-05, "loss": 0.0787, "step": 100000 }, { "epoch": 0.09, "learning_rate": 4.5298201494903254e-05, "loss": 0.0803, "step": 105000 }, { "epoch": 0.1, "learning_rate": 4.5074306327993884e-05, "loss": 0.0788, "step": 110000 }, { "epoch": 0.1, "learning_rate": 4.4850411161084515e-05, "loss": 0.0822, "step": 115000 }, { "epoch": 0.11, "learning_rate": 4.4626515994175145e-05, "loss": 0.0782, "step": 120000 }, { "epoch": 0.11, "learning_rate": 4.4402620827265775e-05, "loss": 0.0804, "step": 125000 }, { "epoch": 0.12, "learning_rate": 4.4178725660356406e-05, "loss": 0.0778, "step": 130000 }, { "epoch": 0.12, "learning_rate": 4.3954830493447036e-05, "loss": 0.0772, "step": 135000 }, { "epoch": 0.13, "learning_rate": 4.3730935326537667e-05, "loss": 0.0819, "step": 140000 }, { "epoch": 0.13, "learning_rate": 4.35070401596283e-05, "loss": 0.0741, "step": 145000 }, { "epoch": 0.13, "learning_rate": 4.3283144992718934e-05, "loss": 0.0783, "step": 150000 }, { "epoch": 0.13, "eval_loss": 0.06954144686460495, "eval_runtime": 1688.3337, "eval_samples_per_second": 36.742, "eval_steps_per_second": 36.742, "step": 150000 }, { "epoch": 0.14, "learning_rate": 4.3059249825809564e-05, "loss": 0.072, "step": 155000 }, { "epoch": 0.14, "learning_rate": 4.2835354658900195e-05, "loss": 0.0824, "step": 160000 }, { "epoch": 0.15, "learning_rate": 4.2611459491990825e-05, "loss": 0.0779, "step": 165000 }, { "epoch": 0.15, "learning_rate": 4.2387564325081456e-05, "loss": 0.0733, "step": 170000 }, { "epoch": 0.16, "learning_rate": 4.2163669158172086e-05, "loss": 0.0784, "step": 175000 }, { "epoch": 0.16, "learning_rate": 4.1939773991262716e-05, "loss": 0.0745, "step": 180000 }, { "epoch": 0.17, "learning_rate": 4.171587882435335e-05, "loss": 0.0778, "step": 185000 }, { "epoch": 0.17, "learning_rate": 4.149198365744398e-05, "loss": 0.0753, "step": 190000 }, { "epoch": 0.17, "learning_rate": 4.1268088490534614e-05, "loss": 0.0735, "step": 195000 }, { "epoch": 0.18, "learning_rate": 4.104419332362524e-05, "loss": 0.072, "step": 200000 }, { "epoch": 0.18, "learning_rate": 4.082029815671587e-05, "loss": 0.0752, "step": 205000 }, { "epoch": 0.19, "learning_rate": 4.0596402989806505e-05, "loss": 0.0759, "step": 210000 }, { "epoch": 0.19, "learning_rate": 4.037250782289713e-05, "loss": 0.0715, "step": 215000 }, { "epoch": 0.2, "learning_rate": 4.0148612655987766e-05, "loss": 0.0734, "step": 220000 }, { "epoch": 0.2, "learning_rate": 3.9924717489078396e-05, "loss": 0.0719, "step": 225000 }, { "epoch": 0.2, "eval_loss": 0.07320380210876465, "eval_runtime": 1689.2398, "eval_samples_per_second": 36.722, "eval_steps_per_second": 36.722, "step": 225000 }, { "epoch": 0.21, "learning_rate": 3.970082232216903e-05, "loss": 0.0782, "step": 230000 }, { "epoch": 0.21, "learning_rate": 3.947692715525966e-05, "loss": 0.0689, "step": 235000 }, { "epoch": 0.21, "learning_rate": 3.925303198835029e-05, "loss": 0.0721, "step": 240000 }, { "epoch": 0.22, "learning_rate": 3.902913682144092e-05, "loss": 0.0717, "step": 245000 }, { "epoch": 0.22, "learning_rate": 3.880524165453155e-05, "loss": 0.0751, "step": 250000 }, { "epoch": 0.23, "learning_rate": 3.8581346487622186e-05, "loss": 0.075, "step": 255000 }, { "epoch": 0.23, "learning_rate": 3.835745132071281e-05, "loss": 0.0774, "step": 260000 }, { "epoch": 0.24, "learning_rate": 3.8133556153803446e-05, "loss": 0.0729, "step": 265000 }, { "epoch": 0.24, "learning_rate": 3.790966098689408e-05, "loss": 0.0666, "step": 270000 }, { "epoch": 0.25, "learning_rate": 3.76857658199847e-05, "loss": 0.0668, "step": 275000 }, { "epoch": 0.25, "learning_rate": 3.746187065307534e-05, "loss": 0.0688, "step": 280000 }, { "epoch": 0.26, "learning_rate": 3.723797548616597e-05, "loss": 0.0732, "step": 285000 }, { "epoch": 0.26, "learning_rate": 3.70140803192566e-05, "loss": 0.0729, "step": 290000 }, { "epoch": 0.26, "learning_rate": 3.679018515234723e-05, "loss": 0.0746, "step": 295000 }, { "epoch": 0.27, "learning_rate": 3.6566289985437866e-05, "loss": 0.0743, "step": 300000 }, { "epoch": 0.27, "eval_loss": 0.06632131338119507, "eval_runtime": 1749.4291, "eval_samples_per_second": 35.459, "eval_steps_per_second": 35.459, "step": 300000 }, { "epoch": 0.27, "learning_rate": 3.634239481852849e-05, "loss": 0.0689, "step": 305000 }, { "epoch": 0.28, "learning_rate": 3.611849965161912e-05, "loss": 0.0705, "step": 310000 }, { "epoch": 0.28, "learning_rate": 3.589460448470976e-05, "loss": 0.0691, "step": 315000 }, { "epoch": 0.29, "learning_rate": 3.567070931780038e-05, "loss": 0.0673, "step": 320000 }, { "epoch": 0.29, "learning_rate": 3.544681415089102e-05, "loss": 0.0713, "step": 325000 }, { "epoch": 0.3, "learning_rate": 3.522291898398165e-05, "loss": 0.0704, "step": 330000 }, { "epoch": 0.3, "learning_rate": 3.499902381707228e-05, "loss": 0.0734, "step": 335000 }, { "epoch": 0.3, "learning_rate": 3.477512865016291e-05, "loss": 0.0731, "step": 340000 }, { "epoch": 0.31, "learning_rate": 3.455123348325354e-05, "loss": 0.0682, "step": 345000 }, { "epoch": 0.31, "learning_rate": 3.432733831634417e-05, "loss": 0.0719, "step": 350000 }, { "epoch": 0.32, "learning_rate": 3.41034431494348e-05, "loss": 0.0724, "step": 355000 }, { "epoch": 0.32, "learning_rate": 3.387954798252544e-05, "loss": 0.0677, "step": 360000 }, { "epoch": 0.33, "learning_rate": 3.365565281561606e-05, "loss": 0.0697, "step": 365000 }, { "epoch": 0.33, "learning_rate": 3.343175764870669e-05, "loss": 0.0759, "step": 370000 }, { "epoch": 0.34, "learning_rate": 3.320786248179733e-05, "loss": 0.0659, "step": 375000 }, { "epoch": 0.34, "eval_loss": 0.06863044947385788, "eval_runtime": 1687.8956, "eval_samples_per_second": 36.752, "eval_steps_per_second": 36.752, "step": 375000 }, { "epoch": 0.34, "learning_rate": 3.298396731488795e-05, "loss": 0.0704, "step": 380000 }, { "epoch": 0.34, "learning_rate": 3.276007214797859e-05, "loss": 0.0711, "step": 385000 }, { "epoch": 0.35, "learning_rate": 3.253617698106922e-05, "loss": 0.0689, "step": 390000 }, { "epoch": 0.35, "learning_rate": 3.231228181415985e-05, "loss": 0.0689, "step": 395000 }, { "epoch": 0.36, "learning_rate": 3.208838664725048e-05, "loss": 0.0657, "step": 400000 }, { "epoch": 0.36, "learning_rate": 3.186449148034111e-05, "loss": 0.0695, "step": 405000 }, { "epoch": 0.37, "learning_rate": 3.164059631343174e-05, "loss": 0.07, "step": 410000 }, { "epoch": 0.37, "learning_rate": 3.141670114652237e-05, "loss": 0.0654, "step": 415000 }, { "epoch": 0.38, "learning_rate": 3.1192805979613e-05, "loss": 0.0668, "step": 420000 }, { "epoch": 0.38, "learning_rate": 3.096891081270363e-05, "loss": 0.0692, "step": 425000 }, { "epoch": 0.39, "learning_rate": 3.074501564579427e-05, "loss": 0.0692, "step": 430000 }, { "epoch": 0.39, "learning_rate": 3.052112047888489e-05, "loss": 0.0707, "step": 435000 }, { "epoch": 0.39, "learning_rate": 3.0297225311975523e-05, "loss": 0.0677, "step": 440000 }, { "epoch": 0.4, "learning_rate": 3.007333014506616e-05, "loss": 0.0721, "step": 445000 }, { "epoch": 0.4, "learning_rate": 2.9849434978156787e-05, "loss": 0.0664, "step": 450000 }, { "epoch": 0.4, "eval_loss": 0.06826464831829071, "eval_runtime": 1692.5106, "eval_samples_per_second": 36.651, "eval_steps_per_second": 36.651, "step": 450000 }, { "epoch": 0.41, "learning_rate": 2.962553981124742e-05, "loss": 0.0647, "step": 455000 }, { "epoch": 0.41, "learning_rate": 2.940164464433805e-05, "loss": 0.0683, "step": 460000 }, { "epoch": 0.42, "learning_rate": 2.9177749477428685e-05, "loss": 0.0693, "step": 465000 }, { "epoch": 0.42, "learning_rate": 2.8953854310519312e-05, "loss": 0.0677, "step": 470000 }, { "epoch": 0.43, "learning_rate": 2.8729959143609942e-05, "loss": 0.0674, "step": 475000 }, { "epoch": 0.43, "learning_rate": 2.8506063976700576e-05, "loss": 0.0675, "step": 480000 }, { "epoch": 0.43, "learning_rate": 2.8282168809791203e-05, "loss": 0.0676, "step": 485000 }, { "epoch": 0.44, "learning_rate": 2.8058273642881837e-05, "loss": 0.066, "step": 490000 }, { "epoch": 0.44, "learning_rate": 2.7834378475972467e-05, "loss": 0.0692, "step": 495000 }, { "epoch": 0.45, "learning_rate": 2.76104833090631e-05, "loss": 0.0709, "step": 500000 }, { "epoch": 0.45, "learning_rate": 2.7386588142153728e-05, "loss": 0.0651, "step": 505000 }, { "epoch": 0.46, "learning_rate": 2.716269297524436e-05, "loss": 0.0684, "step": 510000 }, { "epoch": 0.46, "learning_rate": 2.6938797808334992e-05, "loss": 0.067, "step": 515000 }, { "epoch": 0.47, "learning_rate": 2.6714902641425623e-05, "loss": 0.0631, "step": 520000 }, { "epoch": 0.47, "learning_rate": 2.6491007474516256e-05, "loss": 0.0637, "step": 525000 }, { "epoch": 0.47, "eval_loss": 0.06799625605344772, "eval_runtime": 1690.2201, "eval_samples_per_second": 36.701, "eval_steps_per_second": 36.701, "step": 525000 }, { "epoch": 0.47, "learning_rate": 2.6267112307606883e-05, "loss": 0.0691, "step": 530000 }, { "epoch": 0.48, "learning_rate": 2.6043217140697517e-05, "loss": 0.0707, "step": 535000 }, { "epoch": 0.48, "learning_rate": 2.5819321973788147e-05, "loss": 0.0692, "step": 540000 }, { "epoch": 0.49, "learning_rate": 2.5595426806878774e-05, "loss": 0.067, "step": 545000 }, { "epoch": 0.49, "learning_rate": 2.5371531639969408e-05, "loss": 0.067, "step": 550000 }, { "epoch": 0.5, "learning_rate": 2.514763647306004e-05, "loss": 0.0734, "step": 555000 }, { "epoch": 0.5, "learning_rate": 2.492374130615067e-05, "loss": 0.0663, "step": 560000 }, { "epoch": 0.51, "learning_rate": 2.46998461392413e-05, "loss": 0.0689, "step": 565000 }, { "epoch": 0.51, "learning_rate": 2.4475950972331933e-05, "loss": 0.0642, "step": 570000 } ], "max_steps": 1116594, "num_train_epochs": 1, "total_flos": 1.7128824397599744e+17, "trial_name": null, "trial_params": null }