{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9934640522875817, "global_step": 304, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 0.0001125, "loss": 3.8539, "step": 3 }, { "epoch": 0.08, "learning_rate": 0.000225, "loss": 3.5808, "step": 6 }, { "epoch": 0.12, "learning_rate": 0.00033749999999999996, "loss": 3.335, "step": 9 }, { "epoch": 0.16, "learning_rate": 0.00045, "loss": 3.2031, "step": 12 }, { "epoch": 0.2, "learning_rate": 0.0005625, "loss": 3.0706, "step": 15 }, { "epoch": 0.24, "learning_rate": 0.0005999286081239726, "loss": 3.084, "step": 18 }, { "epoch": 0.27, "learning_rate": 0.0005995538936819289, "loss": 2.9744, "step": 21 }, { "epoch": 0.31, "learning_rate": 0.0005988584094275236, "loss": 2.8987, "step": 24 }, { "epoch": 0.35, "learning_rate": 0.0005978429001027164, "loss": 2.8293, "step": 27 }, { "epoch": 0.39, "learning_rate": 0.0005965084531403281, "loss": 2.7609, "step": 30 }, { "epoch": 0.43, "learning_rate": 0.0005948564974995903, "loss": 2.7155, "step": 33 }, { "epoch": 0.47, "learning_rate": 0.00059288880213598, "loss": 2.7184, "step": 36 }, { "epoch": 0.51, "learning_rate": 0.0005906074741069779, "loss": 2.6533, "step": 39 }, { "epoch": 0.55, "learning_rate": 0.0005880149563157786, "loss": 2.6509, "step": 42 }, { "epoch": 0.59, "learning_rate": 0.0005851140248953683, "loss": 2.5859, "step": 45 }, { "epoch": 0.63, "learning_rate": 0.0005819077862357724, "loss": 2.5516, "step": 48 }, { "epoch": 0.67, "learning_rate": 0.0005783996736576553, "loss": 2.5855, "step": 51 }, { "epoch": 0.71, "learning_rate": 0.0005745934437358341, "loss": 2.5306, "step": 54 }, { "epoch": 0.75, "learning_rate": 0.0005704931722766448, "loss": 2.5159, "step": 57 }, { "epoch": 0.78, "learning_rate": 0.0005661032499534664, "loss": 2.5304, "step": 60 }, { "epoch": 0.82, "learning_rate": 0.0005614283776050784, "loss": 2.5005, "step": 63 }, { "epoch": 0.86, "learning_rate": 0.0005564735612018839, "loss": 2.4884, "step": 66 }, { "epoch": 0.9, "learning_rate": 0.0005512441064853923, "loss": 2.4729, "step": 69 }, { "epoch": 0.94, "learning_rate": 0.0005457456132866975, "loss": 2.4226, "step": 72 }, { "epoch": 0.98, "learning_rate": 0.0005399839695300389, "loss": 2.412, "step": 75 }, { "epoch": 0.99, "eval_accuracy": 0.5457690353760842, "eval_loss": 2.502744436264038, "eval_runtime": 56.9596, "eval_samples_per_second": 19.154, "eval_steps_per_second": 19.154, "step": 76 }, { "epoch": 1.03, "learning_rate": 0.0005339653449278644, "loss": 2.6265, "step": 78 }, { "epoch": 1.07, "learning_rate": 0.0005276961843741485, "loss": 2.148, "step": 81 }, { "epoch": 1.1, "learning_rate": 0.0005211832010430372, "loss": 2.1056, "step": 84 }, { "epoch": 1.14, "learning_rate": 0.0005144333692002139, "loss": 2.1074, "step": 87 }, { "epoch": 1.18, "learning_rate": 0.0005074539167346808, "loss": 2.118, "step": 90 }, { "epoch": 1.22, "learning_rate": 0.0005002523174189542, "loss": 2.1045, "step": 93 }, { "epoch": 1.26, "learning_rate": 0.0004928362829059618, "loss": 2.061, "step": 96 }, { "epoch": 1.3, "learning_rate": 0.0004852137544712115, "loss": 2.0959, "step": 99 }, { "epoch": 1.34, "learning_rate": 0.0004773928945090747, "loss": 2.0637, "step": 102 }, { "epoch": 1.38, "learning_rate": 0.0004693820777922901, "loss": 2.004, "step": 105 }, { "epoch": 1.42, "learning_rate": 0.00046118988250404714, "loss": 2.0516, "step": 108 }, { "epoch": 1.46, "learning_rate": 0.00045282508105225254, "loss": 2.1182, "step": 111 }, { "epoch": 1.5, "learning_rate": 0.00044429663067581626, "loss": 2.0648, "step": 114 }, { "epoch": 1.54, "learning_rate": 0.0004356136638530159, "loss": 2.058, "step": 117 }, { "epoch": 1.58, "learning_rate": 0.0004267854785222098, "loss": 2.012, "step": 120 }, { "epoch": 1.61, "learning_rate": 0.00041782152812537223, "loss": 2.0105, "step": 123 }, { "epoch": 1.65, "learning_rate": 0.00040873141148511043, "loss": 1.9976, "step": 126 }, { "epoch": 1.69, "learning_rate": 0.00039952486252600565, "loss": 2.0034, "step": 129 }, { "epoch": 1.73, "learning_rate": 0.00039021173985128186, "loss": 2.0171, "step": 132 }, { "epoch": 1.77, "learning_rate": 0.00038080201618596784, "loss": 2.0163, "step": 135 }, { "epoch": 1.81, "learning_rate": 0.0003713057676978519, "loss": 1.9683, "step": 138 }, { "epoch": 1.85, "learning_rate": 0.00036173316320767046, "loss": 2.0209, "step": 141 }, { "epoch": 1.89, "learning_rate": 0.0003520944533000791, "loss": 1.9655, "step": 144 }, { "epoch": 1.93, "learning_rate": 0.0003423999593470703, "loss": 2.0127, "step": 147 }, { "epoch": 1.97, "learning_rate": 0.00033266006245558934, "loss": 1.9702, "step": 150 }, { "epoch": 1.99, "eval_accuracy": 0.5849799255079998, "eval_loss": 2.2756919860839844, "eval_runtime": 56.7417, "eval_samples_per_second": 19.227, "eval_steps_per_second": 19.227, "step": 152 }, { "epoch": 2.01, "learning_rate": 0.00032288519235118573, "loss": 2.1744, "step": 153 }, { "epoch": 2.05, "learning_rate": 0.00031308581620960083, "loss": 1.6022, "step": 156 }, { "epoch": 2.09, "learning_rate": 0.0003032724274482547, "loss": 1.6168, "step": 159 }, { "epoch": 2.13, "learning_rate": 0.0002934555344896317, "loss": 1.5467, "step": 162 }, { "epoch": 2.17, "learning_rate": 0.00028364564950859807, "loss": 1.6023, "step": 165 }, { "epoch": 2.21, "learning_rate": 0.0002738532771757025, "loss": 1.4977, "step": 168 }, { "epoch": 2.25, "learning_rate": 0.0002640889034085113, "loss": 1.5448, "step": 171 }, { "epoch": 2.29, "learning_rate": 0.00025436298414302494, "loss": 1.5738, "step": 174 }, { "epoch": 2.33, "learning_rate": 0.000244685934137201, "loss": 1.559, "step": 177 }, { "epoch": 2.37, "learning_rate": 0.00023506811581856912, "loss": 1.5734, "step": 180 }, { "epoch": 2.41, "learning_rate": 0.00022551982818788506, "loss": 1.4986, "step": 183 }, { "epoch": 2.44, "learning_rate": 0.00021605129579070238, "loss": 1.545, "step": 186 }, { "epoch": 2.48, "learning_rate": 0.00020667265776867276, "loss": 1.5496, "step": 189 }, { "epoch": 2.52, "learning_rate": 0.00019739395700229937, "loss": 1.5426, "step": 192 }, { "epoch": 2.56, "learning_rate": 0.0001882251293567691, "loss": 1.4687, "step": 195 }, { "epoch": 2.6, "learning_rate": 0.00017917599304237886, "loss": 1.5415, "step": 198 }, { "epoch": 2.64, "learning_rate": 0.0001702562381009501, "loss": 1.5289, "step": 201 }, { "epoch": 2.68, "learning_rate": 0.0001614754160294899, "loss": 1.5449, "step": 204 }, { "epoch": 2.72, "learning_rate": 0.0001528429295522076, "loss": 1.5273, "step": 207 }, { "epoch": 2.76, "learning_rate": 0.0001443680225518435, "loss": 1.5146, "step": 210 }, { "epoch": 2.8, "learning_rate": 0.000136059770171087, "loss": 1.5164, "step": 213 }, { "epoch": 2.84, "learning_rate": 0.00012792706909468623, "loss": 1.5239, "step": 216 }, { "epoch": 2.88, "learning_rate": 0.00011997862802265573, "loss": 1.4905, "step": 219 }, { "epoch": 2.92, "learning_rate": 0.00011222295834478227, "loss": 1.4968, "step": 222 }, { "epoch": 2.95, "learning_rate": 0.0001046683650264153, "loss": 1.4934, "step": 225 }, { "epoch": 2.99, "learning_rate": 9.732293771530192e-05, "loss": 1.4628, "step": 228 }, { "epoch": 2.99, "eval_accuracy": 0.6081706452777681, "eval_loss": 2.2162108421325684, "eval_runtime": 57.053, "eval_samples_per_second": 19.123, "eval_steps_per_second": 19.123, "step": 228 }, { "epoch": 3.04, "learning_rate": 9.019454207898983e-05, "loss": 1.4385, "step": 231 }, { "epoch": 3.08, "learning_rate": 8.329081138207334e-05, "loss": 1.2032, "step": 234 }, { "epoch": 3.12, "learning_rate": 7.661913831230212e-05, "loss": 1.1659, "step": 237 }, { "epoch": 3.16, "learning_rate": 7.018666706430662e-05, "loss": 1.1521, "step": 240 }, { "epoch": 3.2, "learning_rate": 6.40002856894149e-05, "loss": 1.1916, "step": 243 }, { "epoch": 3.24, "learning_rate": 5.8066618719755195e-05, "loss": 1.163, "step": 246 }, { "epoch": 3.27, "learning_rate": 5.239202007454086e-05, "loss": 1.1624, "step": 249 }, { "epoch": 3.31, "learning_rate": 4.698256625613435e-05, "loss": 1.1203, "step": 252 }, { "epoch": 3.35, "learning_rate": 4.1844049843176334e-05, "loss": 1.1709, "step": 255 }, { "epoch": 3.39, "learning_rate": 3.698197328774769e-05, "loss": 1.1263, "step": 258 }, { "epoch": 3.43, "learning_rate": 3.2401543023205764e-05, "loss": 1.1277, "step": 261 }, { "epoch": 3.47, "learning_rate": 2.8107663889005016e-05, "loss": 1.1759, "step": 264 }, { "epoch": 3.51, "learning_rate": 2.410493387847232e-05, "loss": 1.157, "step": 267 }, { "epoch": 3.55, "learning_rate": 2.0397639215160466e-05, "loss": 1.1721, "step": 270 }, { "epoch": 3.59, "learning_rate": 1.698974976305243e-05, "loss": 1.1577, "step": 273 }, { "epoch": 3.63, "learning_rate": 1.3884914775531952e-05, "loss": 1.1546, "step": 276 }, { "epoch": 3.67, "learning_rate": 1.1086458987671187e-05, "loss": 1.1564, "step": 279 }, { "epoch": 3.71, "learning_rate": 8.59737905602157e-06, "loss": 1.1241, "step": 282 }, { "epoch": 3.75, "learning_rate": 6.4203403497185e-06, "loss": 1.1605, "step": 285 }, { "epoch": 3.78, "learning_rate": 4.557674096337593e-06, "loss": 1.1114, "step": 288 }, { "epoch": 3.82, "learning_rate": 3.011374885557638e-06, "loss": 1.1377, "step": 291 }, { "epoch": 3.86, "learning_rate": 1.783098533304106e-06, "loss": 1.1541, "step": 294 }, { "epoch": 3.9, "learning_rate": 8.741603086600102e-07, "loss": 1.174, "step": 297 }, { "epoch": 3.94, "learning_rate": 2.855335254426605e-07, "loss": 1.1415, "step": 300 }, { "epoch": 3.98, "learning_rate": 1.7848499955075423e-08, "loss": 1.1662, "step": 303 }, { "epoch": 3.99, "eval_accuracy": 0.611281497151223, "eval_loss": 2.2855756282806396, "eval_runtime": 56.9619, "eval_samples_per_second": 19.153, "eval_steps_per_second": 19.153, "step": 304 }, { "epoch": 3.99, "step": 304, "total_flos": 2.9714731304484864e+16, "train_loss": 1.8919498779271777, "train_runtime": 5682.4938, "train_samples_per_second": 6.892, "train_steps_per_second": 0.053 } ], "max_steps": 304, "num_train_epochs": 4, "total_flos": 2.9714731304484864e+16, "trial_name": null, "trial_params": null }