{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.88888888888889, "eval_steps": 256, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23, "learning_rate": 4.872000000000001e-05, "loss": 0.8173, "step": 256 }, { "epoch": 0.23, "eval_test_accuracy": 0.09690721649484536, "eval_test_loss": 0.4266754984855652, "eval_test_runtime": 233.5115, "eval_test_samples_per_second": 10.385, "eval_test_steps_per_second": 1.302, "step": 256 }, { "epoch": 0.46, "learning_rate": 4.744e-05, "loss": 0.441, "step": 512 }, { "epoch": 0.46, "eval_test_accuracy": 0.16742268041237113, "eval_test_loss": 0.36029067635536194, "eval_test_runtime": 232.4589, "eval_test_samples_per_second": 10.432, "eval_test_steps_per_second": 1.308, "step": 512 }, { "epoch": 0.68, "learning_rate": 4.6160000000000005e-05, "loss": 0.397, "step": 768 }, { "epoch": 0.68, "eval_test_accuracy": 0.17649484536082474, "eval_test_loss": 0.3348633348941803, "eval_test_runtime": 227.1069, "eval_test_samples_per_second": 10.678, "eval_test_steps_per_second": 1.339, "step": 768 }, { "epoch": 0.91, "learning_rate": 4.488e-05, "loss": 0.3611, "step": 1024 }, { "epoch": 0.91, "eval_test_accuracy": 0.23010309278350516, "eval_test_loss": 0.3209882974624634, "eval_test_runtime": 226.4132, "eval_test_samples_per_second": 10.711, "eval_test_steps_per_second": 1.343, "step": 1024 }, { "epoch": 1.14, "learning_rate": 4.36e-05, "loss": 0.3469, "step": 1280 }, { "epoch": 1.14, "eval_test_accuracy": 0.19505154639175257, "eval_test_loss": 0.31938230991363525, "eval_test_runtime": 227.1405, "eval_test_samples_per_second": 10.676, "eval_test_steps_per_second": 1.338, "step": 1280 }, { "epoch": 1.37, "learning_rate": 4.232e-05, "loss": 0.3301, "step": 1536 }, { "epoch": 1.37, "eval_test_accuracy": 0.21608247422680413, "eval_test_loss": 0.3045227825641632, "eval_test_runtime": 229.033, "eval_test_samples_per_second": 10.588, "eval_test_steps_per_second": 1.327, "step": 1536 }, { "epoch": 1.59, "learning_rate": 4.104e-05, "loss": 0.3194, "step": 1792 }, { "epoch": 1.59, "eval_test_accuracy": 0.2354639175257732, "eval_test_loss": 0.2938610017299652, "eval_test_runtime": 225.016, "eval_test_samples_per_second": 10.777, "eval_test_steps_per_second": 1.351, "step": 1792 }, { "epoch": 1.82, "learning_rate": 3.9760000000000006e-05, "loss": 0.3189, "step": 2048 }, { "epoch": 1.82, "eval_test_accuracy": 0.23876288659793815, "eval_test_loss": 0.2826090455055237, "eval_test_runtime": 235.6637, "eval_test_samples_per_second": 10.29, "eval_test_steps_per_second": 1.29, "step": 2048 }, { "epoch": 2.05, "learning_rate": 3.848e-05, "loss": 0.3031, "step": 2304 }, { "epoch": 2.05, "eval_test_accuracy": 0.23752577319587628, "eval_test_loss": 0.276397705078125, "eval_test_runtime": 228.4566, "eval_test_samples_per_second": 10.615, "eval_test_steps_per_second": 1.331, "step": 2304 }, { "epoch": 2.28, "learning_rate": 3.72e-05, "loss": 0.3008, "step": 2560 }, { "epoch": 2.28, "eval_test_accuracy": 0.231340206185567, "eval_test_loss": 0.27680695056915283, "eval_test_runtime": 232.6253, "eval_test_samples_per_second": 10.424, "eval_test_steps_per_second": 1.307, "step": 2560 }, { "epoch": 2.5, "learning_rate": 3.592e-05, "loss": 0.2941, "step": 2816 }, { "epoch": 2.5, "eval_test_accuracy": 0.24082474226804124, "eval_test_loss": 0.27100667357444763, "eval_test_runtime": 234.9203, "eval_test_samples_per_second": 10.323, "eval_test_steps_per_second": 1.294, "step": 2816 }, { "epoch": 2.73, "learning_rate": 3.464e-05, "loss": 0.2855, "step": 3072 }, { "epoch": 2.73, "eval_test_accuracy": 0.2552577319587629, "eval_test_loss": 0.26808857917785645, "eval_test_runtime": 233.699, "eval_test_samples_per_second": 10.377, "eval_test_steps_per_second": 1.301, "step": 3072 }, { "epoch": 2.96, "learning_rate": 3.336e-05, "loss": 0.2827, "step": 3328 }, { "epoch": 2.96, "eval_test_accuracy": 0.23876288659793815, "eval_test_loss": 0.26740676164627075, "eval_test_runtime": 232.615, "eval_test_samples_per_second": 10.425, "eval_test_steps_per_second": 1.307, "step": 3328 }, { "epoch": 3.19, "learning_rate": 3.208e-05, "loss": 0.2824, "step": 3584 }, { "epoch": 3.19, "eval_test_accuracy": 0.23670103092783504, "eval_test_loss": 0.26493921875953674, "eval_test_runtime": 234.3958, "eval_test_samples_per_second": 10.346, "eval_test_steps_per_second": 1.297, "step": 3584 }, { "epoch": 3.41, "learning_rate": 3.08e-05, "loss": 0.2699, "step": 3840 }, { "epoch": 3.41, "eval_test_accuracy": 0.24082474226804124, "eval_test_loss": 0.2653910517692566, "eval_test_runtime": 236.6454, "eval_test_samples_per_second": 10.247, "eval_test_steps_per_second": 1.285, "step": 3840 }, { "epoch": 3.64, "learning_rate": 2.9520000000000002e-05, "loss": 0.2744, "step": 4096 }, { "epoch": 3.64, "eval_test_accuracy": 0.2503092783505155, "eval_test_loss": 0.25940871238708496, "eval_test_runtime": 230.5104, "eval_test_samples_per_second": 10.52, "eval_test_steps_per_second": 1.319, "step": 4096 }, { "epoch": 3.87, "learning_rate": 2.824e-05, "loss": 0.2679, "step": 4352 }, { "epoch": 3.87, "eval_test_accuracy": 0.24824742268041236, "eval_test_loss": 0.25839442014694214, "eval_test_runtime": 279.8888, "eval_test_samples_per_second": 8.664, "eval_test_steps_per_second": 1.086, "step": 4352 }, { "epoch": 4.1, "learning_rate": 2.6960000000000003e-05, "loss": 0.27, "step": 4608 }, { "epoch": 4.1, "eval_test_accuracy": 0.2465979381443299, "eval_test_loss": 0.25590091943740845, "eval_test_runtime": 229.037, "eval_test_samples_per_second": 10.588, "eval_test_steps_per_second": 1.327, "step": 4608 }, { "epoch": 4.32, "learning_rate": 2.5679999999999998e-05, "loss": 0.2644, "step": 4864 }, { "epoch": 4.32, "eval_test_accuracy": 0.23835051546391753, "eval_test_loss": 0.25387293100357056, "eval_test_runtime": 231.73, "eval_test_samples_per_second": 10.465, "eval_test_steps_per_second": 1.312, "step": 4864 }, { "epoch": 4.55, "learning_rate": 2.44e-05, "loss": 0.2664, "step": 5120 }, { "epoch": 4.55, "eval_test_accuracy": 0.24783505154639177, "eval_test_loss": 0.2522536516189575, "eval_test_runtime": 233.9019, "eval_test_samples_per_second": 10.368, "eval_test_steps_per_second": 1.3, "step": 5120 }, { "epoch": 4.78, "learning_rate": 2.312e-05, "loss": 0.2557, "step": 5376 }, { "epoch": 4.78, "eval_test_accuracy": 0.23628865979381444, "eval_test_loss": 0.2543907165527344, "eval_test_runtime": 230.6139, "eval_test_samples_per_second": 10.515, "eval_test_steps_per_second": 1.318, "step": 5376 }, { "epoch": 5.01, "learning_rate": 2.184e-05, "loss": 0.2544, "step": 5632 }, { "epoch": 5.01, "eval_test_accuracy": 0.2445360824742268, "eval_test_loss": 0.24817436933517456, "eval_test_runtime": 231.9449, "eval_test_samples_per_second": 10.455, "eval_test_steps_per_second": 1.311, "step": 5632 }, { "epoch": 5.23, "learning_rate": 2.0560000000000003e-05, "loss": 0.2549, "step": 5888 }, { "epoch": 5.23, "eval_test_accuracy": 0.24618556701030928, "eval_test_loss": 0.24915936589241028, "eval_test_runtime": 226.5831, "eval_test_samples_per_second": 10.702, "eval_test_steps_per_second": 1.342, "step": 5888 }, { "epoch": 5.46, "learning_rate": 1.9280000000000002e-05, "loss": 0.2545, "step": 6144 }, { "epoch": 5.46, "eval_test_accuracy": 0.2465979381443299, "eval_test_loss": 0.2478923350572586, "eval_test_runtime": 233.3756, "eval_test_samples_per_second": 10.391, "eval_test_steps_per_second": 1.303, "step": 6144 }, { "epoch": 5.69, "learning_rate": 1.8e-05, "loss": 0.2539, "step": 6400 }, { "epoch": 5.69, "eval_test_accuracy": 0.24989690721649485, "eval_test_loss": 0.24747271835803986, "eval_test_runtime": 231.4653, "eval_test_samples_per_second": 10.477, "eval_test_steps_per_second": 1.313, "step": 6400 }, { "epoch": 5.92, "learning_rate": 1.672e-05, "loss": 0.2466, "step": 6656 }, { "epoch": 5.92, "eval_test_accuracy": 0.24329896907216494, "eval_test_loss": 0.24743035435676575, "eval_test_runtime": 233.3396, "eval_test_samples_per_second": 10.393, "eval_test_steps_per_second": 1.303, "step": 6656 }, { "epoch": 6.14, "learning_rate": 1.544e-05, "loss": 0.2507, "step": 6912 }, { "epoch": 6.14, "eval_test_accuracy": 0.25237113402061856, "eval_test_loss": 0.24656306207180023, "eval_test_runtime": 228.6153, "eval_test_samples_per_second": 10.607, "eval_test_steps_per_second": 1.33, "step": 6912 }, { "epoch": 6.37, "learning_rate": 1.4160000000000002e-05, "loss": 0.2456, "step": 7168 }, { "epoch": 6.37, "eval_test_accuracy": 0.24701030927835052, "eval_test_loss": 0.24853350222110748, "eval_test_runtime": 228.5698, "eval_test_samples_per_second": 10.609, "eval_test_steps_per_second": 1.33, "step": 7168 }, { "epoch": 6.6, "learning_rate": 1.288e-05, "loss": 0.241, "step": 7424 }, { "epoch": 6.6, "eval_test_accuracy": 0.2490721649484536, "eval_test_loss": 0.2469691038131714, "eval_test_runtime": 225.5172, "eval_test_samples_per_second": 10.753, "eval_test_steps_per_second": 1.348, "step": 7424 }, { "epoch": 6.83, "learning_rate": 1.16e-05, "loss": 0.2443, "step": 7680 }, { "epoch": 6.83, "eval_test_accuracy": 0.24824742268041236, "eval_test_loss": 0.24246640503406525, "eval_test_runtime": 226.9617, "eval_test_samples_per_second": 10.685, "eval_test_steps_per_second": 1.339, "step": 7680 }, { "epoch": 7.05, "learning_rate": 1.0320000000000001e-05, "loss": 0.2429, "step": 7936 }, { "epoch": 7.05, "eval_test_accuracy": 0.2515463917525773, "eval_test_loss": 0.24250201880931854, "eval_test_runtime": 228.9785, "eval_test_samples_per_second": 10.591, "eval_test_steps_per_second": 1.328, "step": 7936 }, { "epoch": 7.28, "learning_rate": 9.04e-06, "loss": 0.241, "step": 8192 }, { "epoch": 7.28, "eval_test_accuracy": 0.25195876288659796, "eval_test_loss": 0.24321790039539337, "eval_test_runtime": 228.3774, "eval_test_samples_per_second": 10.618, "eval_test_steps_per_second": 1.331, "step": 8192 }, { "epoch": 7.51, "learning_rate": 7.76e-06, "loss": 0.2418, "step": 8448 }, { "epoch": 7.51, "eval_test_accuracy": 0.2490721649484536, "eval_test_loss": 0.24221281707286835, "eval_test_runtime": 228.7396, "eval_test_samples_per_second": 10.602, "eval_test_steps_per_second": 1.329, "step": 8448 }, { "epoch": 7.74, "learning_rate": 6.48e-06, "loss": 0.2378, "step": 8704 }, { "epoch": 7.74, "eval_test_accuracy": 0.2465979381443299, "eval_test_loss": 0.24190692603588104, "eval_test_runtime": 228.3602, "eval_test_samples_per_second": 10.619, "eval_test_steps_per_second": 1.331, "step": 8704 }, { "epoch": 7.96, "learning_rate": 5.2e-06, "loss": 0.2388, "step": 8960 }, { "epoch": 7.96, "eval_test_accuracy": 0.25237113402061856, "eval_test_loss": 0.24081237614154816, "eval_test_runtime": 228.954, "eval_test_samples_per_second": 10.592, "eval_test_steps_per_second": 1.328, "step": 8960 }, { "epoch": 8.19, "learning_rate": 3.92e-06, "loss": 0.2304, "step": 9216 }, { "epoch": 8.19, "eval_test_accuracy": 0.257319587628866, "eval_test_loss": 0.24084854125976562, "eval_test_runtime": 227.9624, "eval_test_samples_per_second": 10.638, "eval_test_steps_per_second": 1.334, "step": 9216 }, { "epoch": 8.42, "learning_rate": 2.64e-06, "loss": 0.2423, "step": 9472 }, { "epoch": 8.42, "eval_test_accuracy": 0.2556701030927835, "eval_test_loss": 0.2404756098985672, "eval_test_runtime": 228.8532, "eval_test_samples_per_second": 10.596, "eval_test_steps_per_second": 1.328, "step": 9472 }, { "epoch": 8.65, "learning_rate": 1.36e-06, "loss": 0.2366, "step": 9728 }, { "epoch": 8.65, "eval_test_accuracy": 0.2556701030927835, "eval_test_loss": 0.24009202420711517, "eval_test_runtime": 229.0313, "eval_test_samples_per_second": 10.588, "eval_test_steps_per_second": 1.327, "step": 9728 }, { "epoch": 8.87, "learning_rate": 8e-08, "loss": 0.2321, "step": 9984 }, { "epoch": 8.87, "eval_test_accuracy": 0.25278350515463915, "eval_test_loss": 0.23984022438526154, "eval_test_runtime": 231.7368, "eval_test_samples_per_second": 10.464, "eval_test_steps_per_second": 1.312, "step": 9984 } ], "logging_steps": 256, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 256, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }