{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 78.1366195678711, "learning_rate": 2.5e-05, "loss": 1.9894, "step": 1 }, { "epoch": 0.2, "eval_accuracy": 0.55, "eval_loss": 1.7598272562026978, "eval_runtime": 1.0005, "eval_samples_per_second": 99.95, "eval_steps_per_second": 2.999, "step": 1 }, { "epoch": 0.4, "grad_norm": 73.63365936279297, "learning_rate": 5e-05, "loss": 1.8056, "step": 2 }, { "epoch": 0.4, "eval_accuracy": 0.55, "eval_loss": 1.6865791082382202, "eval_runtime": 0.9969, "eval_samples_per_second": 100.314, "eval_steps_per_second": 3.009, "step": 2 }, { "epoch": 0.6, "grad_norm": 85.16033172607422, "learning_rate": 4.8958333333333335e-05, "loss": 1.9709, "step": 3 }, { "epoch": 0.6, "eval_accuracy": 0.55, "eval_loss": 1.3770341873168945, "eval_runtime": 1.0002, "eval_samples_per_second": 99.978, "eval_steps_per_second": 2.999, "step": 3 }, { "epoch": 0.8, "grad_norm": 69.2215347290039, "learning_rate": 4.791666666666667e-05, "loss": 1.4335, "step": 4 }, { "epoch": 0.8, "eval_accuracy": 0.54, "eval_loss": 1.0766552686691284, "eval_runtime": 0.9982, "eval_samples_per_second": 100.18, "eval_steps_per_second": 3.005, "step": 4 }, { "epoch": 1.0, "grad_norm": 69.37948608398438, "learning_rate": 4.6875e-05, "loss": 1.3065, "step": 5 }, { "epoch": 1.0, "eval_accuracy": 0.49, "eval_loss": 0.8532812595367432, "eval_runtime": 0.9986, "eval_samples_per_second": 100.144, "eval_steps_per_second": 3.004, "step": 5 }, { "epoch": 1.2, "grad_norm": 15.690597534179688, "learning_rate": 4.5833333333333334e-05, "loss": 0.8055, "step": 6 }, { "epoch": 1.2, "eval_accuracy": 0.43, "eval_loss": 0.8792579174041748, "eval_runtime": 0.9976, "eval_samples_per_second": 100.242, "eval_steps_per_second": 3.007, "step": 6 }, { "epoch": 1.4, "grad_norm": 20.13286018371582, "learning_rate": 4.4791666666666673e-05, "loss": 0.7901, "step": 7 }, { "epoch": 1.4, "eval_accuracy": 0.44, "eval_loss": 0.9624806046485901, "eval_runtime": 0.9981, "eval_samples_per_second": 100.193, "eval_steps_per_second": 3.006, "step": 7 }, { "epoch": 1.6, "grad_norm": 5.53256893157959, "learning_rate": 4.375e-05, "loss": 0.7524, "step": 8 }, { "epoch": 1.6, "eval_accuracy": 0.45, "eval_loss": 0.9771338105201721, "eval_runtime": 0.997, "eval_samples_per_second": 100.301, "eval_steps_per_second": 3.009, "step": 8 }, { "epoch": 1.8, "grad_norm": 29.057296752929688, "learning_rate": 4.270833333333333e-05, "loss": 0.7744, "step": 9 }, { "epoch": 1.8, "eval_accuracy": 0.41, "eval_loss": 0.9202635884284973, "eval_runtime": 0.997, "eval_samples_per_second": 100.301, "eval_steps_per_second": 3.009, "step": 9 }, { "epoch": 2.0, "grad_norm": 23.430646896362305, "learning_rate": 4.166666666666667e-05, "loss": 0.7863, "step": 10 }, { "epoch": 2.0, "eval_accuracy": 0.44, "eval_loss": 0.8504493832588196, "eval_runtime": 0.9988, "eval_samples_per_second": 100.118, "eval_steps_per_second": 3.004, "step": 10 }, { "epoch": 2.2, "grad_norm": 17.951738357543945, "learning_rate": 4.0625000000000005e-05, "loss": 0.6977, "step": 11 }, { "epoch": 2.2, "eval_accuracy": 0.43, "eval_loss": 0.8065915703773499, "eval_runtime": 0.9996, "eval_samples_per_second": 100.035, "eval_steps_per_second": 3.001, "step": 11 }, { "epoch": 2.4, "grad_norm": 16.09232521057129, "learning_rate": 3.958333333333333e-05, "loss": 0.75, "step": 12 }, { "epoch": 2.4, "eval_accuracy": 0.43, "eval_loss": 0.7995800971984863, "eval_runtime": 0.9964, "eval_samples_per_second": 100.364, "eval_steps_per_second": 3.011, "step": 12 }, { "epoch": 2.6, "grad_norm": 16.51948356628418, "learning_rate": 3.854166666666667e-05, "loss": 0.8325, "step": 13 }, { "epoch": 2.6, "eval_accuracy": 0.44, "eval_loss": 0.7968456745147705, "eval_runtime": 0.9962, "eval_samples_per_second": 100.386, "eval_steps_per_second": 3.012, "step": 13 }, { "epoch": 2.8, "grad_norm": 18.92729949951172, "learning_rate": 3.7500000000000003e-05, "loss": 0.7324, "step": 14 }, { "epoch": 2.8, "eval_accuracy": 0.46, "eval_loss": 0.792890727519989, "eval_runtime": 0.9961, "eval_samples_per_second": 100.395, "eval_steps_per_second": 3.012, "step": 14 }, { "epoch": 3.0, "grad_norm": 8.970295906066895, "learning_rate": 3.6458333333333336e-05, "loss": 0.7882, "step": 15 }, { "epoch": 3.0, "eval_accuracy": 0.43, "eval_loss": 0.786152184009552, "eval_runtime": 0.999, "eval_samples_per_second": 100.104, "eval_steps_per_second": 3.003, "step": 15 }, { "epoch": 3.2, "grad_norm": 7.676571369171143, "learning_rate": 3.541666666666667e-05, "loss": 0.7451, "step": 16 }, { "epoch": 3.2, "eval_accuracy": 0.44, "eval_loss": 0.785478413105011, "eval_runtime": 0.9973, "eval_samples_per_second": 100.272, "eval_steps_per_second": 3.008, "step": 16 }, { "epoch": 3.4, "grad_norm": 17.803117752075195, "learning_rate": 3.4375e-05, "loss": 0.7071, "step": 17 }, { "epoch": 3.4, "eval_accuracy": 0.44, "eval_loss": 0.7892186045646667, "eval_runtime": 1.0007, "eval_samples_per_second": 99.933, "eval_steps_per_second": 2.998, "step": 17 }, { "epoch": 3.6, "grad_norm": 16.301389694213867, "learning_rate": 3.3333333333333335e-05, "loss": 0.7665, "step": 18 }, { "epoch": 3.6, "eval_accuracy": 0.44, "eval_loss": 0.8022948503494263, "eval_runtime": 0.9993, "eval_samples_per_second": 100.071, "eval_steps_per_second": 3.002, "step": 18 }, { "epoch": 3.8, "grad_norm": 9.103254318237305, "learning_rate": 3.229166666666667e-05, "loss": 0.7503, "step": 19 }, { "epoch": 3.8, "eval_accuracy": 0.44, "eval_loss": 0.8114063739776611, "eval_runtime": 0.9942, "eval_samples_per_second": 100.58, "eval_steps_per_second": 3.017, "step": 19 }, { "epoch": 4.0, "grad_norm": 14.845060348510742, "learning_rate": 3.125e-05, "loss": 0.6844, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.47, "eval_loss": 0.8062010407447815, "eval_runtime": 0.9971, "eval_samples_per_second": 100.295, "eval_steps_per_second": 3.009, "step": 20 }, { "epoch": 4.2, "grad_norm": 19.840051651000977, "learning_rate": 3.0208333333333334e-05, "loss": 0.7454, "step": 21 }, { "epoch": 4.2, "eval_accuracy": 0.44, "eval_loss": 0.7904101610183716, "eval_runtime": 0.9973, "eval_samples_per_second": 100.271, "eval_steps_per_second": 3.008, "step": 21 }, { "epoch": 4.4, "grad_norm": 3.92164945602417, "learning_rate": 2.916666666666667e-05, "loss": 0.6977, "step": 22 }, { "epoch": 4.4, "eval_accuracy": 0.45, "eval_loss": 0.7847166657447815, "eval_runtime": 0.9986, "eval_samples_per_second": 100.142, "eval_steps_per_second": 3.004, "step": 22 }, { "epoch": 4.6, "grad_norm": 7.615128040313721, "learning_rate": 2.8125000000000003e-05, "loss": 0.7374, "step": 23 }, { "epoch": 4.6, "eval_accuracy": 0.46, "eval_loss": 0.7748146057128906, "eval_runtime": 0.9975, "eval_samples_per_second": 100.247, "eval_steps_per_second": 3.007, "step": 23 }, { "epoch": 4.8, "grad_norm": 7.336858749389648, "learning_rate": 2.7083333333333332e-05, "loss": 0.6703, "step": 24 }, { "epoch": 4.8, "eval_accuracy": 0.48, "eval_loss": 0.7589452266693115, "eval_runtime": 0.9975, "eval_samples_per_second": 100.247, "eval_steps_per_second": 3.007, "step": 24 }, { "epoch": 5.0, "grad_norm": 8.808582305908203, "learning_rate": 2.604166666666667e-05, "loss": 0.6783, "step": 25 }, { "epoch": 5.0, "eval_accuracy": 0.49, "eval_loss": 0.7491601705551147, "eval_runtime": 0.9972, "eval_samples_per_second": 100.279, "eval_steps_per_second": 3.008, "step": 25 }, { "epoch": 5.2, "grad_norm": 3.35099720954895, "learning_rate": 2.5e-05, "loss": 0.6878, "step": 26 }, { "epoch": 5.2, "eval_accuracy": 0.47, "eval_loss": 0.7470411062240601, "eval_runtime": 0.9968, "eval_samples_per_second": 100.316, "eval_steps_per_second": 3.009, "step": 26 }, { "epoch": 5.4, "grad_norm": 6.638521194458008, "learning_rate": 2.3958333333333334e-05, "loss": 0.6909, "step": 27 }, { "epoch": 5.4, "eval_accuracy": 0.48, "eval_loss": 0.74560546875, "eval_runtime": 0.9981, "eval_samples_per_second": 100.19, "eval_steps_per_second": 3.006, "step": 27 }, { "epoch": 5.6, "grad_norm": 11.796442031860352, "learning_rate": 2.2916666666666667e-05, "loss": 0.6564, "step": 28 }, { "epoch": 5.6, "eval_accuracy": 0.5, "eval_loss": 0.750224769115448, "eval_runtime": 1.0001, "eval_samples_per_second": 99.987, "eval_steps_per_second": 3.0, "step": 28 }, { "epoch": 5.8, "grad_norm": 10.697065353393555, "learning_rate": 2.1875e-05, "loss": 0.7397, "step": 29 }, { "epoch": 5.8, "eval_accuracy": 0.5, "eval_loss": 0.7502343654632568, "eval_runtime": 0.9996, "eval_samples_per_second": 100.035, "eval_steps_per_second": 3.001, "step": 29 }, { "epoch": 6.0, "grad_norm": 16.58921241760254, "learning_rate": 2.0833333333333336e-05, "loss": 0.641, "step": 30 }, { "epoch": 6.0, "eval_accuracy": 0.51, "eval_loss": 0.7463575601577759, "eval_runtime": 0.9995, "eval_samples_per_second": 100.051, "eval_steps_per_second": 3.002, "step": 30 }, { "epoch": 6.2, "grad_norm": 7.851564407348633, "learning_rate": 1.9791666666666665e-05, "loss": 0.6272, "step": 31 }, { "epoch": 6.2, "eval_accuracy": 0.52, "eval_loss": 0.7356445789337158, "eval_runtime": 0.9987, "eval_samples_per_second": 100.129, "eval_steps_per_second": 3.004, "step": 31 }, { "epoch": 6.4, "grad_norm": 11.272120475769043, "learning_rate": 1.8750000000000002e-05, "loss": 0.6667, "step": 32 }, { "epoch": 6.4, "eval_accuracy": 0.51, "eval_loss": 0.7220800518989563, "eval_runtime": 0.9996, "eval_samples_per_second": 100.044, "eval_steps_per_second": 3.001, "step": 32 }, { "epoch": 6.6, "grad_norm": 4.333158493041992, "learning_rate": 1.7708333333333335e-05, "loss": 0.6604, "step": 33 }, { "epoch": 6.6, "eval_accuracy": 0.5, "eval_loss": 0.7124804854393005, "eval_runtime": 0.997, "eval_samples_per_second": 100.298, "eval_steps_per_second": 3.009, "step": 33 }, { "epoch": 6.8, "grad_norm": 4.1158127784729, "learning_rate": 1.6666666666666667e-05, "loss": 0.6196, "step": 34 }, { "epoch": 6.8, "eval_accuracy": 0.51, "eval_loss": 0.7032715082168579, "eval_runtime": 0.9988, "eval_samples_per_second": 100.122, "eval_steps_per_second": 3.004, "step": 34 }, { "epoch": 7.0, "grad_norm": 12.1450834274292, "learning_rate": 1.5625e-05, "loss": 0.6995, "step": 35 }, { "epoch": 7.0, "eval_accuracy": 0.52, "eval_loss": 0.6988573670387268, "eval_runtime": 0.9973, "eval_samples_per_second": 100.27, "eval_steps_per_second": 3.008, "step": 35 }, { "epoch": 7.2, "grad_norm": 5.940031051635742, "learning_rate": 1.4583333333333335e-05, "loss": 0.6227, "step": 36 }, { "epoch": 7.2, "eval_accuracy": 0.52, "eval_loss": 0.6967969536781311, "eval_runtime": 0.9976, "eval_samples_per_second": 100.24, "eval_steps_per_second": 3.007, "step": 36 }, { "epoch": 7.4, "grad_norm": 17.160533905029297, "learning_rate": 1.3541666666666666e-05, "loss": 0.6482, "step": 37 }, { "epoch": 7.4, "eval_accuracy": 0.54, "eval_loss": 0.6968165636062622, "eval_runtime": 0.9978, "eval_samples_per_second": 100.221, "eval_steps_per_second": 3.007, "step": 37 }, { "epoch": 7.6, "grad_norm": 2.506211757659912, "learning_rate": 1.25e-05, "loss": 0.6068, "step": 38 }, { "epoch": 7.6, "eval_accuracy": 0.57, "eval_loss": 0.6994236707687378, "eval_runtime": 0.9998, "eval_samples_per_second": 100.024, "eval_steps_per_second": 3.001, "step": 38 }, { "epoch": 7.8, "grad_norm": 4.562036991119385, "learning_rate": 1.1458333333333333e-05, "loss": 0.6346, "step": 39 }, { "epoch": 7.8, "eval_accuracy": 0.59, "eval_loss": 0.6964159607887268, "eval_runtime": 0.9979, "eval_samples_per_second": 100.214, "eval_steps_per_second": 3.006, "step": 39 }, { "epoch": 8.0, "grad_norm": 19.430461883544922, "learning_rate": 1.0416666666666668e-05, "loss": 0.6444, "step": 40 }, { "epoch": 8.0, "eval_accuracy": 0.59, "eval_loss": 0.691386878490448, "eval_runtime": 0.9975, "eval_samples_per_second": 100.251, "eval_steps_per_second": 3.008, "step": 40 }, { "epoch": 8.2, "grad_norm": 12.345283508300781, "learning_rate": 9.375000000000001e-06, "loss": 0.6287, "step": 41 }, { "epoch": 8.2, "eval_accuracy": 0.6, "eval_loss": 0.6884568929672241, "eval_runtime": 0.9962, "eval_samples_per_second": 100.38, "eval_steps_per_second": 3.011, "step": 41 }, { "epoch": 8.4, "grad_norm": 14.297842025756836, "learning_rate": 8.333333333333334e-06, "loss": 0.6198, "step": 42 }, { "epoch": 8.4, "eval_accuracy": 0.58, "eval_loss": 0.6812109351158142, "eval_runtime": 0.998, "eval_samples_per_second": 100.196, "eval_steps_per_second": 3.006, "step": 42 }, { "epoch": 8.6, "grad_norm": 4.835755825042725, "learning_rate": 7.2916666666666674e-06, "loss": 0.6104, "step": 43 }, { "epoch": 8.6, "eval_accuracy": 0.59, "eval_loss": 0.6766307353973389, "eval_runtime": 1.0003, "eval_samples_per_second": 99.967, "eval_steps_per_second": 2.999, "step": 43 }, { "epoch": 8.8, "grad_norm": 7.960282325744629, "learning_rate": 6.25e-06, "loss": 0.6007, "step": 44 }, { "epoch": 8.8, "eval_accuracy": 0.6, "eval_loss": 0.6741991639137268, "eval_runtime": 0.9973, "eval_samples_per_second": 100.273, "eval_steps_per_second": 3.008, "step": 44 }, { "epoch": 9.0, "grad_norm": 6.917936325073242, "learning_rate": 5.208333333333334e-06, "loss": 0.6092, "step": 45 }, { "epoch": 9.0, "eval_accuracy": 0.61, "eval_loss": 0.6733300685882568, "eval_runtime": 0.9989, "eval_samples_per_second": 100.109, "eval_steps_per_second": 3.003, "step": 45 }, { "epoch": 9.2, "grad_norm": 2.6899542808532715, "learning_rate": 4.166666666666667e-06, "loss": 0.6177, "step": 46 }, { "epoch": 9.2, "eval_accuracy": 0.6, "eval_loss": 0.6735841631889343, "eval_runtime": 1.0001, "eval_samples_per_second": 99.991, "eval_steps_per_second": 3.0, "step": 46 }, { "epoch": 9.4, "grad_norm": 3.7043120861053467, "learning_rate": 3.125e-06, "loss": 0.6172, "step": 47 }, { "epoch": 9.4, "eval_accuracy": 0.6, "eval_loss": 0.673154354095459, "eval_runtime": 0.9971, "eval_samples_per_second": 100.293, "eval_steps_per_second": 3.009, "step": 47 }, { "epoch": 9.6, "grad_norm": 4.5208659172058105, "learning_rate": 2.0833333333333334e-06, "loss": 0.6193, "step": 48 }, { "epoch": 9.6, "eval_accuracy": 0.61, "eval_loss": 0.6734569072723389, "eval_runtime": 0.9958, "eval_samples_per_second": 100.42, "eval_steps_per_second": 3.013, "step": 48 }, { "epoch": 9.8, "grad_norm": 10.0933837890625, "learning_rate": 1.0416666666666667e-06, "loss": 0.5712, "step": 49 }, { "epoch": 9.8, "eval_accuracy": 0.61, "eval_loss": 0.6726464629173279, "eval_runtime": 0.9963, "eval_samples_per_second": 100.375, "eval_steps_per_second": 3.011, "step": 49 }, { "epoch": 10.0, "grad_norm": 3.2562851905822754, "learning_rate": 0.0, "loss": 0.5908, "step": 50 }, { "epoch": 10.0, "eval_accuracy": 0.6, "eval_loss": 0.6712987422943115, "eval_runtime": 0.9969, "eval_samples_per_second": 100.308, "eval_steps_per_second": 3.009, "step": 50 }, { "epoch": 10.0, "step": 50, "total_flos": 27858578374656.0, "train_loss": 0.7865762293338776, "train_runtime": 193.0614, "train_samples_per_second": 20.719, "train_steps_per_second": 0.259 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 27858578374656.0, "train_batch_size": 10, "trial_name": null, "trial_params": null }