{ "best_metric": 3.715158224105835, "best_model_checkpoint": "runs/checkpoint-1000", "epoch": 4.587155963302752, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "learning_rate": 6.666666666666667e-06, "loss": 6.9799, "step": 20 }, { "epoch": 0.18, "learning_rate": 1.3333333333333333e-05, "loss": 6.7268, "step": 40 }, { "epoch": 0.28, "learning_rate": 2e-05, "loss": 6.4767, "step": 60 }, { "epoch": 0.37, "learning_rate": 2.6666666666666667e-05, "loss": 6.3317, "step": 80 }, { "epoch": 0.46, "learning_rate": 3.3333333333333335e-05, "loss": 6.2151, "step": 100 }, { "epoch": 0.46, "eval_accuracy": 0.010721727291912753, "eval_loss": 6.165957927703857, "eval_runtime": 954.4032, "eval_samples_per_second": 3.09, "eval_steps_per_second": 0.065, "step": 100 }, { "epoch": 0.55, "learning_rate": 4e-05, "loss": 6.13, "step": 120 }, { "epoch": 0.64, "learning_rate": 4.666666666666667e-05, "loss": 6.0514, "step": 140 }, { "epoch": 0.73, "learning_rate": 5.333333333333333e-05, "loss": 5.9384, "step": 160 }, { "epoch": 0.83, "learning_rate": 6e-05, "loss": 5.679, "step": 180 }, { "epoch": 0.92, "learning_rate": 6.666666666666667e-05, "loss": 5.2673, "step": 200 }, { "epoch": 0.92, "eval_accuracy": 0.0003641545871791163, "eval_loss": 4.998520851135254, "eval_runtime": 3395.6647, "eval_samples_per_second": 0.868, "eval_steps_per_second": 0.018, "step": 200 }, { "epoch": 1.01, "learning_rate": 7.333333333333333e-05, "loss": 4.8258, "step": 220 }, { "epoch": 1.1, "learning_rate": 8e-05, "loss": 4.6404, "step": 240 }, { "epoch": 1.19, "learning_rate": 8.666666666666667e-05, "loss": 4.5153, "step": 260 }, { "epoch": 1.28, "learning_rate": 9.333333333333334e-05, "loss": 4.4108, "step": 280 }, { "epoch": 1.38, "learning_rate": 0.0001, "loss": 4.3584, "step": 300 }, { "epoch": 1.38, "eval_accuracy": 2.3712391723291295e-06, "eval_loss": 4.306852340698242, "eval_runtime": 3914.0936, "eval_samples_per_second": 0.753, "eval_steps_per_second": 0.016, "step": 300 }, { "epoch": 1.47, "learning_rate": 9.979871469976196e-05, "loss": 4.2798, "step": 320 }, { "epoch": 1.56, "learning_rate": 9.919647942993148e-05, "loss": 4.2084, "step": 340 }, { "epoch": 1.65, "learning_rate": 9.819814303479267e-05, "loss": 4.173, "step": 360 }, { "epoch": 1.74, "learning_rate": 9.681174353198687e-05, "loss": 4.1324, "step": 380 }, { "epoch": 1.83, "learning_rate": 9.504844339512095e-05, "loss": 4.0826, "step": 400 }, { "epoch": 1.83, "eval_accuracy": 8.299337103151952e-05, "eval_loss": 4.064357280731201, "eval_runtime": 625.6747, "eval_samples_per_second": 4.713, "eval_steps_per_second": 0.099, "step": 400 }, { "epoch": 1.93, "learning_rate": 9.292243968009331e-05, "loss": 4.0714, "step": 420 }, { "epoch": 2.02, "learning_rate": 9.045084971874738e-05, "loss": 4.0267, "step": 440 }, { "epoch": 2.11, "learning_rate": 8.765357330018056e-05, "loss": 3.9907, "step": 460 }, { "epoch": 2.2, "learning_rate": 8.455313244934324e-05, "loss": 3.9733, "step": 480 }, { "epoch": 2.29, "learning_rate": 8.117449009293668e-05, "loss": 3.9506, "step": 500 }, { "epoch": 2.29, "eval_accuracy": 0.000247963867734989, "eval_loss": 3.9375269412994385, "eval_runtime": 646.8101, "eval_samples_per_second": 4.559, "eval_steps_per_second": 0.096, "step": 500 }, { "epoch": 2.39, "learning_rate": 7.754484907260513e-05, "loss": 3.9255, "step": 520 }, { "epoch": 2.48, "learning_rate": 7.369343312364993e-05, "loss": 3.9064, "step": 540 }, { "epoch": 2.57, "learning_rate": 6.965125158269619e-05, "loss": 3.8806, "step": 560 }, { "epoch": 2.66, "learning_rate": 6.545084971874738e-05, "loss": 3.89, "step": 580 }, { "epoch": 2.75, "learning_rate": 6.112604669781572e-05, "loss": 3.8576, "step": 600 }, { "epoch": 2.75, "eval_accuracy": 0.0002405114017648117, "eval_loss": 3.8471720218658447, "eval_runtime": 641.884, "eval_samples_per_second": 4.594, "eval_steps_per_second": 0.097, "step": 600 }, { "epoch": 2.84, "learning_rate": 5.6711663290882776e-05, "loss": 3.8314, "step": 620 }, { "epoch": 2.94, "learning_rate": 5.2243241517525754e-05, "loss": 3.8356, "step": 640 }, { "epoch": 3.03, "learning_rate": 4.775675848247427e-05, "loss": 3.8056, "step": 660 }, { "epoch": 3.12, "learning_rate": 4.328833670911724e-05, "loss": 3.7761, "step": 680 }, { "epoch": 3.21, "learning_rate": 3.887395330218429e-05, "loss": 3.7937, "step": 700 }, { "epoch": 3.21, "eval_accuracy": 0.00025236759762645736, "eval_loss": 3.7817437648773193, "eval_runtime": 1546.8275, "eval_samples_per_second": 1.906, "eval_steps_per_second": 0.04, "step": 700 }, { "epoch": 3.3, "learning_rate": 3.4549150281252636e-05, "loss": 3.7696, "step": 720 }, { "epoch": 3.39, "learning_rate": 3.0348748417303823e-05, "loss": 3.7522, "step": 740 }, { "epoch": 3.49, "learning_rate": 2.630656687635007e-05, "loss": 3.743, "step": 760 }, { "epoch": 3.58, "learning_rate": 2.245515092739488e-05, "loss": 3.7389, "step": 780 }, { "epoch": 3.67, "learning_rate": 1.8825509907063327e-05, "loss": 3.7287, "step": 800 }, { "epoch": 3.67, "eval_accuracy": 0.00031435856456020456, "eval_loss": 3.7385406494140625, "eval_runtime": 664.5264, "eval_samples_per_second": 4.438, "eval_steps_per_second": 0.093, "step": 800 }, { "epoch": 3.76, "learning_rate": 1.544686755065677e-05, "loss": 3.7229, "step": 820 }, { "epoch": 3.85, "learning_rate": 1.2346426699819458e-05, "loss": 3.7138, "step": 840 }, { "epoch": 3.94, "learning_rate": 9.549150281252633e-06, "loss": 3.7314, "step": 860 }, { "epoch": 4.04, "learning_rate": 7.077560319906695e-06, "loss": 3.7059, "step": 880 }, { "epoch": 4.13, "learning_rate": 4.951556604879048e-06, "loss": 3.7205, "step": 900 }, { "epoch": 4.13, "eval_accuracy": 0.0003251985150622806, "eval_loss": 3.7189857959747314, "eval_runtime": 713.2391, "eval_samples_per_second": 4.135, "eval_steps_per_second": 0.087, "step": 900 }, { "epoch": 4.22, "learning_rate": 3.18825646801314e-06, "loss": 3.71, "step": 920 }, { "epoch": 4.31, "learning_rate": 1.8018569652073381e-06, "loss": 3.6848, "step": 940 }, { "epoch": 4.4, "learning_rate": 8.035205700685167e-07, "loss": 3.6929, "step": 960 }, { "epoch": 4.5, "learning_rate": 2.012853002380466e-07, "loss": 3.7022, "step": 980 }, { "epoch": 4.59, "learning_rate": 0.0, "loss": 3.6858, "step": 1000 }, { "epoch": 4.59, "eval_accuracy": 0.00034552342225367314, "eval_loss": 3.715158224105835, "eval_runtime": 606.7664, "eval_samples_per_second": 4.86, "eval_steps_per_second": 0.102, "step": 1000 } ], "logging_steps": 20, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "total_flos": 9439661425655808.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }