{ "best_metric": 0.6989061236381531, "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-cot7b/checkpoint-1600", "epoch": 2.813805231919103, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 5.9999999999999995e-05, "loss": 2.3604, "step": 20 }, { "epoch": 0.07, "learning_rate": 0.00011999999999999999, "loss": 1.944, "step": 40 }, { "epoch": 0.11, "learning_rate": 0.00017999999999999998, "loss": 1.2746, "step": 60 }, { "epoch": 0.14, "learning_rate": 0.00023999999999999998, "loss": 0.9818, "step": 80 }, { "epoch": 0.18, "learning_rate": 0.0003, "loss": 0.863, "step": 100 }, { "epoch": 0.21, "learning_rate": 0.0002962593516209476, "loss": 0.8083, "step": 120 }, { "epoch": 0.25, "learning_rate": 0.00029251870324189524, "loss": 0.7923, "step": 140 }, { "epoch": 0.28, "learning_rate": 0.00028877805486284284, "loss": 0.7623, "step": 160 }, { "epoch": 0.32, "learning_rate": 0.0002850374064837905, "loss": 0.7665, "step": 180 }, { "epoch": 0.35, "learning_rate": 0.0002812967581047381, "loss": 0.7554, "step": 200 }, { "epoch": 0.35, "eval_loss": 0.7609586715698242, "eval_runtime": 32.1749, "eval_samples_per_second": 62.16, "eval_steps_per_second": 1.958, "step": 200 }, { "epoch": 0.39, "learning_rate": 0.00027755610972568577, "loss": 0.7353, "step": 220 }, { "epoch": 0.42, "learning_rate": 0.0002738154613466334, "loss": 0.7354, "step": 240 }, { "epoch": 0.46, "learning_rate": 0.00027007481296758103, "loss": 0.7435, "step": 260 }, { "epoch": 0.49, "learning_rate": 0.00026633416458852864, "loss": 0.7373, "step": 280 }, { "epoch": 0.53, "learning_rate": 0.0002625935162094763, "loss": 0.7332, "step": 300 }, { "epoch": 0.56, "learning_rate": 0.0002588528678304239, "loss": 0.7265, "step": 320 }, { "epoch": 0.6, "learning_rate": 0.00025511221945137156, "loss": 0.7274, "step": 340 }, { "epoch": 0.63, "learning_rate": 0.00025137157107231917, "loss": 0.7279, "step": 360 }, { "epoch": 0.67, "learning_rate": 0.00024763092269326683, "loss": 0.7231, "step": 380 }, { "epoch": 0.7, "learning_rate": 0.00024389027431421443, "loss": 0.7238, "step": 400 }, { "epoch": 0.7, "eval_loss": 0.7300755381584167, "eval_runtime": 32.1181, "eval_samples_per_second": 62.27, "eval_steps_per_second": 1.962, "step": 400 }, { "epoch": 0.74, "learning_rate": 0.00024014962593516207, "loss": 0.7161, "step": 420 }, { "epoch": 0.77, "learning_rate": 0.0002364089775561097, "loss": 0.7146, "step": 440 }, { "epoch": 0.81, "learning_rate": 0.00023266832917705733, "loss": 0.7152, "step": 460 }, { "epoch": 0.84, "learning_rate": 0.00022892768079800496, "loss": 0.7202, "step": 480 }, { "epoch": 0.88, "learning_rate": 0.0002251870324189526, "loss": 0.7142, "step": 500 }, { "epoch": 0.91, "learning_rate": 0.00022144638403990023, "loss": 0.7146, "step": 520 }, { "epoch": 0.95, "learning_rate": 0.00021770573566084786, "loss": 0.7104, "step": 540 }, { "epoch": 0.98, "learning_rate": 0.0002139650872817955, "loss": 0.7034, "step": 560 }, { "epoch": 1.02, "learning_rate": 0.00021022443890274313, "loss": 0.7153, "step": 580 }, { "epoch": 1.06, "learning_rate": 0.00020648379052369076, "loss": 0.7052, "step": 600 }, { "epoch": 1.06, "eval_loss": 0.7185753583908081, "eval_runtime": 32.4703, "eval_samples_per_second": 61.595, "eval_steps_per_second": 1.94, "step": 600 }, { "epoch": 1.09, "learning_rate": 0.0002027431421446384, "loss": 0.7061, "step": 620 }, { "epoch": 1.13, "learning_rate": 0.00019900249376558603, "loss": 0.7096, "step": 640 }, { "epoch": 1.16, "learning_rate": 0.00019526184538653366, "loss": 0.7065, "step": 660 }, { "epoch": 1.2, "learning_rate": 0.00019152119700748126, "loss": 0.7046, "step": 680 }, { "epoch": 1.23, "learning_rate": 0.0001877805486284289, "loss": 0.701, "step": 700 }, { "epoch": 1.27, "learning_rate": 0.00018403990024937653, "loss": 0.6922, "step": 720 }, { "epoch": 1.3, "learning_rate": 0.00018029925187032416, "loss": 0.6982, "step": 740 }, { "epoch": 1.34, "learning_rate": 0.0001765586034912718, "loss": 0.6993, "step": 760 }, { "epoch": 1.37, "learning_rate": 0.00017281795511221943, "loss": 0.6922, "step": 780 }, { "epoch": 1.41, "learning_rate": 0.00016907730673316706, "loss": 0.6989, "step": 800 }, { "epoch": 1.41, "eval_loss": 0.7114558219909668, "eval_runtime": 32.0158, "eval_samples_per_second": 62.469, "eval_steps_per_second": 1.968, "step": 800 }, { "epoch": 1.44, "learning_rate": 0.0001653366583541147, "loss": 0.6964, "step": 820 }, { "epoch": 1.48, "learning_rate": 0.00016159600997506232, "loss": 0.6969, "step": 840 }, { "epoch": 1.51, "learning_rate": 0.00015785536159600996, "loss": 0.6982, "step": 860 }, { "epoch": 1.55, "learning_rate": 0.0001541147132169576, "loss": 0.6977, "step": 880 }, { "epoch": 1.58, "learning_rate": 0.00015037406483790522, "loss": 0.7019, "step": 900 }, { "epoch": 1.62, "learning_rate": 0.00014663341645885285, "loss": 0.6963, "step": 920 }, { "epoch": 1.65, "learning_rate": 0.00014289276807980049, "loss": 0.7006, "step": 940 }, { "epoch": 1.69, "learning_rate": 0.00013915211970074812, "loss": 0.6935, "step": 960 }, { "epoch": 1.72, "learning_rate": 0.00013541147132169575, "loss": 0.6846, "step": 980 }, { "epoch": 1.76, "learning_rate": 0.00013167082294264338, "loss": 0.701, "step": 1000 }, { "epoch": 1.76, "eval_loss": 0.7069133520126343, "eval_runtime": 31.9943, "eval_samples_per_second": 62.511, "eval_steps_per_second": 1.969, "step": 1000 }, { "epoch": 1.79, "learning_rate": 0.00012793017456359102, "loss": 0.6896, "step": 1020 }, { "epoch": 1.83, "learning_rate": 0.00012418952618453862, "loss": 0.702, "step": 1040 }, { "epoch": 1.86, "learning_rate": 0.00012044887780548627, "loss": 0.6952, "step": 1060 }, { "epoch": 1.9, "learning_rate": 0.0001167082294264339, "loss": 0.6902, "step": 1080 }, { "epoch": 1.93, "learning_rate": 0.00011296758104738153, "loss": 0.6866, "step": 1100 }, { "epoch": 1.97, "learning_rate": 0.00010922693266832918, "loss": 0.6929, "step": 1120 }, { "epoch": 2.0, "learning_rate": 0.00010548628428927681, "loss": 0.6846, "step": 1140 }, { "epoch": 2.04, "learning_rate": 0.00010174563591022444, "loss": 0.6944, "step": 1160 }, { "epoch": 2.08, "learning_rate": 9.800498753117206e-05, "loss": 0.6868, "step": 1180 }, { "epoch": 2.11, "learning_rate": 9.42643391521197e-05, "loss": 0.6938, "step": 1200 }, { "epoch": 2.11, "eval_loss": 0.7033773064613342, "eval_runtime": 32.1557, "eval_samples_per_second": 62.197, "eval_steps_per_second": 1.959, "step": 1200 }, { "epoch": 2.15, "learning_rate": 9.052369077306733e-05, "loss": 0.6868, "step": 1220 }, { "epoch": 2.18, "learning_rate": 8.678304239401496e-05, "loss": 0.6795, "step": 1240 }, { "epoch": 2.22, "learning_rate": 8.304239401496259e-05, "loss": 0.6887, "step": 1260 }, { "epoch": 2.25, "learning_rate": 7.930174563591023e-05, "loss": 0.6795, "step": 1280 }, { "epoch": 2.29, "learning_rate": 7.556109725685786e-05, "loss": 0.6934, "step": 1300 }, { "epoch": 2.32, "learning_rate": 7.182044887780548e-05, "loss": 0.6905, "step": 1320 }, { "epoch": 2.36, "learning_rate": 6.807980049875311e-05, "loss": 0.685, "step": 1340 }, { "epoch": 2.39, "learning_rate": 6.433915211970074e-05, "loss": 0.6887, "step": 1360 }, { "epoch": 2.43, "learning_rate": 6.0598503740648375e-05, "loss": 0.6875, "step": 1380 }, { "epoch": 2.46, "learning_rate": 5.6857855361596e-05, "loss": 0.6807, "step": 1400 }, { "epoch": 2.46, "eval_loss": 0.7009322643280029, "eval_runtime": 32.0972, "eval_samples_per_second": 62.311, "eval_steps_per_second": 1.963, "step": 1400 }, { "epoch": 2.5, "learning_rate": 5.311720698254363e-05, "loss": 0.6743, "step": 1420 }, { "epoch": 2.53, "learning_rate": 4.9376558603491265e-05, "loss": 0.6872, "step": 1440 }, { "epoch": 2.57, "learning_rate": 4.56359102244389e-05, "loss": 0.6776, "step": 1460 }, { "epoch": 2.6, "learning_rate": 4.1895261845386524e-05, "loss": 0.6769, "step": 1480 }, { "epoch": 2.64, "learning_rate": 3.8154613466334156e-05, "loss": 0.68, "step": 1500 }, { "epoch": 2.67, "learning_rate": 3.4413965087281796e-05, "loss": 0.6804, "step": 1520 }, { "epoch": 2.71, "learning_rate": 3.067331670822942e-05, "loss": 0.6848, "step": 1540 }, { "epoch": 2.74, "learning_rate": 2.6932668329177054e-05, "loss": 0.6825, "step": 1560 }, { "epoch": 2.78, "learning_rate": 2.3192019950124686e-05, "loss": 0.6899, "step": 1580 }, { "epoch": 2.81, "learning_rate": 1.945137157107232e-05, "loss": 0.6775, "step": 1600 }, { "epoch": 2.81, "eval_loss": 0.6989061236381531, "eval_runtime": 32.1279, "eval_samples_per_second": 62.251, "eval_steps_per_second": 1.961, "step": 1600 } ], "max_steps": 1704, "num_train_epochs": 3, "total_flos": 4.159132701791617e+18, "trial_name": null, "trial_params": null }