{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "global_step": 43890, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 0.0002965823650034176, "loss": 3.7122, "step": 500 }, { "epoch": 0.16, "learning_rate": 0.00029316473000683526, "loss": 1.7581, "step": 1000 }, { "epoch": 0.24, "learning_rate": 0.0002897470950102529, "loss": 1.5341, "step": 1500 }, { "epoch": 0.32, "learning_rate": 0.00028632946001367054, "loss": 1.4748, "step": 2000 }, { "epoch": 0.4, "learning_rate": 0.0002829118250170881, "loss": 1.3464, "step": 2500 }, { "epoch": 0.48, "learning_rate": 0.0002794941900205058, "loss": 1.2892, "step": 3000 }, { "epoch": 0.56, "learning_rate": 0.0002760765550239234, "loss": 1.2375, "step": 3500 }, { "epoch": 0.64, "learning_rate": 0.00027265892002734105, "loss": 1.2408, "step": 4000 }, { "epoch": 0.72, "learning_rate": 0.0002692412850307587, "loss": 1.1577, "step": 4500 }, { "epoch": 0.8, "learning_rate": 0.0002658236500341763, "loss": 1.1164, "step": 5000 }, { "epoch": 0.8, "eval_gen_len": 4.541, "eval_loss": 0.8244166970252991, "eval_rouge1": 66.4678, "eval_rouge2": 35.3554, "eval_rougeL": 66.4543, "eval_rougeLsum": 66.4522, "eval_runtime": 109.1952, "eval_samples_per_second": 36.632, "eval_steps_per_second": 4.579, "step": 5000 }, { "epoch": 0.88, "learning_rate": 0.00026240601503759397, "loss": 1.1345, "step": 5500 }, { "epoch": 0.96, "learning_rate": 0.0002589883800410116, "loss": 1.1116, "step": 6000 }, { "epoch": 1.04, "learning_rate": 0.00025557074504442925, "loss": 1.0336, "step": 6500 }, { "epoch": 1.12, "learning_rate": 0.0002521531100478469, "loss": 0.9108, "step": 7000 }, { "epoch": 1.2, "learning_rate": 0.0002487354750512645, "loss": 0.92, "step": 7500 }, { "epoch": 1.28, "learning_rate": 0.0002453178400546821, "loss": 0.9071, "step": 8000 }, { "epoch": 1.36, "learning_rate": 0.00024190020505809978, "loss": 0.8936, "step": 8500 }, { "epoch": 1.44, "learning_rate": 0.0002384825700615174, "loss": 0.888, "step": 9000 }, { "epoch": 1.52, "learning_rate": 0.00023506493506493504, "loss": 0.8847, "step": 9500 }, { "epoch": 1.59, "learning_rate": 0.00023164730006835268, "loss": 0.9097, "step": 10000 }, { "epoch": 1.59, "eval_gen_len": 4.5548, "eval_loss": 0.7299422025680542, "eval_rouge1": 70.0574, "eval_rouge2": 37.5535, "eval_rougeL": 69.9512, "eval_rougeLsum": 70.0084, "eval_runtime": 106.5141, "eval_samples_per_second": 37.554, "eval_steps_per_second": 4.694, "step": 10000 }, { "epoch": 1.67, "learning_rate": 0.00022822966507177032, "loss": 0.8895, "step": 10500 }, { "epoch": 1.75, "learning_rate": 0.00022481203007518796, "loss": 0.8544, "step": 11000 }, { "epoch": 1.83, "learning_rate": 0.0002213943950786056, "loss": 0.8807, "step": 11500 }, { "epoch": 1.91, "learning_rate": 0.00021797676008202322, "loss": 0.8451, "step": 12000 }, { "epoch": 1.99, "learning_rate": 0.00021455912508544086, "loss": 0.82, "step": 12500 }, { "epoch": 2.07, "learning_rate": 0.00021114149008885847, "loss": 0.6878, "step": 13000 }, { "epoch": 2.15, "learning_rate": 0.00020772385509227614, "loss": 0.6759, "step": 13500 }, { "epoch": 2.23, "learning_rate": 0.00020430622009569378, "loss": 0.6998, "step": 14000 }, { "epoch": 2.31, "learning_rate": 0.0002008885850991114, "loss": 0.6751, "step": 14500 }, { "epoch": 2.39, "learning_rate": 0.00019747095010252903, "loss": 0.6637, "step": 15000 }, { "epoch": 2.39, "eval_gen_len": 4.703, "eval_loss": 0.7314157485961914, "eval_rouge1": 72.0767, "eval_rouge2": 39.2263, "eval_rougeL": 72.0257, "eval_rougeLsum": 72.0473, "eval_runtime": 110.4087, "eval_samples_per_second": 36.229, "eval_steps_per_second": 4.529, "step": 15000 }, { "epoch": 2.47, "learning_rate": 0.00019405331510594667, "loss": 0.6698, "step": 15500 }, { "epoch": 2.55, "learning_rate": 0.0001906356801093643, "loss": 0.672, "step": 16000 }, { "epoch": 2.63, "learning_rate": 0.00018721804511278195, "loss": 0.6431, "step": 16500 }, { "epoch": 2.71, "learning_rate": 0.00018380041011619957, "loss": 0.6653, "step": 17000 }, { "epoch": 2.79, "learning_rate": 0.0001803827751196172, "loss": 0.6824, "step": 17500 }, { "epoch": 2.87, "learning_rate": 0.00017696514012303485, "loss": 0.6668, "step": 18000 }, { "epoch": 2.95, "learning_rate": 0.00017354750512645246, "loss": 0.6318, "step": 18500 }, { "epoch": 3.03, "learning_rate": 0.0001701298701298701, "loss": 0.5934, "step": 19000 }, { "epoch": 3.11, "learning_rate": 0.00016671223513328777, "loss": 0.5086, "step": 19500 }, { "epoch": 3.19, "learning_rate": 0.00016329460013670539, "loss": 0.5015, "step": 20000 }, { "epoch": 3.19, "eval_gen_len": 4.75, "eval_loss": 0.7147404551506042, "eval_rouge1": 73.0185, "eval_rouge2": 39.9998, "eval_rougeL": 72.9347, "eval_rougeLsum": 72.9576, "eval_runtime": 106.9579, "eval_samples_per_second": 37.398, "eval_steps_per_second": 4.675, "step": 20000 }, { "epoch": 3.27, "learning_rate": 0.00015987696514012303, "loss": 0.4909, "step": 20500 }, { "epoch": 3.35, "learning_rate": 0.00015645933014354064, "loss": 0.5114, "step": 21000 }, { "epoch": 3.43, "learning_rate": 0.00015304169514695828, "loss": 0.5314, "step": 21500 }, { "epoch": 3.51, "learning_rate": 0.00014962406015037592, "loss": 0.5089, "step": 22000 }, { "epoch": 3.59, "learning_rate": 0.00014620642515379356, "loss": 0.5133, "step": 22500 }, { "epoch": 3.67, "learning_rate": 0.0001427887901572112, "loss": 0.5057, "step": 23000 }, { "epoch": 3.75, "learning_rate": 0.00013937115516062882, "loss": 0.5181, "step": 23500 }, { "epoch": 3.83, "learning_rate": 0.00013595352016404648, "loss": 0.4826, "step": 24000 }, { "epoch": 3.91, "learning_rate": 0.0001325358851674641, "loss": 0.497, "step": 24500 }, { "epoch": 3.99, "learning_rate": 0.00012911825017088174, "loss": 0.5101, "step": 25000 }, { "epoch": 3.99, "eval_gen_len": 4.8728, "eval_loss": 0.7054756283760071, "eval_rouge1": 73.7898, "eval_rouge2": 40.5481, "eval_rougeL": 73.7235, "eval_rougeLsum": 73.7901, "eval_runtime": 110.0456, "eval_samples_per_second": 36.349, "eval_steps_per_second": 4.544, "step": 25000 }, { "epoch": 4.07, "learning_rate": 0.00012570061517429938, "loss": 0.3961, "step": 25500 }, { "epoch": 4.15, "learning_rate": 0.00012228298017771702, "loss": 0.3725, "step": 26000 }, { "epoch": 4.23, "learning_rate": 0.00011886534518113465, "loss": 0.3698, "step": 26500 }, { "epoch": 4.31, "learning_rate": 0.00011544771018455227, "loss": 0.3946, "step": 27000 }, { "epoch": 4.39, "learning_rate": 0.00011203007518796991, "loss": 0.4009, "step": 27500 }, { "epoch": 4.47, "learning_rate": 0.00010861244019138756, "loss": 0.391, "step": 28000 }, { "epoch": 4.55, "learning_rate": 0.00010519480519480518, "loss": 0.3787, "step": 28500 }, { "epoch": 4.63, "learning_rate": 0.00010177717019822282, "loss": 0.3736, "step": 29000 }, { "epoch": 4.7, "learning_rate": 9.835953520164045e-05, "loss": 0.3842, "step": 29500 }, { "epoch": 4.78, "learning_rate": 9.494190020505809e-05, "loss": 0.3903, "step": 30000 }, { "epoch": 4.78, "eval_gen_len": 4.5938, "eval_loss": 0.7442232370376587, "eval_rouge1": 74.0845, "eval_rouge2": 39.9841, "eval_rougeL": 74.0172, "eval_rougeLsum": 74.0635, "eval_runtime": 110.762, "eval_samples_per_second": 36.113, "eval_steps_per_second": 4.514, "step": 30000 }, { "epoch": 4.86, "learning_rate": 9.152426520847573e-05, "loss": 0.3945, "step": 30500 }, { "epoch": 4.94, "learning_rate": 8.810663021189336e-05, "loss": 0.4128, "step": 31000 }, { "epoch": 5.02, "learning_rate": 8.468899521531099e-05, "loss": 0.363, "step": 31500 }, { "epoch": 5.1, "learning_rate": 8.127136021872864e-05, "loss": 0.3144, "step": 32000 }, { "epoch": 5.18, "learning_rate": 7.785372522214627e-05, "loss": 0.3106, "step": 32500 }, { "epoch": 5.26, "learning_rate": 7.44360902255639e-05, "loss": 0.2982, "step": 33000 }, { "epoch": 5.34, "learning_rate": 7.101845522898154e-05, "loss": 0.3016, "step": 33500 }, { "epoch": 5.42, "learning_rate": 6.760082023239918e-05, "loss": 0.3095, "step": 34000 }, { "epoch": 5.5, "learning_rate": 6.41831852358168e-05, "loss": 0.2863, "step": 34500 }, { "epoch": 5.58, "learning_rate": 6.076555023923445e-05, "loss": 0.2993, "step": 35000 }, { "epoch": 5.58, "eval_gen_len": 4.7412, "eval_loss": 0.8183711171150208, "eval_rouge1": 73.8405, "eval_rouge2": 40.2569, "eval_rougeL": 73.7756, "eval_rougeLsum": 73.7972, "eval_runtime": 109.7934, "eval_samples_per_second": 36.432, "eval_steps_per_second": 4.554, "step": 35000 }, { "epoch": 5.66, "learning_rate": 5.734791524265208e-05, "loss": 0.2975, "step": 35500 }, { "epoch": 5.74, "learning_rate": 5.393028024606972e-05, "loss": 0.3014, "step": 36000 }, { "epoch": 5.82, "learning_rate": 5.051264524948735e-05, "loss": 0.2996, "step": 36500 }, { "epoch": 5.9, "learning_rate": 4.7095010252904986e-05, "loss": 0.3068, "step": 37000 }, { "epoch": 5.98, "learning_rate": 4.367737525632262e-05, "loss": 0.2993, "step": 37500 }, { "epoch": 6.06, "learning_rate": 4.025974025974026e-05, "loss": 0.2447, "step": 38000 }, { "epoch": 6.14, "learning_rate": 3.684210526315789e-05, "loss": 0.2379, "step": 38500 }, { "epoch": 6.22, "learning_rate": 3.342447026657553e-05, "loss": 0.2447, "step": 39000 }, { "epoch": 6.3, "learning_rate": 3.0006835269993163e-05, "loss": 0.2452, "step": 39500 }, { "epoch": 6.38, "learning_rate": 2.65892002734108e-05, "loss": 0.2227, "step": 40000 }, { "epoch": 6.38, "eval_gen_len": 4.742, "eval_loss": 0.8277584910392761, "eval_rouge1": 74.0159, "eval_rouge2": 40.6403, "eval_rougeL": 73.9412, "eval_rougeLsum": 73.9722, "eval_runtime": 108.0867, "eval_samples_per_second": 37.007, "eval_steps_per_second": 4.626, "step": 40000 }, { "epoch": 6.46, "learning_rate": 2.3171565276828434e-05, "loss": 0.2331, "step": 40500 }, { "epoch": 6.54, "learning_rate": 1.9753930280246068e-05, "loss": 0.2374, "step": 41000 }, { "epoch": 6.62, "learning_rate": 1.6336295283663705e-05, "loss": 0.2462, "step": 41500 }, { "epoch": 6.7, "learning_rate": 1.2918660287081339e-05, "loss": 0.24, "step": 42000 }, { "epoch": 6.78, "learning_rate": 9.501025290498975e-06, "loss": 0.2217, "step": 42500 }, { "epoch": 6.86, "learning_rate": 6.083390293916609e-06, "loss": 0.2283, "step": 43000 }, { "epoch": 6.94, "learning_rate": 2.6657552973342446e-06, "loss": 0.2345, "step": 43500 }, { "epoch": 7.0, "step": 43890, "total_flos": 8.113234147780608e+16, "train_loss": 0.639304427944139, "train_runtime": 13849.1505, "train_samples_per_second": 25.353, "train_steps_per_second": 3.169 } ], "max_steps": 43890, "num_train_epochs": 7, "total_flos": 8.113234147780608e+16, "trial_name": null, "trial_params": null }