{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "global_step": 143540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 0.00029895499512331056, "loss": 4.0695, "step": 500 }, { "epoch": 0.07, "learning_rate": 0.00029790999024662115, "loss": 3.433, "step": 1000 }, { "epoch": 0.1, "learning_rate": 0.0002968649853699317, "loss": 3.198, "step": 1500 }, { "epoch": 0.14, "learning_rate": 0.00029581998049324227, "loss": 3.1807, "step": 2000 }, { "epoch": 0.17, "learning_rate": 0.00029477497561655286, "loss": 3.0421, "step": 2500 }, { "epoch": 0.21, "learning_rate": 0.00029372997073986344, "loss": 3.0883, "step": 3000 }, { "epoch": 0.24, "learning_rate": 0.00029268496586317403, "loss": 3.0714, "step": 3500 }, { "epoch": 0.28, "learning_rate": 0.00029163996098648457, "loss": 2.979, "step": 4000 }, { "epoch": 0.31, "learning_rate": 0.00029059495610979515, "loss": 2.9596, "step": 4500 }, { "epoch": 0.35, "learning_rate": 0.00028954995123310574, "loss": 2.9377, "step": 5000 }, { "epoch": 0.35, "eval_gen_len": 12.8717, "eval_loss": 2.5157084465026855, "eval_rouge1": 54.6148, "eval_rouge2": 35.1518, "eval_rougeL": 51.8908, "eval_rougeLsum": 51.8957, "eval_runtime": 121.4402, "eval_samples_per_second": 39.929, "eval_steps_per_second": 4.998, "step": 5000 }, { "epoch": 0.38, "learning_rate": 0.0002885049463564163, "loss": 2.9158, "step": 5500 }, { "epoch": 0.42, "learning_rate": 0.0002874599414797269, "loss": 2.9109, "step": 6000 }, { "epoch": 0.45, "learning_rate": 0.00028641493660303745, "loss": 2.857, "step": 6500 }, { "epoch": 0.49, "learning_rate": 0.00028536993172634804, "loss": 2.9129, "step": 7000 }, { "epoch": 0.52, "learning_rate": 0.0002843249268496586, "loss": 2.8741, "step": 7500 }, { "epoch": 0.56, "learning_rate": 0.00028327992197296916, "loss": 2.8337, "step": 8000 }, { "epoch": 0.59, "learning_rate": 0.00028223491709627975, "loss": 2.7539, "step": 8500 }, { "epoch": 0.63, "learning_rate": 0.00028118991221959033, "loss": 2.8348, "step": 9000 }, { "epoch": 0.66, "learning_rate": 0.0002801449073429009, "loss": 2.8118, "step": 9500 }, { "epoch": 0.7, "learning_rate": 0.0002790999024662115, "loss": 2.803, "step": 10000 }, { "epoch": 0.7, "eval_gen_len": 12.7513, "eval_loss": 2.4086406230926514, "eval_rouge1": 55.641, "eval_rouge2": 36.1214, "eval_rougeL": 52.8683, "eval_rougeLsum": 52.8572, "eval_runtime": 112.31, "eval_samples_per_second": 43.175, "eval_steps_per_second": 5.405, "step": 10000 }, { "epoch": 0.73, "learning_rate": 0.00027805489758952204, "loss": 2.7639, "step": 10500 }, { "epoch": 0.77, "learning_rate": 0.00027700989271283263, "loss": 2.7668, "step": 11000 }, { "epoch": 0.8, "learning_rate": 0.0002759648878361432, "loss": 2.7692, "step": 11500 }, { "epoch": 0.84, "learning_rate": 0.0002749198829594538, "loss": 2.7589, "step": 12000 }, { "epoch": 0.87, "learning_rate": 0.00027387487808276434, "loss": 2.7104, "step": 12500 }, { "epoch": 0.91, "learning_rate": 0.0002728298732060749, "loss": 2.7085, "step": 13000 }, { "epoch": 0.94, "learning_rate": 0.0002717848683293855, "loss": 2.734, "step": 13500 }, { "epoch": 0.98, "learning_rate": 0.0002707398634526961, "loss": 2.7221, "step": 14000 }, { "epoch": 1.01, "learning_rate": 0.0002696948585760067, "loss": 2.6748, "step": 14500 }, { "epoch": 1.05, "learning_rate": 0.0002686498536993172, "loss": 2.5483, "step": 15000 }, { "epoch": 1.05, "eval_gen_len": 12.7754, "eval_loss": 2.341980218887329, "eval_rouge1": 55.6604, "eval_rouge2": 36.0085, "eval_rougeL": 52.9599, "eval_rougeLsum": 52.9433, "eval_runtime": 111.7054, "eval_samples_per_second": 43.409, "eval_steps_per_second": 5.434, "step": 15000 }, { "epoch": 1.08, "learning_rate": 0.0002676048488226278, "loss": 2.5811, "step": 15500 }, { "epoch": 1.11, "learning_rate": 0.0002665598439459384, "loss": 2.5846, "step": 16000 }, { "epoch": 1.15, "learning_rate": 0.00026551483906924893, "loss": 2.543, "step": 16500 }, { "epoch": 1.18, "learning_rate": 0.00026446983419255957, "loss": 2.5682, "step": 17000 }, { "epoch": 1.22, "learning_rate": 0.0002634248293158701, "loss": 2.5234, "step": 17500 }, { "epoch": 1.25, "learning_rate": 0.0002623798244391807, "loss": 2.5756, "step": 18000 }, { "epoch": 1.29, "learning_rate": 0.0002613348195624913, "loss": 2.5471, "step": 18500 }, { "epoch": 1.32, "learning_rate": 0.0002602898146858018, "loss": 2.5657, "step": 19000 }, { "epoch": 1.36, "learning_rate": 0.00025924480980911245, "loss": 2.536, "step": 19500 }, { "epoch": 1.39, "learning_rate": 0.000258199804932423, "loss": 2.4978, "step": 20000 }, { "epoch": 1.39, "eval_gen_len": 12.8804, "eval_loss": 2.314547061920166, "eval_rouge1": 56.204, "eval_rouge2": 36.5896, "eval_rougeL": 53.338, "eval_rougeLsum": 53.3351, "eval_runtime": 109.7008, "eval_samples_per_second": 44.202, "eval_steps_per_second": 5.533, "step": 20000 }, { "epoch": 1.43, "learning_rate": 0.0002571548000557336, "loss": 2.5415, "step": 20500 }, { "epoch": 1.46, "learning_rate": 0.00025610979517904416, "loss": 2.5634, "step": 21000 }, { "epoch": 1.5, "learning_rate": 0.0002550647903023547, "loss": 2.5392, "step": 21500 }, { "epoch": 1.53, "learning_rate": 0.00025401978542566534, "loss": 2.567, "step": 22000 }, { "epoch": 1.57, "learning_rate": 0.00025297478054897587, "loss": 2.5332, "step": 22500 }, { "epoch": 1.6, "learning_rate": 0.00025192977567228646, "loss": 2.5615, "step": 23000 }, { "epoch": 1.64, "learning_rate": 0.00025088477079559705, "loss": 2.5275, "step": 23500 }, { "epoch": 1.67, "learning_rate": 0.0002498397659189076, "loss": 2.5342, "step": 24000 }, { "epoch": 1.71, "learning_rate": 0.00024879476104221817, "loss": 2.5248, "step": 24500 }, { "epoch": 1.74, "learning_rate": 0.00024774975616552875, "loss": 2.5383, "step": 25000 }, { "epoch": 1.74, "eval_gen_len": 12.795, "eval_loss": 2.2696738243103027, "eval_rouge1": 56.1356, "eval_rouge2": 36.6963, "eval_rougeL": 53.3579, "eval_rougeLsum": 53.3664, "eval_runtime": 112.8783, "eval_samples_per_second": 42.958, "eval_steps_per_second": 5.377, "step": 25000 }, { "epoch": 1.78, "learning_rate": 0.00024670475128883934, "loss": 2.4931, "step": 25500 }, { "epoch": 1.81, "learning_rate": 0.00024565974641214993, "loss": 2.4933, "step": 26000 }, { "epoch": 1.85, "learning_rate": 0.00024461474153546046, "loss": 2.5195, "step": 26500 }, { "epoch": 1.88, "learning_rate": 0.00024356973665877105, "loss": 2.5158, "step": 27000 }, { "epoch": 1.92, "learning_rate": 0.00024252473178208164, "loss": 2.5311, "step": 27500 }, { "epoch": 1.95, "learning_rate": 0.0002414797269053922, "loss": 2.5037, "step": 28000 }, { "epoch": 1.99, "learning_rate": 0.00024043472202870276, "loss": 2.5134, "step": 28500 }, { "epoch": 2.02, "learning_rate": 0.00023938971715201337, "loss": 2.4096, "step": 29000 }, { "epoch": 2.06, "learning_rate": 0.00023834471227532393, "loss": 2.3211, "step": 29500 }, { "epoch": 2.09, "learning_rate": 0.00023729970739863452, "loss": 2.3368, "step": 30000 }, { "epoch": 2.09, "eval_gen_len": 12.7478, "eval_loss": 2.260253667831421, "eval_rouge1": 56.0271, "eval_rouge2": 36.4249, "eval_rougeL": 53.3113, "eval_rougeLsum": 53.3272, "eval_runtime": 110.9671, "eval_samples_per_second": 43.698, "eval_steps_per_second": 5.47, "step": 30000 }, { "epoch": 2.12, "learning_rate": 0.00023625470252194508, "loss": 2.3855, "step": 30500 }, { "epoch": 2.16, "learning_rate": 0.00023520969764525564, "loss": 2.3562, "step": 31000 }, { "epoch": 2.19, "learning_rate": 0.00023416469276856623, "loss": 2.345, "step": 31500 }, { "epoch": 2.23, "learning_rate": 0.00023311968789187682, "loss": 2.371, "step": 32000 }, { "epoch": 2.26, "learning_rate": 0.00023207468301518738, "loss": 2.3653, "step": 32500 }, { "epoch": 2.3, "learning_rate": 0.00023102967813849797, "loss": 2.3069, "step": 33000 }, { "epoch": 2.33, "learning_rate": 0.00022998467326180853, "loss": 2.3499, "step": 33500 }, { "epoch": 2.37, "learning_rate": 0.0002289396683851191, "loss": 2.3479, "step": 34000 }, { "epoch": 2.4, "learning_rate": 0.00022789466350842967, "loss": 2.3473, "step": 34500 }, { "epoch": 2.44, "learning_rate": 0.00022684965863174026, "loss": 2.371, "step": 35000 }, { "epoch": 2.44, "eval_gen_len": 12.8243, "eval_loss": 2.2327780723571777, "eval_rouge1": 56.5041, "eval_rouge2": 36.8718, "eval_rougeL": 53.8064, "eval_rougeLsum": 53.7995, "eval_runtime": 110.8303, "eval_samples_per_second": 43.752, "eval_steps_per_second": 5.477, "step": 35000 }, { "epoch": 2.47, "learning_rate": 0.00022580465375505085, "loss": 2.3728, "step": 35500 }, { "epoch": 2.51, "learning_rate": 0.0002247596488783614, "loss": 2.3489, "step": 36000 }, { "epoch": 2.54, "learning_rate": 0.00022371464400167197, "loss": 2.4012, "step": 36500 }, { "epoch": 2.58, "learning_rate": 0.00022266963912498256, "loss": 2.3648, "step": 37000 }, { "epoch": 2.61, "learning_rate": 0.00022162463424829312, "loss": 2.3887, "step": 37500 }, { "epoch": 2.65, "learning_rate": 0.00022057962937160373, "loss": 2.3375, "step": 38000 }, { "epoch": 2.68, "learning_rate": 0.0002195346244949143, "loss": 2.3742, "step": 38500 }, { "epoch": 2.72, "learning_rate": 0.00021848961961822485, "loss": 2.3393, "step": 39000 }, { "epoch": 2.75, "learning_rate": 0.00021744461474153544, "loss": 2.327, "step": 39500 }, { "epoch": 2.79, "learning_rate": 0.000216399609864846, "loss": 2.3567, "step": 40000 }, { "epoch": 2.79, "eval_gen_len": 12.6851, "eval_loss": 2.207930088043213, "eval_rouge1": 56.5318, "eval_rouge2": 36.9437, "eval_rougeL": 53.8359, "eval_rougeLsum": 53.8254, "eval_runtime": 109.4256, "eval_samples_per_second": 44.313, "eval_steps_per_second": 5.547, "step": 40000 }, { "epoch": 2.82, "learning_rate": 0.0002153546049881566, "loss": 2.3582, "step": 40500 }, { "epoch": 2.86, "learning_rate": 0.00021430960011146718, "loss": 2.3617, "step": 41000 }, { "epoch": 2.89, "learning_rate": 0.00021326459523477774, "loss": 2.3521, "step": 41500 }, { "epoch": 2.93, "learning_rate": 0.00021221959035808832, "loss": 2.3396, "step": 42000 }, { "epoch": 2.96, "learning_rate": 0.00021117458548139888, "loss": 2.3598, "step": 42500 }, { "epoch": 3.0, "learning_rate": 0.00021012958060470945, "loss": 2.3396, "step": 43000 }, { "epoch": 3.03, "learning_rate": 0.00020908457572802006, "loss": 2.1784, "step": 43500 }, { "epoch": 3.07, "learning_rate": 0.00020803957085133062, "loss": 2.2141, "step": 44000 }, { "epoch": 3.1, "learning_rate": 0.0002069945659746412, "loss": 2.2108, "step": 44500 }, { "epoch": 3.14, "learning_rate": 0.00020594956109795177, "loss": 2.1753, "step": 45000 }, { "epoch": 3.14, "eval_gen_len": 12.67, "eval_loss": 2.216768980026245, "eval_rouge1": 56.3831, "eval_rouge2": 36.8896, "eval_rougeL": 53.6542, "eval_rougeLsum": 53.6708, "eval_runtime": 109.4674, "eval_samples_per_second": 44.296, "eval_steps_per_second": 5.545, "step": 45000 }, { "epoch": 3.17, "learning_rate": 0.00020490455622126233, "loss": 2.1618, "step": 45500 }, { "epoch": 3.2, "learning_rate": 0.00020385955134457294, "loss": 2.205, "step": 46000 }, { "epoch": 3.24, "learning_rate": 0.0002028145464678835, "loss": 2.2087, "step": 46500 }, { "epoch": 3.27, "learning_rate": 0.00020176954159119406, "loss": 2.1862, "step": 47000 }, { "epoch": 3.31, "learning_rate": 0.00020072453671450465, "loss": 2.1947, "step": 47500 }, { "epoch": 3.34, "learning_rate": 0.0001996795318378152, "loss": 2.203, "step": 48000 }, { "epoch": 3.38, "learning_rate": 0.00019863452696112583, "loss": 2.2225, "step": 48500 }, { "epoch": 3.41, "learning_rate": 0.0001975895220844364, "loss": 2.2253, "step": 49000 }, { "epoch": 3.45, "learning_rate": 0.00019654451720774695, "loss": 2.2147, "step": 49500 }, { "epoch": 3.48, "learning_rate": 0.00019549951233105754, "loss": 2.2069, "step": 50000 }, { "epoch": 3.48, "eval_gen_len": 12.8014, "eval_loss": 2.2055139541625977, "eval_rouge1": 56.7171, "eval_rouge2": 37.1665, "eval_rougeL": 53.9299, "eval_rougeLsum": 53.9259, "eval_runtime": 108.9678, "eval_samples_per_second": 44.499, "eval_steps_per_second": 5.57, "step": 50000 }, { "epoch": 3.52, "learning_rate": 0.0001944545074543681, "loss": 2.2185, "step": 50500 }, { "epoch": 3.55, "learning_rate": 0.00019340950257767866, "loss": 2.2145, "step": 51000 }, { "epoch": 3.59, "learning_rate": 0.00019236449770098927, "loss": 2.2661, "step": 51500 }, { "epoch": 3.62, "learning_rate": 0.00019131949282429983, "loss": 2.2281, "step": 52000 }, { "epoch": 3.66, "learning_rate": 0.00019027448794761042, "loss": 2.2344, "step": 52500 }, { "epoch": 3.69, "learning_rate": 0.00018922948307092098, "loss": 2.2283, "step": 53000 }, { "epoch": 3.73, "learning_rate": 0.00018818447819423154, "loss": 2.2424, "step": 53500 }, { "epoch": 3.76, "learning_rate": 0.00018713947331754215, "loss": 2.2265, "step": 54000 }, { "epoch": 3.8, "learning_rate": 0.00018609446844085271, "loss": 2.1928, "step": 54500 }, { "epoch": 3.83, "learning_rate": 0.00018504946356416328, "loss": 2.2396, "step": 55000 }, { "epoch": 3.83, "eval_gen_len": 12.7989, "eval_loss": 2.1801397800445557, "eval_rouge1": 56.936, "eval_rouge2": 37.5465, "eval_rougeL": 54.1064, "eval_rougeLsum": 54.1125, "eval_runtime": 110.8635, "eval_samples_per_second": 43.738, "eval_steps_per_second": 5.475, "step": 55000 }, { "epoch": 3.87, "learning_rate": 0.00018400445868747386, "loss": 2.2357, "step": 55500 }, { "epoch": 3.9, "learning_rate": 0.00018295945381078442, "loss": 2.2118, "step": 56000 }, { "epoch": 3.94, "learning_rate": 0.00018191444893409504, "loss": 2.226, "step": 56500 }, { "epoch": 3.97, "learning_rate": 0.0001808694440574056, "loss": 2.213, "step": 57000 }, { "epoch": 4.01, "learning_rate": 0.00017982443918071616, "loss": 2.2106, "step": 57500 }, { "epoch": 4.04, "learning_rate": 0.00017877943430402675, "loss": 2.0718, "step": 58000 }, { "epoch": 4.08, "learning_rate": 0.0001777344294273373, "loss": 2.1013, "step": 58500 }, { "epoch": 4.11, "learning_rate": 0.00017668942455064787, "loss": 2.0646, "step": 59000 }, { "epoch": 4.15, "learning_rate": 0.00017564441967395848, "loss": 2.102, "step": 59500 }, { "epoch": 4.18, "learning_rate": 0.00017459941479726904, "loss": 2.0657, "step": 60000 }, { "epoch": 4.18, "eval_gen_len": 12.6987, "eval_loss": 2.1915152072906494, "eval_rouge1": 56.6312, "eval_rouge2": 37.1618, "eval_rougeL": 53.8646, "eval_rougeLsum": 53.8791, "eval_runtime": 111.8739, "eval_samples_per_second": 43.343, "eval_steps_per_second": 5.426, "step": 60000 }, { "epoch": 4.21, "learning_rate": 0.00017355440992057963, "loss": 2.0758, "step": 60500 }, { "epoch": 4.25, "learning_rate": 0.0001725094050438902, "loss": 2.1113, "step": 61000 }, { "epoch": 4.28, "learning_rate": 0.00017146440016720075, "loss": 2.1134, "step": 61500 }, { "epoch": 4.32, "learning_rate": 0.00017041939529051134, "loss": 2.1019, "step": 62000 }, { "epoch": 4.35, "learning_rate": 0.00016937439041382193, "loss": 2.0924, "step": 62500 }, { "epoch": 4.39, "learning_rate": 0.00016832938553713249, "loss": 2.1106, "step": 63000 }, { "epoch": 4.42, "learning_rate": 0.00016728438066044307, "loss": 2.1112, "step": 63500 }, { "epoch": 4.46, "learning_rate": 0.00016623937578375363, "loss": 2.0951, "step": 64000 }, { "epoch": 4.49, "learning_rate": 0.00016519437090706422, "loss": 2.1011, "step": 64500 }, { "epoch": 4.53, "learning_rate": 0.0001641493660303748, "loss": 2.0806, "step": 65000 }, { "epoch": 4.53, "eval_gen_len": 12.715, "eval_loss": 2.180889844894409, "eval_rouge1": 56.6599, "eval_rouge2": 37.1282, "eval_rougeL": 53.8838, "eval_rougeLsum": 53.8781, "eval_runtime": 110.8084, "eval_samples_per_second": 43.76, "eval_steps_per_second": 5.478, "step": 65000 }, { "epoch": 4.56, "learning_rate": 0.00016310436115368537, "loss": 2.1025, "step": 65500 }, { "epoch": 4.6, "learning_rate": 0.00016205935627699596, "loss": 2.1107, "step": 66000 }, { "epoch": 4.63, "learning_rate": 0.00016101435140030652, "loss": 2.1125, "step": 66500 }, { "epoch": 4.67, "learning_rate": 0.00015996934652361708, "loss": 2.0816, "step": 67000 }, { "epoch": 4.7, "learning_rate": 0.00015892434164692767, "loss": 2.1276, "step": 67500 }, { "epoch": 4.74, "learning_rate": 0.00015787933677023825, "loss": 2.1125, "step": 68000 }, { "epoch": 4.77, "learning_rate": 0.00015683433189354884, "loss": 2.1045, "step": 68500 }, { "epoch": 4.81, "learning_rate": 0.0001557893270168594, "loss": 2.1052, "step": 69000 }, { "epoch": 4.84, "learning_rate": 0.00015474432214016996, "loss": 2.1224, "step": 69500 }, { "epoch": 4.88, "learning_rate": 0.00015369931726348055, "loss": 2.0933, "step": 70000 }, { "epoch": 4.88, "eval_gen_len": 12.6593, "eval_loss": 2.1771466732025146, "eval_rouge1": 56.5891, "eval_rouge2": 36.9461, "eval_rougeL": 53.8058, "eval_rougeLsum": 53.8087, "eval_runtime": 110.0911, "eval_samples_per_second": 44.045, "eval_steps_per_second": 5.514, "step": 70000 }, { "epoch": 4.91, "learning_rate": 0.0001526543123867911, "loss": 2.1076, "step": 70500 }, { "epoch": 4.95, "learning_rate": 0.0001516093075101017, "loss": 2.0704, "step": 71000 }, { "epoch": 4.98, "learning_rate": 0.00015056430263341228, "loss": 2.0975, "step": 71500 }, { "epoch": 5.02, "learning_rate": 0.00014951929775672285, "loss": 2.0603, "step": 72000 }, { "epoch": 5.05, "learning_rate": 0.00014847429288003343, "loss": 1.9328, "step": 72500 }, { "epoch": 5.09, "learning_rate": 0.000147429288003344, "loss": 1.9873, "step": 73000 }, { "epoch": 5.12, "learning_rate": 0.00014638428312665458, "loss": 1.9887, "step": 73500 }, { "epoch": 5.16, "learning_rate": 0.00014533927824996514, "loss": 1.9416, "step": 74000 }, { "epoch": 5.19, "learning_rate": 0.00014429427337327573, "loss": 2.0074, "step": 74500 }, { "epoch": 5.23, "learning_rate": 0.00014324926849658632, "loss": 1.9949, "step": 75000 }, { "epoch": 5.23, "eval_gen_len": 12.6723, "eval_loss": 2.1931562423706055, "eval_rouge1": 56.4956, "eval_rouge2": 36.9679, "eval_rougeL": 53.7634, "eval_rougeLsum": 53.7731, "eval_runtime": 108.5365, "eval_samples_per_second": 44.676, "eval_steps_per_second": 5.593, "step": 75000 }, { "epoch": 5.26, "learning_rate": 0.00014220426361989688, "loss": 2.002, "step": 75500 }, { "epoch": 5.29, "learning_rate": 0.00014115925874320744, "loss": 2.0161, "step": 76000 }, { "epoch": 5.33, "learning_rate": 0.00014011425386651802, "loss": 2.0139, "step": 76500 }, { "epoch": 5.36, "learning_rate": 0.0001390692489898286, "loss": 1.9874, "step": 77000 }, { "epoch": 5.4, "learning_rate": 0.0001380242441131392, "loss": 1.961, "step": 77500 }, { "epoch": 5.43, "learning_rate": 0.00013697923923644976, "loss": 2.0082, "step": 78000 }, { "epoch": 5.47, "learning_rate": 0.00013593423435976032, "loss": 2.0175, "step": 78500 }, { "epoch": 5.5, "learning_rate": 0.0001348892294830709, "loss": 1.987, "step": 79000 }, { "epoch": 5.54, "learning_rate": 0.0001338442246063815, "loss": 2.0167, "step": 79500 }, { "epoch": 5.57, "learning_rate": 0.00013279921972969206, "loss": 1.9954, "step": 80000 }, { "epoch": 5.57, "eval_gen_len": 12.6599, "eval_loss": 2.181297779083252, "eval_rouge1": 56.4827, "eval_rouge2": 36.8319, "eval_rougeL": 53.6397, "eval_rougeLsum": 53.6399, "eval_runtime": 111.2169, "eval_samples_per_second": 43.6, "eval_steps_per_second": 5.458, "step": 80000 }, { "epoch": 5.61, "learning_rate": 0.00013175421485300264, "loss": 1.9816, "step": 80500 }, { "epoch": 5.64, "learning_rate": 0.0001307092099763132, "loss": 2.0116, "step": 81000 }, { "epoch": 5.68, "learning_rate": 0.0001296642050996238, "loss": 2.0182, "step": 81500 }, { "epoch": 5.71, "learning_rate": 0.00012861920022293435, "loss": 2.0032, "step": 82000 }, { "epoch": 5.75, "learning_rate": 0.00012757419534624494, "loss": 1.9978, "step": 82500 }, { "epoch": 5.78, "learning_rate": 0.0001265291904695555, "loss": 2.0083, "step": 83000 }, { "epoch": 5.82, "learning_rate": 0.0001254841855928661, "loss": 2.0017, "step": 83500 }, { "epoch": 5.85, "learning_rate": 0.00012443918071617665, "loss": 2.0163, "step": 84000 }, { "epoch": 5.89, "learning_rate": 0.00012339417583948724, "loss": 2.001, "step": 84500 }, { "epoch": 5.92, "learning_rate": 0.00012234917096279782, "loss": 1.9912, "step": 85000 }, { "epoch": 5.92, "eval_gen_len": 12.7534, "eval_loss": 2.1754705905914307, "eval_rouge1": 56.6723, "eval_rouge2": 37.0432, "eval_rougeL": 53.8339, "eval_rougeLsum": 53.8233, "eval_runtime": 111.456, "eval_samples_per_second": 43.506, "eval_steps_per_second": 5.446, "step": 85000 }, { "epoch": 5.96, "learning_rate": 0.0001213041660861084, "loss": 2.0087, "step": 85500 }, { "epoch": 5.99, "learning_rate": 0.00012025916120941897, "loss": 1.9864, "step": 86000 }, { "epoch": 6.03, "learning_rate": 0.00011921415633272953, "loss": 1.9169, "step": 86500 }, { "epoch": 6.06, "learning_rate": 0.00011816915145604012, "loss": 1.8964, "step": 87000 }, { "epoch": 6.1, "learning_rate": 0.00011712414657935069, "loss": 1.8886, "step": 87500 }, { "epoch": 6.13, "learning_rate": 0.00011607914170266128, "loss": 1.9246, "step": 88000 }, { "epoch": 6.17, "learning_rate": 0.00011503413682597184, "loss": 1.9266, "step": 88500 }, { "epoch": 6.2, "learning_rate": 0.00011398913194928242, "loss": 1.8718, "step": 89000 }, { "epoch": 6.24, "learning_rate": 0.000112944127072593, "loss": 1.8617, "step": 89500 }, { "epoch": 6.27, "learning_rate": 0.00011189912219590358, "loss": 1.9068, "step": 90000 }, { "epoch": 6.27, "eval_gen_len": 12.7037, "eval_loss": 2.184929847717285, "eval_rouge1": 56.6574, "eval_rouge2": 37.0691, "eval_rougeL": 53.9029, "eval_rougeLsum": 53.892, "eval_runtime": 109.9104, "eval_samples_per_second": 44.118, "eval_steps_per_second": 5.523, "step": 90000 }, { "epoch": 6.3, "learning_rate": 0.00011085411731921414, "loss": 1.8786, "step": 90500 }, { "epoch": 6.34, "learning_rate": 0.00010980911244252472, "loss": 1.9071, "step": 91000 }, { "epoch": 6.37, "learning_rate": 0.0001087641075658353, "loss": 1.8807, "step": 91500 }, { "epoch": 6.41, "learning_rate": 0.00010771910268914589, "loss": 1.9267, "step": 92000 }, { "epoch": 6.44, "learning_rate": 0.00010667409781245645, "loss": 1.9136, "step": 92500 }, { "epoch": 6.48, "learning_rate": 0.00010562909293576702, "loss": 1.9075, "step": 93000 }, { "epoch": 6.51, "learning_rate": 0.00010458408805907761, "loss": 1.908, "step": 93500 }, { "epoch": 6.55, "learning_rate": 0.00010353908318238818, "loss": 1.9132, "step": 94000 }, { "epoch": 6.58, "learning_rate": 0.00010249407830569874, "loss": 1.9013, "step": 94500 }, { "epoch": 6.62, "learning_rate": 0.00010144907342900933, "loss": 1.9173, "step": 95000 }, { "epoch": 6.62, "eval_gen_len": 12.6467, "eval_loss": 2.1786956787109375, "eval_rouge1": 56.5701, "eval_rouge2": 36.861, "eval_rougeL": 53.6855, "eval_rougeLsum": 53.6699, "eval_runtime": 109.3551, "eval_samples_per_second": 44.342, "eval_steps_per_second": 5.551, "step": 95000 }, { "epoch": 6.65, "learning_rate": 0.0001004040685523199, "loss": 1.9182, "step": 95500 }, { "epoch": 6.69, "learning_rate": 9.935906367563048e-05, "loss": 1.8904, "step": 96000 }, { "epoch": 6.72, "learning_rate": 9.831405879894105e-05, "loss": 1.9399, "step": 96500 }, { "epoch": 6.76, "learning_rate": 9.726905392225163e-05, "loss": 1.9188, "step": 97000 }, { "epoch": 6.79, "learning_rate": 9.622404904556221e-05, "loss": 1.8742, "step": 97500 }, { "epoch": 6.83, "learning_rate": 9.517904416887279e-05, "loss": 1.9191, "step": 98000 }, { "epoch": 6.86, "learning_rate": 9.413403929218335e-05, "loss": 1.9207, "step": 98500 }, { "epoch": 6.9, "learning_rate": 9.308903441549394e-05, "loss": 1.9416, "step": 99000 }, { "epoch": 6.93, "learning_rate": 9.204402953880451e-05, "loss": 1.9364, "step": 99500 }, { "epoch": 6.97, "learning_rate": 9.099902466211508e-05, "loss": 1.9131, "step": 100000 }, { "epoch": 6.97, "eval_gen_len": 12.7072, "eval_loss": 2.186249017715454, "eval_rouge1": 56.7175, "eval_rouge2": 37.0749, "eval_rougeL": 53.8761, "eval_rougeLsum": 53.8794, "eval_runtime": 109.9963, "eval_samples_per_second": 44.083, "eval_steps_per_second": 5.518, "step": 100000 }, { "epoch": 7.0, "learning_rate": 8.995401978542566e-05, "loss": 1.9358, "step": 100500 }, { "epoch": 7.04, "learning_rate": 8.890901490873623e-05, "loss": 1.8457, "step": 101000 }, { "epoch": 7.07, "learning_rate": 8.78640100320468e-05, "loss": 1.821, "step": 101500 }, { "epoch": 7.11, "learning_rate": 8.681900515535739e-05, "loss": 1.8181, "step": 102000 }, { "epoch": 7.14, "learning_rate": 8.577400027866795e-05, "loss": 1.8372, "step": 102500 }, { "epoch": 7.18, "learning_rate": 8.472899540197853e-05, "loss": 1.8143, "step": 103000 }, { "epoch": 7.21, "learning_rate": 8.368399052528912e-05, "loss": 1.8175, "step": 103500 }, { "epoch": 7.25, "learning_rate": 8.263898564859969e-05, "loss": 1.8245, "step": 104000 }, { "epoch": 7.28, "learning_rate": 8.159398077191025e-05, "loss": 1.8187, "step": 104500 }, { "epoch": 7.32, "learning_rate": 8.054897589522084e-05, "loss": 1.8164, "step": 105000 }, { "epoch": 7.32, "eval_gen_len": 12.6364, "eval_loss": 2.1999216079711914, "eval_rouge1": 56.6104, "eval_rouge2": 37.0809, "eval_rougeL": 53.8098, "eval_rougeLsum": 53.8216, "eval_runtime": 110.0832, "eval_samples_per_second": 44.049, "eval_steps_per_second": 5.514, "step": 105000 }, { "epoch": 7.35, "learning_rate": 7.950397101853141e-05, "loss": 1.8402, "step": 105500 }, { "epoch": 7.38, "learning_rate": 7.8458966141842e-05, "loss": 1.8282, "step": 106000 }, { "epoch": 7.42, "learning_rate": 7.741396126515256e-05, "loss": 1.811, "step": 106500 }, { "epoch": 7.45, "learning_rate": 7.636895638846313e-05, "loss": 1.8315, "step": 107000 }, { "epoch": 7.49, "learning_rate": 7.532395151177372e-05, "loss": 1.8636, "step": 107500 }, { "epoch": 7.52, "learning_rate": 7.42789466350843e-05, "loss": 1.8146, "step": 108000 }, { "epoch": 7.56, "learning_rate": 7.323394175839487e-05, "loss": 1.8336, "step": 108500 }, { "epoch": 7.59, "learning_rate": 7.218893688170544e-05, "loss": 1.8256, "step": 109000 }, { "epoch": 7.63, "learning_rate": 7.114393200501602e-05, "loss": 1.8438, "step": 109500 }, { "epoch": 7.66, "learning_rate": 7.009892712832659e-05, "loss": 1.8489, "step": 110000 }, { "epoch": 7.66, "eval_gen_len": 12.5741, "eval_loss": 2.1944735050201416, "eval_rouge1": 56.6645, "eval_rouge2": 37.1267, "eval_rougeL": 53.9009, "eval_rougeLsum": 53.9008, "eval_runtime": 109.0675, "eval_samples_per_second": 44.459, "eval_steps_per_second": 5.565, "step": 110000 }, { "epoch": 7.7, "learning_rate": 6.905392225163716e-05, "loss": 1.7998, "step": 110500 }, { "epoch": 7.73, "learning_rate": 6.800891737494774e-05, "loss": 1.8382, "step": 111000 }, { "epoch": 7.77, "learning_rate": 6.696391249825833e-05, "loss": 1.8198, "step": 111500 }, { "epoch": 7.8, "learning_rate": 6.591890762156889e-05, "loss": 1.8558, "step": 112000 }, { "epoch": 7.84, "learning_rate": 6.487390274487947e-05, "loss": 1.8395, "step": 112500 }, { "epoch": 7.87, "learning_rate": 6.382889786819005e-05, "loss": 1.8386, "step": 113000 }, { "epoch": 7.91, "learning_rate": 6.278389299150062e-05, "loss": 1.8703, "step": 113500 }, { "epoch": 7.94, "learning_rate": 6.17388881148112e-05, "loss": 1.8576, "step": 114000 }, { "epoch": 7.98, "learning_rate": 6.069388323812178e-05, "loss": 1.8179, "step": 114500 }, { "epoch": 8.01, "learning_rate": 5.9648878361432344e-05, "loss": 1.82, "step": 115000 }, { "epoch": 8.01, "eval_gen_len": 12.6428, "eval_loss": 2.2074716091156006, "eval_rouge1": 56.6075, "eval_rouge2": 37.0359, "eval_rougeL": 53.8792, "eval_rougeLsum": 53.8833, "eval_runtime": 110.9343, "eval_samples_per_second": 43.711, "eval_steps_per_second": 5.472, "step": 115000 }, { "epoch": 8.05, "learning_rate": 5.8603873484742925e-05, "loss": 1.7476, "step": 115500 }, { "epoch": 8.08, "learning_rate": 5.75588686080535e-05, "loss": 1.798, "step": 116000 }, { "epoch": 8.12, "learning_rate": 5.651386373136407e-05, "loss": 1.7595, "step": 116500 }, { "epoch": 8.15, "learning_rate": 5.546885885467465e-05, "loss": 1.7551, "step": 117000 }, { "epoch": 8.19, "learning_rate": 5.442385397798523e-05, "loss": 1.7635, "step": 117500 }, { "epoch": 8.22, "learning_rate": 5.33788491012958e-05, "loss": 1.7637, "step": 118000 }, { "epoch": 8.26, "learning_rate": 5.2333844224606375e-05, "loss": 1.7355, "step": 118500 }, { "epoch": 8.29, "learning_rate": 5.128883934791695e-05, "loss": 1.7711, "step": 119000 }, { "epoch": 8.33, "learning_rate": 5.024383447122753e-05, "loss": 1.7786, "step": 119500 }, { "epoch": 8.36, "learning_rate": 4.91988295945381e-05, "loss": 1.772, "step": 120000 }, { "epoch": 8.36, "eval_gen_len": 12.6591, "eval_loss": 2.2067320346832275, "eval_rouge1": 56.4716, "eval_rouge2": 36.8675, "eval_rougeL": 53.6826, "eval_rougeLsum": 53.6742, "eval_runtime": 109.3817, "eval_samples_per_second": 44.331, "eval_steps_per_second": 5.549, "step": 120000 }, { "epoch": 8.39, "learning_rate": 4.815382471784868e-05, "loss": 1.8009, "step": 120500 }, { "epoch": 8.43, "learning_rate": 4.710881984115925e-05, "loss": 1.7549, "step": 121000 }, { "epoch": 8.46, "learning_rate": 4.606381496446983e-05, "loss": 1.7607, "step": 121500 }, { "epoch": 8.5, "learning_rate": 4.50188100877804e-05, "loss": 1.7686, "step": 122000 }, { "epoch": 8.53, "learning_rate": 4.397380521109098e-05, "loss": 1.7666, "step": 122500 }, { "epoch": 8.57, "learning_rate": 4.2928800334401555e-05, "loss": 1.7789, "step": 123000 }, { "epoch": 8.6, "learning_rate": 4.1883795457712136e-05, "loss": 1.7836, "step": 123500 }, { "epoch": 8.64, "learning_rate": 4.08387905810227e-05, "loss": 1.8019, "step": 124000 }, { "epoch": 8.67, "learning_rate": 3.9793785704333284e-05, "loss": 1.7651, "step": 124500 }, { "epoch": 8.71, "learning_rate": 3.874878082764386e-05, "loss": 1.7795, "step": 125000 }, { "epoch": 8.71, "eval_gen_len": 12.608, "eval_loss": 2.205610990524292, "eval_rouge1": 56.4112, "eval_rouge2": 36.9011, "eval_rougeL": 53.6554, "eval_rougeLsum": 53.6495, "eval_runtime": 111.1925, "eval_samples_per_second": 43.609, "eval_steps_per_second": 5.459, "step": 125000 }, { "epoch": 8.74, "learning_rate": 3.770377595095443e-05, "loss": 1.7496, "step": 125500 }, { "epoch": 8.78, "learning_rate": 3.665877107426501e-05, "loss": 1.7984, "step": 126000 }, { "epoch": 8.81, "learning_rate": 3.5613766197575586e-05, "loss": 1.781, "step": 126500 }, { "epoch": 8.85, "learning_rate": 3.456876132088616e-05, "loss": 1.783, "step": 127000 }, { "epoch": 8.88, "learning_rate": 3.3523756444196734e-05, "loss": 1.7638, "step": 127500 }, { "epoch": 8.92, "learning_rate": 3.2478751567507315e-05, "loss": 1.7673, "step": 128000 }, { "epoch": 8.95, "learning_rate": 3.143374669081789e-05, "loss": 1.7821, "step": 128500 }, { "epoch": 8.99, "learning_rate": 3.0388741814128463e-05, "loss": 1.7632, "step": 129000 }, { "epoch": 9.02, "learning_rate": 2.9343736937439037e-05, "loss": 1.7608, "step": 129500 }, { "epoch": 9.06, "learning_rate": 2.8298732060749614e-05, "loss": 1.72, "step": 130000 }, { "epoch": 9.06, "eval_gen_len": 12.6758, "eval_loss": 2.2197024822235107, "eval_rouge1": 56.4735, "eval_rouge2": 36.9255, "eval_rougeL": 53.6592, "eval_rougeLsum": 53.6463, "eval_runtime": 112.7473, "eval_samples_per_second": 43.008, "eval_steps_per_second": 5.384, "step": 130000 }, { "epoch": 9.09, "learning_rate": 2.725372718406019e-05, "loss": 1.7181, "step": 130500 }, { "epoch": 9.13, "learning_rate": 2.6208722307370766e-05, "loss": 1.6894, "step": 131000 }, { "epoch": 9.16, "learning_rate": 2.516371743068134e-05, "loss": 1.7012, "step": 131500 }, { "epoch": 9.2, "learning_rate": 2.4118712553991917e-05, "loss": 1.7261, "step": 132000 }, { "epoch": 9.23, "learning_rate": 2.307370767730249e-05, "loss": 1.752, "step": 132500 }, { "epoch": 9.27, "learning_rate": 2.202870280061307e-05, "loss": 1.7228, "step": 133000 }, { "epoch": 9.3, "learning_rate": 2.0983697923923643e-05, "loss": 1.7194, "step": 133500 }, { "epoch": 9.34, "learning_rate": 1.9938693047234217e-05, "loss": 1.7077, "step": 134000 }, { "epoch": 9.37, "learning_rate": 1.8893688170544794e-05, "loss": 1.7183, "step": 134500 }, { "epoch": 9.41, "learning_rate": 1.7848683293855368e-05, "loss": 1.7174, "step": 135000 }, { "epoch": 9.41, "eval_gen_len": 12.6568, "eval_loss": 2.216855764389038, "eval_rouge1": 56.4209, "eval_rouge2": 36.8139, "eval_rougeL": 53.5778, "eval_rougeLsum": 53.5685, "eval_runtime": 109.2185, "eval_samples_per_second": 44.397, "eval_steps_per_second": 5.558, "step": 135000 }, { "epoch": 9.44, "learning_rate": 1.6803678417165945e-05, "loss": 1.7175, "step": 135500 }, { "epoch": 9.47, "learning_rate": 1.575867354047652e-05, "loss": 1.7225, "step": 136000 }, { "epoch": 9.51, "learning_rate": 1.4713668663787095e-05, "loss": 1.7305, "step": 136500 }, { "epoch": 9.54, "learning_rate": 1.366866378709767e-05, "loss": 1.7352, "step": 137000 }, { "epoch": 9.58, "learning_rate": 1.2623658910408248e-05, "loss": 1.727, "step": 137500 }, { "epoch": 9.61, "learning_rate": 1.1578654033718824e-05, "loss": 1.7204, "step": 138000 }, { "epoch": 9.65, "learning_rate": 1.05336491570294e-05, "loss": 1.7324, "step": 138500 }, { "epoch": 9.68, "learning_rate": 9.488644280339975e-06, "loss": 1.7106, "step": 139000 }, { "epoch": 9.72, "learning_rate": 8.443639403650549e-06, "loss": 1.7305, "step": 139500 }, { "epoch": 9.75, "learning_rate": 7.398634526961125e-06, "loss": 1.7466, "step": 140000 }, { "epoch": 9.75, "eval_gen_len": 12.6416, "eval_loss": 2.2165005207061768, "eval_rouge1": 56.3715, "eval_rouge2": 36.767, "eval_rougeL": 53.555, "eval_rougeLsum": 53.5468, "eval_runtime": 113.5456, "eval_samples_per_second": 42.705, "eval_steps_per_second": 5.346, "step": 140000 }, { "epoch": 9.79, "learning_rate": 6.3536296502717004e-06, "loss": 1.7393, "step": 140500 }, { "epoch": 9.82, "learning_rate": 5.308624773582276e-06, "loss": 1.7315, "step": 141000 }, { "epoch": 9.86, "learning_rate": 4.263619896892852e-06, "loss": 1.7313, "step": 141500 }, { "epoch": 9.89, "learning_rate": 3.2186150202034275e-06, "loss": 1.741, "step": 142000 }, { "epoch": 9.93, "learning_rate": 2.1736101435140028e-06, "loss": 1.7116, "step": 142500 }, { "epoch": 9.96, "learning_rate": 1.1286052668245784e-06, "loss": 1.7329, "step": 143000 }, { "epoch": 10.0, "learning_rate": 8.360039013515396e-08, "loss": 1.736, "step": 143500 }, { "epoch": 10.0, "step": 143540, "total_flos": 2.0621473641824256e+16, "train_loss": 2.137477010615576, "train_runtime": 21978.9943, "train_samples_per_second": 52.245, "train_steps_per_second": 6.531 } ], "max_steps": 143540, "num_train_epochs": 10, "total_flos": 2.0621473641824256e+16, "trial_name": null, "trial_params": null }