{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8713160404573231, "global_step": 74000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 0.00029941127294563693, "loss": 4.087, "step": 500 }, { "epoch": 0.01, "learning_rate": 0.0002988225458912739, "loss": 3.7168, "step": 1000 }, { "epoch": 0.02, "learning_rate": 0.0002982338188369108, "loss": 3.5567, "step": 1500 }, { "epoch": 0.02, "learning_rate": 0.00029764509178254774, "loss": 3.4939, "step": 2000 }, { "epoch": 0.03, "learning_rate": 0.0002970563647281847, "loss": 3.4094, "step": 2500 }, { "epoch": 0.04, "learning_rate": 0.00029646763767382164, "loss": 3.4297, "step": 3000 }, { "epoch": 0.04, "learning_rate": 0.0002958789106194586, "loss": 3.3537, "step": 3500 }, { "epoch": 0.05, "learning_rate": 0.0002952901835650955, "loss": 3.3117, "step": 4000 }, { "epoch": 0.05, "learning_rate": 0.0002947014565107325, "loss": 3.2607, "step": 4500 }, { "epoch": 0.06, "learning_rate": 0.0002941127294563694, "loss": 3.2741, "step": 5000 }, { "epoch": 0.06, "eval_gen_len": 18.9343, "eval_loss": 2.923264980316162, "eval_rouge1": 21.2236, "eval_rouge2": 9.7543, "eval_rougeL": 19.1786, "eval_rougeLsum": 19.782, "eval_runtime": 554.7643, "eval_samples_per_second": 18.026, "eval_steps_per_second": 2.253, "step": 5000 }, { "epoch": 0.06, "learning_rate": 0.00029352400240200636, "loss": 3.2403, "step": 5500 }, { "epoch": 0.07, "learning_rate": 0.0002929352753476433, "loss": 3.2089, "step": 6000 }, { "epoch": 0.08, "learning_rate": 0.0002923465482932802, "loss": 3.1972, "step": 6500 }, { "epoch": 0.08, "learning_rate": 0.0002917578212389172, "loss": 3.1384, "step": 7000 }, { "epoch": 0.09, "learning_rate": 0.0002911690941845541, "loss": 3.1834, "step": 7500 }, { "epoch": 0.09, "learning_rate": 0.0002905803671301911, "loss": 3.1484, "step": 8000 }, { "epoch": 0.1, "learning_rate": 0.00028999164007582803, "loss": 3.1509, "step": 8500 }, { "epoch": 0.11, "learning_rate": 0.00028940291302146493, "loss": 3.125, "step": 9000 }, { "epoch": 0.11, "learning_rate": 0.00028881418596710194, "loss": 3.1113, "step": 9500 }, { "epoch": 0.12, "learning_rate": 0.00028822545891273884, "loss": 3.1359, "step": 10000 }, { "epoch": 0.12, "eval_gen_len": 18.9427, "eval_loss": 2.825679302215576, "eval_rouge1": 23.1537, "eval_rouge2": 11.6804, "eval_rougeL": 21.2251, "eval_rougeLsum": 21.8301, "eval_runtime": 547.7371, "eval_samples_per_second": 18.257, "eval_steps_per_second": 2.282, "step": 10000 }, { "epoch": 0.12, "learning_rate": 0.0002876367318583758, "loss": 3.085, "step": 10500 }, { "epoch": 0.13, "learning_rate": 0.00028704800480401274, "loss": 3.0939, "step": 11000 }, { "epoch": 0.14, "learning_rate": 0.0002864592777496497, "loss": 3.0683, "step": 11500 }, { "epoch": 0.14, "learning_rate": 0.00028587055069528665, "loss": 3.0748, "step": 12000 }, { "epoch": 0.15, "learning_rate": 0.00028528182364092355, "loss": 3.0484, "step": 12500 }, { "epoch": 0.15, "learning_rate": 0.0002846930965865605, "loss": 3.0441, "step": 13000 }, { "epoch": 0.16, "learning_rate": 0.00028410436953219746, "loss": 3.0315, "step": 13500 }, { "epoch": 0.16, "learning_rate": 0.0002835156424778344, "loss": 3.0418, "step": 14000 }, { "epoch": 0.17, "learning_rate": 0.00028292691542347137, "loss": 3.036, "step": 14500 }, { "epoch": 0.18, "learning_rate": 0.00028233818836910827, "loss": 3.0571, "step": 15000 }, { "epoch": 0.18, "eval_gen_len": 18.8978, "eval_loss": 2.760854721069336, "eval_rouge1": 23.6484, "eval_rouge2": 12.3437, "eval_rougeL": 21.8478, "eval_rougeLsum": 22.4063, "eval_runtime": 548.4414, "eval_samples_per_second": 18.233, "eval_steps_per_second": 2.279, "step": 15000 }, { "epoch": 0.18, "learning_rate": 0.0002817494613147452, "loss": 3.0305, "step": 15500 }, { "epoch": 0.19, "learning_rate": 0.0002811607342603822, "loss": 3.0116, "step": 16000 }, { "epoch": 0.19, "learning_rate": 0.00028057200720601913, "loss": 3.0284, "step": 16500 }, { "epoch": 0.2, "learning_rate": 0.0002799832801516561, "loss": 3.0063, "step": 17000 }, { "epoch": 0.21, "learning_rate": 0.000279394553097293, "loss": 2.9533, "step": 17500 }, { "epoch": 0.21, "learning_rate": 0.00027880582604292994, "loss": 2.9608, "step": 18000 }, { "epoch": 0.22, "learning_rate": 0.0002782170989885669, "loss": 2.9796, "step": 18500 }, { "epoch": 0.22, "learning_rate": 0.00027762837193420385, "loss": 2.9651, "step": 19000 }, { "epoch": 0.23, "learning_rate": 0.0002770396448798408, "loss": 2.963, "step": 19500 }, { "epoch": 0.24, "learning_rate": 0.0002764509178254777, "loss": 2.9887, "step": 20000 }, { "epoch": 0.24, "eval_gen_len": 18.9275, "eval_loss": 2.7225496768951416, "eval_rouge1": 24.3863, "eval_rouge2": 13.0699, "eval_rougeL": 22.5709, "eval_rougeLsum": 23.1191, "eval_runtime": 548.8094, "eval_samples_per_second": 18.221, "eval_steps_per_second": 2.278, "step": 20000 }, { "epoch": 0.24, "learning_rate": 0.00027586219077111465, "loss": 2.9618, "step": 20500 }, { "epoch": 0.25, "learning_rate": 0.0002752734637167516, "loss": 2.9229, "step": 21000 }, { "epoch": 0.25, "learning_rate": 0.00027468473666238856, "loss": 2.9284, "step": 21500 }, { "epoch": 0.26, "learning_rate": 0.0002740960096080255, "loss": 2.9324, "step": 22000 }, { "epoch": 0.26, "learning_rate": 0.00027350728255366247, "loss": 2.941, "step": 22500 }, { "epoch": 0.27, "learning_rate": 0.0002729185554992994, "loss": 2.9216, "step": 23000 }, { "epoch": 0.28, "learning_rate": 0.0002723298284449363, "loss": 2.9258, "step": 23500 }, { "epoch": 0.28, "learning_rate": 0.0002717411013905733, "loss": 2.9075, "step": 24000 }, { "epoch": 0.29, "learning_rate": 0.00027115237433621023, "loss": 2.9001, "step": 24500 }, { "epoch": 0.29, "learning_rate": 0.0002705636472818472, "loss": 2.8939, "step": 25000 }, { "epoch": 0.29, "eval_gen_len": 18.8512, "eval_loss": 2.6895627975463867, "eval_rouge1": 23.8549, "eval_rouge2": 12.6254, "eval_rougeL": 22.1601, "eval_rougeLsum": 22.688, "eval_runtime": 544.7255, "eval_samples_per_second": 18.358, "eval_steps_per_second": 2.295, "step": 25000 }, { "epoch": 0.3, "learning_rate": 0.00026997492022748414, "loss": 2.8947, "step": 25500 }, { "epoch": 0.31, "learning_rate": 0.00026938619317312104, "loss": 2.9073, "step": 26000 }, { "epoch": 0.31, "learning_rate": 0.000268797466118758, "loss": 2.9035, "step": 26500 }, { "epoch": 0.32, "learning_rate": 0.00026820873906439495, "loss": 2.9128, "step": 27000 }, { "epoch": 0.32, "learning_rate": 0.0002676200120100319, "loss": 2.8639, "step": 27500 }, { "epoch": 0.33, "learning_rate": 0.00026703128495566886, "loss": 2.8944, "step": 28000 }, { "epoch": 0.34, "learning_rate": 0.00026644255790130575, "loss": 2.8952, "step": 28500 }, { "epoch": 0.34, "learning_rate": 0.0002658538308469427, "loss": 2.8956, "step": 29000 }, { "epoch": 0.35, "learning_rate": 0.00026526510379257966, "loss": 2.8959, "step": 29500 }, { "epoch": 0.35, "learning_rate": 0.0002646763767382166, "loss": 2.8614, "step": 30000 }, { "epoch": 0.35, "eval_gen_len": 18.9345, "eval_loss": 2.6755456924438477, "eval_rouge1": 24.3321, "eval_rouge2": 13.0763, "eval_rougeL": 22.6103, "eval_rougeLsum": 23.1247, "eval_runtime": 546.9326, "eval_samples_per_second": 18.284, "eval_steps_per_second": 2.285, "step": 30000 }, { "epoch": 0.36, "learning_rate": 0.00026408764968385357, "loss": 2.8459, "step": 30500 }, { "epoch": 0.37, "learning_rate": 0.00026349892262949047, "loss": 2.8591, "step": 31000 }, { "epoch": 0.37, "learning_rate": 0.0002629101955751274, "loss": 2.8291, "step": 31500 }, { "epoch": 0.38, "learning_rate": 0.0002623214685207644, "loss": 2.8645, "step": 32000 }, { "epoch": 0.38, "learning_rate": 0.00026173274146640133, "loss": 2.8185, "step": 32500 }, { "epoch": 0.39, "learning_rate": 0.0002611440144120383, "loss": 2.8498, "step": 33000 }, { "epoch": 0.39, "learning_rate": 0.0002605552873576752, "loss": 2.8384, "step": 33500 }, { "epoch": 0.4, "learning_rate": 0.00025996656030331214, "loss": 2.8515, "step": 34000 }, { "epoch": 0.41, "learning_rate": 0.0002593778332489491, "loss": 2.8378, "step": 34500 }, { "epoch": 0.41, "learning_rate": 0.00025878910619458605, "loss": 2.849, "step": 35000 }, { "epoch": 0.41, "eval_gen_len": 18.8908, "eval_loss": 2.6249804496765137, "eval_rouge1": 24.2486, "eval_rouge2": 12.8938, "eval_rougeL": 22.5311, "eval_rougeLsum": 23.0375, "eval_runtime": 549.356, "eval_samples_per_second": 18.203, "eval_steps_per_second": 2.275, "step": 35000 }, { "epoch": 0.42, "learning_rate": 0.000258200379140223, "loss": 2.813, "step": 35500 }, { "epoch": 0.42, "learning_rate": 0.00025761165208585996, "loss": 2.8157, "step": 36000 }, { "epoch": 0.43, "learning_rate": 0.00025702292503149686, "loss": 2.83, "step": 36500 }, { "epoch": 0.44, "learning_rate": 0.0002564341979771338, "loss": 2.8324, "step": 37000 }, { "epoch": 0.44, "learning_rate": 0.00025584547092277076, "loss": 2.849, "step": 37500 }, { "epoch": 0.45, "learning_rate": 0.0002552567438684077, "loss": 2.8163, "step": 38000 }, { "epoch": 0.45, "learning_rate": 0.00025466801681404467, "loss": 2.8215, "step": 38500 }, { "epoch": 0.46, "learning_rate": 0.00025407928975968157, "loss": 2.777, "step": 39000 }, { "epoch": 0.47, "learning_rate": 0.0002534905627053185, "loss": 2.8147, "step": 39500 }, { "epoch": 0.47, "learning_rate": 0.0002529018356509555, "loss": 2.7994, "step": 40000 }, { "epoch": 0.47, "eval_gen_len": 18.891, "eval_loss": 2.618790626525879, "eval_rouge1": 24.5722, "eval_rouge2": 13.3685, "eval_rougeL": 22.865, "eval_rougeLsum": 23.3852, "eval_runtime": 548.1205, "eval_samples_per_second": 18.244, "eval_steps_per_second": 2.281, "step": 40000 }, { "epoch": 0.48, "learning_rate": 0.00025231310859659243, "loss": 2.7992, "step": 40500 }, { "epoch": 0.48, "learning_rate": 0.0002517243815422294, "loss": 2.8235, "step": 41000 }, { "epoch": 0.49, "learning_rate": 0.0002511356544878663, "loss": 2.8081, "step": 41500 }, { "epoch": 0.49, "learning_rate": 0.00025054692743350324, "loss": 2.812, "step": 42000 }, { "epoch": 0.5, "learning_rate": 0.0002499582003791402, "loss": 2.7941, "step": 42500 }, { "epoch": 0.51, "learning_rate": 0.00024936947332477715, "loss": 2.8014, "step": 43000 }, { "epoch": 0.51, "learning_rate": 0.0002487807462704141, "loss": 2.7778, "step": 43500 }, { "epoch": 0.52, "learning_rate": 0.000248192019216051, "loss": 2.7734, "step": 44000 }, { "epoch": 0.52, "learning_rate": 0.00024760329216168796, "loss": 2.8093, "step": 44500 }, { "epoch": 0.53, "learning_rate": 0.0002470145651073249, "loss": 2.7899, "step": 45000 }, { "epoch": 0.53, "eval_gen_len": 18.9064, "eval_loss": 2.6196963787078857, "eval_rouge1": 24.3304, "eval_rouge2": 13.157, "eval_rougeL": 22.6459, "eval_rougeLsum": 23.1647, "eval_runtime": 540.7531, "eval_samples_per_second": 18.493, "eval_steps_per_second": 2.312, "step": 45000 }, { "epoch": 0.54, "learning_rate": 0.00024642583805296187, "loss": 2.8157, "step": 45500 }, { "epoch": 0.54, "learning_rate": 0.0002458371109985988, "loss": 2.7576, "step": 46000 }, { "epoch": 0.55, "learning_rate": 0.0002452483839442357, "loss": 2.7482, "step": 46500 }, { "epoch": 0.55, "learning_rate": 0.00024465965688987273, "loss": 2.7704, "step": 47000 }, { "epoch": 0.56, "learning_rate": 0.00024407092983550963, "loss": 2.7597, "step": 47500 }, { "epoch": 0.57, "learning_rate": 0.0002434822027811466, "loss": 2.765, "step": 48000 }, { "epoch": 0.57, "learning_rate": 0.00024289347572678354, "loss": 2.7709, "step": 48500 }, { "epoch": 0.58, "learning_rate": 0.00024230474867242046, "loss": 2.7519, "step": 49000 }, { "epoch": 0.58, "learning_rate": 0.00024171602161805742, "loss": 2.74, "step": 49500 }, { "epoch": 0.59, "learning_rate": 0.00024112729456369434, "loss": 2.786, "step": 50000 }, { "epoch": 0.59, "eval_gen_len": 18.8697, "eval_loss": 2.584320545196533, "eval_rouge1": 24.7437, "eval_rouge2": 13.5493, "eval_rougeL": 23.0267, "eval_rougeLsum": 23.5281, "eval_runtime": 542.416, "eval_samples_per_second": 18.436, "eval_steps_per_second": 2.305, "step": 50000 }, { "epoch": 0.59, "learning_rate": 0.00024053856750933132, "loss": 2.7556, "step": 50500 }, { "epoch": 0.6, "learning_rate": 0.00023994984045496825, "loss": 2.746, "step": 51000 }, { "epoch": 0.61, "learning_rate": 0.00023936111340060518, "loss": 2.767, "step": 51500 }, { "epoch": 0.61, "learning_rate": 0.00023877238634624213, "loss": 2.7261, "step": 52000 }, { "epoch": 0.62, "learning_rate": 0.00023818365929187906, "loss": 2.773, "step": 52500 }, { "epoch": 0.62, "learning_rate": 0.00023759493223751604, "loss": 2.7811, "step": 53000 }, { "epoch": 0.63, "learning_rate": 0.00023700620518315297, "loss": 2.7727, "step": 53500 }, { "epoch": 0.64, "learning_rate": 0.0002364174781287899, "loss": 2.75, "step": 54000 }, { "epoch": 0.64, "learning_rate": 0.00023582875107442685, "loss": 2.7689, "step": 54500 }, { "epoch": 0.65, "learning_rate": 0.0002352400240200638, "loss": 2.7765, "step": 55000 }, { "epoch": 0.65, "eval_gen_len": 18.8908, "eval_loss": 2.5723013877868652, "eval_rouge1": 24.55, "eval_rouge2": 13.501, "eval_rougeL": 22.8132, "eval_rougeLsum": 23.3216, "eval_runtime": 548.4906, "eval_samples_per_second": 18.232, "eval_steps_per_second": 2.279, "step": 55000 }, { "epoch": 0.65, "learning_rate": 0.00023465129696570076, "loss": 2.7603, "step": 55500 }, { "epoch": 0.66, "learning_rate": 0.00023406256991133768, "loss": 2.7369, "step": 56000 }, { "epoch": 0.67, "learning_rate": 0.0002334738428569746, "loss": 2.6958, "step": 56500 }, { "epoch": 0.67, "learning_rate": 0.0002328851158026116, "loss": 2.7746, "step": 57000 }, { "epoch": 0.68, "learning_rate": 0.00023229638874824852, "loss": 2.7596, "step": 57500 }, { "epoch": 0.68, "learning_rate": 0.00023170766169388547, "loss": 2.7118, "step": 58000 }, { "epoch": 0.69, "learning_rate": 0.0002311189346395224, "loss": 2.7248, "step": 58500 }, { "epoch": 0.69, "learning_rate": 0.00023053020758515932, "loss": 2.7307, "step": 59000 }, { "epoch": 0.7, "learning_rate": 0.0002299414805307963, "loss": 2.7487, "step": 59500 }, { "epoch": 0.71, "learning_rate": 0.00022935275347643323, "loss": 2.7591, "step": 60000 }, { "epoch": 0.71, "eval_gen_len": 18.8755, "eval_loss": 2.5769457817077637, "eval_rouge1": 24.78, "eval_rouge2": 13.6656, "eval_rougeL": 23.0842, "eval_rougeLsum": 23.6064, "eval_runtime": 548.2586, "eval_samples_per_second": 18.24, "eval_steps_per_second": 2.28, "step": 60000 }, { "epoch": 0.71, "learning_rate": 0.0002287640264220702, "loss": 2.7621, "step": 60500 }, { "epoch": 0.72, "learning_rate": 0.00022817529936770711, "loss": 2.7554, "step": 61000 }, { "epoch": 0.72, "learning_rate": 0.00022758657231334404, "loss": 2.7484, "step": 61500 }, { "epoch": 0.73, "learning_rate": 0.00022699784525898102, "loss": 2.7762, "step": 62000 }, { "epoch": 0.74, "learning_rate": 0.00022640911820461795, "loss": 2.7248, "step": 62500 }, { "epoch": 0.74, "learning_rate": 0.0002258203911502549, "loss": 2.6949, "step": 63000 }, { "epoch": 0.75, "learning_rate": 0.00022523166409589183, "loss": 2.7287, "step": 63500 }, { "epoch": 0.75, "learning_rate": 0.0002246429370415288, "loss": 2.7017, "step": 64000 }, { "epoch": 0.76, "learning_rate": 0.00022405420998716574, "loss": 2.6926, "step": 64500 }, { "epoch": 0.77, "learning_rate": 0.00022346548293280266, "loss": 2.7254, "step": 65000 }, { "epoch": 0.77, "eval_gen_len": 18.8954, "eval_loss": 2.566603183746338, "eval_rouge1": 24.8284, "eval_rouge2": 13.73, "eval_rougeL": 23.0997, "eval_rougeLsum": 23.6334, "eval_runtime": 557.6451, "eval_samples_per_second": 17.933, "eval_steps_per_second": 2.242, "step": 65000 }, { "epoch": 0.77, "learning_rate": 0.00022287675587843962, "loss": 2.7054, "step": 65500 }, { "epoch": 0.78, "learning_rate": 0.00022228802882407655, "loss": 2.6841, "step": 66000 }, { "epoch": 0.78, "learning_rate": 0.00022169930176971353, "loss": 2.7164, "step": 66500 }, { "epoch": 0.79, "learning_rate": 0.00022111057471535045, "loss": 2.7272, "step": 67000 }, { "epoch": 0.79, "learning_rate": 0.00022052184766098738, "loss": 2.715, "step": 67500 }, { "epoch": 0.8, "learning_rate": 0.00021993312060662433, "loss": 2.6992, "step": 68000 }, { "epoch": 0.81, "learning_rate": 0.0002193443935522613, "loss": 2.7124, "step": 68500 }, { "epoch": 0.81, "learning_rate": 0.00021875566649789824, "loss": 2.685, "step": 69000 }, { "epoch": 0.82, "learning_rate": 0.00021816693944353517, "loss": 2.6867, "step": 69500 }, { "epoch": 0.82, "learning_rate": 0.0002175782123891721, "loss": 2.6814, "step": 70000 }, { "epoch": 0.82, "eval_gen_len": 18.8313, "eval_loss": 2.5359628200531006, "eval_rouge1": 24.849, "eval_rouge2": 13.8366, "eval_rougeL": 23.1903, "eval_rougeLsum": 23.6941, "eval_runtime": 544.4942, "eval_samples_per_second": 18.366, "eval_steps_per_second": 2.296, "step": 70000 }, { "epoch": 0.83, "learning_rate": 0.00021698948533480908, "loss": 2.6868, "step": 70500 }, { "epoch": 0.84, "learning_rate": 0.000216400758280446, "loss": 2.6849, "step": 71000 }, { "epoch": 0.84, "learning_rate": 0.00021581203122608296, "loss": 2.705, "step": 71500 }, { "epoch": 0.85, "learning_rate": 0.00021522330417171988, "loss": 2.7154, "step": 72000 }, { "epoch": 0.85, "learning_rate": 0.0002146345771173568, "loss": 2.6585, "step": 72500 }, { "epoch": 0.86, "learning_rate": 0.0002140458500629938, "loss": 2.6848, "step": 73000 }, { "epoch": 0.87, "learning_rate": 0.00021345712300863072, "loss": 2.6787, "step": 73500 }, { "epoch": 0.87, "learning_rate": 0.00021286839595426767, "loss": 2.646, "step": 74000 } ], "max_steps": 254787, "num_train_epochs": 3, "total_flos": 2.2883060458635264e+17, "trial_name": null, "trial_params": null }