{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.398694794354227, "eval_steps": 750, "global_step": 97500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15176809834572771, "grad_norm": 1.347341537475586, "learning_rate": 1.3333333333333333e-05, "loss": 2.0649, "step": 2000 }, { "epoch": 0.30353619669145543, "grad_norm": 1.299472689628601, "learning_rate": 1.984469638142569e-05, "loss": 1.7903, "step": 4000 }, { "epoch": 0.45530429503718317, "grad_norm": 1.5234107971191406, "learning_rate": 1.9534089144277063e-05, "loss": 1.6734, "step": 6000 }, { "epoch": 0.6070723933829109, "grad_norm": 6.629986763000488, "learning_rate": 1.922348190712844e-05, "loss": 1.6014, "step": 8000 }, { "epoch": 0.7588404917286387, "grad_norm": 1.8827601671218872, "learning_rate": 1.8912874669979814e-05, "loss": 1.5582, "step": 10000 }, { "epoch": 0.9106085900743663, "grad_norm": 1.4713941812515259, "learning_rate": 1.8602267432831186e-05, "loss": 1.5393, "step": 12000 }, { "epoch": 1.0, "eval_loss": 1.4226312637329102, "eval_rouge1": 15.9571, "eval_rouge2": 5.6803, "eval_rougeL": 13.7071, "eval_rougeLsum": 13.8127, "eval_runtime": 167.9928, "eval_samples_per_second": 29.763, "eval_steps_per_second": 3.72, "step": 13178 }, { "epoch": 1.062376688420094, "grad_norm": 1.397419810295105, "learning_rate": 1.829166019568256e-05, "loss": 1.5108, "step": 14000 }, { "epoch": 1.214144786765822, "grad_norm": 1.282456398010254, "learning_rate": 1.7981052958533937e-05, "loss": 1.4931, "step": 16000 }, { "epoch": 1.3659128851115496, "grad_norm": 1.5143241882324219, "learning_rate": 1.767044572138531e-05, "loss": 1.4687, "step": 18000 }, { "epoch": 1.5176809834572773, "grad_norm": 1.0350826978683472, "learning_rate": 1.7359838484236684e-05, "loss": 1.4543, "step": 20000 }, { "epoch": 1.669449081803005, "grad_norm": 1.2214738130569458, "learning_rate": 1.704923124708806e-05, "loss": 1.4466, "step": 22000 }, { "epoch": 1.8212171801487327, "grad_norm": 1.226135492324829, "learning_rate": 1.6738624009939432e-05, "loss": 1.4314, "step": 24000 }, { "epoch": 1.9729852784944604, "grad_norm": 1.2347540855407715, "learning_rate": 1.6428016772790807e-05, "loss": 1.4255, "step": 26000 }, { "epoch": 2.0, "eval_loss": 1.3427051305770874, "eval_rouge1": 16.3039, "eval_rouge2": 6.0757, "eval_rougeL": 14.0031, "eval_rougeLsum": 14.1114, "eval_runtime": 168.2423, "eval_samples_per_second": 29.719, "eval_steps_per_second": 3.715, "step": 26356 }, { "epoch": 2.124753376840188, "grad_norm": 1.024688482284546, "learning_rate": 1.6117409535642183e-05, "loss": 1.4107, "step": 28000 }, { "epoch": 2.2765214751859157, "grad_norm": 1.1046956777572632, "learning_rate": 1.5806802298493555e-05, "loss": 1.3999, "step": 30000 }, { "epoch": 2.428289573531644, "grad_norm": 1.4118067026138306, "learning_rate": 1.549619506134493e-05, "loss": 1.394, "step": 32000 }, { "epoch": 2.5800576718773716, "grad_norm": 1.0044879913330078, "learning_rate": 1.5185587824196304e-05, "loss": 1.3894, "step": 34000 }, { "epoch": 2.7318257702230992, "grad_norm": 3.0287246704101562, "learning_rate": 1.4874980587047681e-05, "loss": 1.3823, "step": 36000 }, { "epoch": 2.883593868568827, "grad_norm": 0.8824607133865356, "learning_rate": 1.4564373349899055e-05, "loss": 1.3747, "step": 38000 }, { "epoch": 3.0, "eval_loss": 1.2928217649459839, "eval_rouge1": 16.2852, "eval_rouge2": 6.139, "eval_rougeL": 14.0119, "eval_rougeLsum": 14.1209, "eval_runtime": 172.2837, "eval_samples_per_second": 29.022, "eval_steps_per_second": 3.628, "step": 39534 }, { "epoch": 3.0353619669145546, "grad_norm": 1.7139147520065308, "learning_rate": 1.4253766112750429e-05, "loss": 1.365, "step": 40000 }, { "epoch": 3.1871300652602823, "grad_norm": 0.9839210510253906, "learning_rate": 1.3943158875601804e-05, "loss": 1.3631, "step": 42000 }, { "epoch": 3.33889816360601, "grad_norm": 2.022289514541626, "learning_rate": 1.3632551638453178e-05, "loss": 1.3497, "step": 44000 }, { "epoch": 3.4906662619517377, "grad_norm": 4.369687080383301, "learning_rate": 1.3321944401304551e-05, "loss": 1.3536, "step": 46000 }, { "epoch": 3.6424343602974654, "grad_norm": 1.046391487121582, "learning_rate": 1.3011337164155927e-05, "loss": 1.3455, "step": 48000 }, { "epoch": 3.794202458643193, "grad_norm": 3.8603522777557373, "learning_rate": 1.27007299270073e-05, "loss": 1.3396, "step": 50000 }, { "epoch": 3.9459705569889207, "grad_norm": 1.02574622631073, "learning_rate": 1.2390122689858674e-05, "loss": 1.3347, "step": 52000 }, { "epoch": 4.0, "eval_loss": 1.2622406482696533, "eval_rouge1": 16.4481, "eval_rouge2": 6.2714, "eval_rougeL": 14.1706, "eval_rougeLsum": 14.2806, "eval_runtime": 167.9003, "eval_samples_per_second": 29.78, "eval_steps_per_second": 3.722, "step": 52712 }, { "epoch": 4.097738655334648, "grad_norm": 1.366310715675354, "learning_rate": 1.2079515452710048e-05, "loss": 1.3304, "step": 54000 }, { "epoch": 4.249506753680376, "grad_norm": 1.1469073295593262, "learning_rate": 1.1768908215561424e-05, "loss": 1.3271, "step": 56000 }, { "epoch": 4.401274852026104, "grad_norm": 1.0787475109100342, "learning_rate": 1.1458300978412797e-05, "loss": 1.3199, "step": 58000 }, { "epoch": 4.5530429503718315, "grad_norm": 1.045688271522522, "learning_rate": 1.1147693741264171e-05, "loss": 1.3229, "step": 60000 }, { "epoch": 4.704811048717559, "grad_norm": 1.0128060579299927, "learning_rate": 1.0837086504115546e-05, "loss": 1.3156, "step": 62000 }, { "epoch": 4.856579147063288, "grad_norm": 1.1346766948699951, "learning_rate": 1.052647926696692e-05, "loss": 1.3186, "step": 64000 }, { "epoch": 5.0, "eval_loss": 1.23964262008667, "eval_rouge1": 16.5213, "eval_rouge2": 6.4307, "eval_rougeL": 14.289, "eval_rougeLsum": 14.3853, "eval_runtime": 170.4893, "eval_samples_per_second": 29.327, "eval_steps_per_second": 3.666, "step": 65890 }, { "epoch": 5.008347245409015, "grad_norm": 1.000510334968567, "learning_rate": 1.0215872029818294e-05, "loss": 1.3074, "step": 66000 }, { "epoch": 5.160115343754743, "grad_norm": 1.4083774089813232, "learning_rate": 9.90526479266967e-06, "loss": 1.3079, "step": 68000 }, { "epoch": 5.311883442100471, "grad_norm": 1.065021276473999, "learning_rate": 9.594657555521045e-06, "loss": 1.3055, "step": 70000 }, { "epoch": 5.4636515404461985, "grad_norm": 0.9340164065361023, "learning_rate": 9.284050318372419e-06, "loss": 1.305, "step": 72000 }, { "epoch": 5.615419638791926, "grad_norm": 0.9457820653915405, "learning_rate": 8.973443081223792e-06, "loss": 1.3015, "step": 74000 }, { "epoch": 5.767187737137654, "grad_norm": 0.8897130489349365, "learning_rate": 8.662835844075168e-06, "loss": 1.2985, "step": 76000 }, { "epoch": 5.9189558354833816, "grad_norm": 1.2775472402572632, "learning_rate": 8.352228606926543e-06, "loss": 1.2973, "step": 78000 }, { "epoch": 6.0, "eval_loss": 1.2244175672531128, "eval_rouge1": 16.4244, "eval_rouge2": 6.384, "eval_rougeL": 14.2167, "eval_rougeLsum": 14.3188, "eval_runtime": 170.4105, "eval_samples_per_second": 29.341, "eval_steps_per_second": 3.668, "step": 79068 }, { "epoch": 6.070723933829109, "grad_norm": 1.0759906768798828, "learning_rate": 8.041621369777917e-06, "loss": 1.2908, "step": 80000 }, { "epoch": 6.222492032174837, "grad_norm": 1.315941333770752, "learning_rate": 7.73101413262929e-06, "loss": 1.2927, "step": 82000 }, { "epoch": 6.374260130520565, "grad_norm": 0.936198353767395, "learning_rate": 7.420406895480665e-06, "loss": 1.2945, "step": 84000 }, { "epoch": 6.526028228866292, "grad_norm": 1.233934998512268, "learning_rate": 7.10979965833204e-06, "loss": 1.285, "step": 86000 }, { "epoch": 6.67779632721202, "grad_norm": 1.1760342121124268, "learning_rate": 6.7991924211834135e-06, "loss": 1.2877, "step": 88000 }, { "epoch": 6.829564425557748, "grad_norm": 2.0586724281311035, "learning_rate": 6.488585184034788e-06, "loss": 1.283, "step": 90000 }, { "epoch": 6.981332523903475, "grad_norm": 0.8292114734649658, "learning_rate": 6.1779779468861636e-06, "loss": 1.2817, "step": 92000 }, { "epoch": 7.0, "eval_loss": 1.21384859085083, "eval_rouge1": 16.5727, "eval_rouge2": 6.4685, "eval_rougeL": 14.3558, "eval_rougeLsum": 14.4703, "eval_runtime": 168.1885, "eval_samples_per_second": 29.729, "eval_steps_per_second": 3.716, "step": 92246 }, { "epoch": 7.133100622249203, "grad_norm": 0.9742059111595154, "learning_rate": 5.867370709737537e-06, "loss": 1.2858, "step": 94000 }, { "epoch": 7.284868720594931, "grad_norm": 1.0256426334381104, "learning_rate": 5.556763472588912e-06, "loss": 1.2804, "step": 96000 } ], "logging_steps": 2000, "max_steps": 131780, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0556092079249818e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }