diff --git "a/checkpoint-17500/trainer_state.json" "b/checkpoint-17500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-17500/trainer_state.json" @@ -0,0 +1,5033 @@ +{ + "best_metric": 1.9262617826461792, + "best_model_checkpoint": "Paraphrase-v3/checkpoint-17500", + "epoch": 7.0, + "eval_steps": 500, + "global_step": 17500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 11.671796798706055, + "learning_rate": 3.2e-07, + "loss": 3.3655, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 9.20463752746582, + "learning_rate": 6.533333333333334e-07, + "loss": 3.2816, + "step": 50 + }, + { + "epoch": 0.03, + "grad_norm": 10.489435195922852, + "learning_rate": 9.733333333333335e-07, + "loss": 3.2049, + "step": 75 + }, + { + "epoch": 0.04, + "grad_norm": 8.335665702819824, + "learning_rate": 1.2933333333333334e-06, + "loss": 3.2663, + "step": 100 + }, + { + "epoch": 0.05, + "grad_norm": 7.564672470092773, + "learning_rate": 1.6133333333333333e-06, + "loss": 3.0361, + "step": 125 + }, + { + "epoch": 0.06, + "grad_norm": 6.445804595947266, + "learning_rate": 1.946666666666667e-06, + "loss": 3.0074, + "step": 150 + }, + { + "epoch": 0.07, + "grad_norm": 5.889507293701172, + "learning_rate": 2.28e-06, + "loss": 2.9401, + "step": 175 + }, + { + "epoch": 0.08, + "grad_norm": 5.881171226501465, + "learning_rate": 2.6133333333333334e-06, + "loss": 2.7827, + "step": 200 + }, + { + "epoch": 0.09, + "grad_norm": 2.8032336235046387, + "learning_rate": 2.9466666666666667e-06, + "loss": 2.7671, + "step": 225 + }, + { + "epoch": 0.1, + "grad_norm": 2.653895616531372, + "learning_rate": 3.2800000000000004e-06, + "loss": 2.6844, + "step": 250 + }, + { + "epoch": 0.11, + "grad_norm": 6.168716907501221, + "learning_rate": 3.613333333333334e-06, + "loss": 2.5707, + "step": 275 + }, + { + "epoch": 0.12, + "grad_norm": 6.516740322113037, + "learning_rate": 3.9466666666666664e-06, + "loss": 2.5428, + "step": 300 + }, + { + "epoch": 0.13, + "grad_norm": 2.4983417987823486, + "learning_rate": 4.28e-06, + "loss": 2.57, + "step": 325 + }, + { + "epoch": 0.14, + "grad_norm": 2.452354669570923, + "learning_rate": 4.613333333333334e-06, + "loss": 2.5167, + "step": 350 + }, + { + "epoch": 0.15, + "grad_norm": 2.4193527698516846, + "learning_rate": 4.946666666666667e-06, + "loss": 2.4788, + "step": 375 + }, + { + "epoch": 0.16, + "grad_norm": 2.7596585750579834, + "learning_rate": 5.28e-06, + "loss": 2.4994, + "step": 400 + }, + { + "epoch": 0.17, + "grad_norm": 2.1304233074188232, + "learning_rate": 5.6133333333333335e-06, + "loss": 2.4324, + "step": 425 + }, + { + "epoch": 0.18, + "grad_norm": 1.9708311557769775, + "learning_rate": 5.946666666666667e-06, + "loss": 2.4117, + "step": 450 + }, + { + "epoch": 0.19, + "grad_norm": 2.2388558387756348, + "learning_rate": 6.28e-06, + "loss": 2.3727, + "step": 475 + }, + { + "epoch": 0.2, + "grad_norm": 2.3298180103302, + "learning_rate": 6.613333333333334e-06, + "loss": 2.4732, + "step": 500 + }, + { + "epoch": 0.21, + "grad_norm": 2.0654354095458984, + "learning_rate": 6.9466666666666665e-06, + "loss": 2.4922, + "step": 525 + }, + { + "epoch": 0.22, + "grad_norm": 1.9771661758422852, + "learning_rate": 7.280000000000001e-06, + "loss": 2.3858, + "step": 550 + }, + { + "epoch": 0.23, + "grad_norm": 2.3070411682128906, + "learning_rate": 7.613333333333334e-06, + "loss": 2.3785, + "step": 575 + }, + { + "epoch": 0.24, + "grad_norm": 2.0033750534057617, + "learning_rate": 7.946666666666668e-06, + "loss": 2.369, + "step": 600 + }, + { + "epoch": 0.25, + "grad_norm": 1.960777997970581, + "learning_rate": 8.28e-06, + "loss": 2.4246, + "step": 625 + }, + { + "epoch": 0.26, + "grad_norm": 1.8081718683242798, + "learning_rate": 8.613333333333334e-06, + "loss": 2.3665, + "step": 650 + }, + { + "epoch": 0.27, + "grad_norm": 1.8548564910888672, + "learning_rate": 8.946666666666667e-06, + "loss": 2.3306, + "step": 675 + }, + { + "epoch": 0.28, + "grad_norm": 2.222216844558716, + "learning_rate": 9.28e-06, + "loss": 2.3112, + "step": 700 + }, + { + "epoch": 0.29, + "grad_norm": 1.7288415431976318, + "learning_rate": 9.613333333333333e-06, + "loss": 2.3098, + "step": 725 + }, + { + "epoch": 0.3, + "grad_norm": 2.0263378620147705, + "learning_rate": 9.946666666666667e-06, + "loss": 2.3304, + "step": 750 + }, + { + "epoch": 0.31, + "grad_norm": 1.5405957698822021, + "learning_rate": 1.0280000000000002e-05, + "loss": 2.3302, + "step": 775 + }, + { + "epoch": 0.32, + "grad_norm": 1.8870110511779785, + "learning_rate": 1.0613333333333334e-05, + "loss": 2.3241, + "step": 800 + }, + { + "epoch": 0.33, + "grad_norm": 1.8393750190734863, + "learning_rate": 1.0946666666666668e-05, + "loss": 2.3361, + "step": 825 + }, + { + "epoch": 0.34, + "grad_norm": 1.870672583580017, + "learning_rate": 1.128e-05, + "loss": 2.3178, + "step": 850 + }, + { + "epoch": 0.35, + "grad_norm": 1.7924401760101318, + "learning_rate": 1.1613333333333335e-05, + "loss": 2.2558, + "step": 875 + }, + { + "epoch": 0.36, + "grad_norm": 1.6780457496643066, + "learning_rate": 1.1946666666666667e-05, + "loss": 2.2928, + "step": 900 + }, + { + "epoch": 0.37, + "grad_norm": 1.8860801458358765, + "learning_rate": 1.2280000000000001e-05, + "loss": 2.2523, + "step": 925 + }, + { + "epoch": 0.38, + "grad_norm": 1.7925626039505005, + "learning_rate": 1.2613333333333332e-05, + "loss": 2.2885, + "step": 950 + }, + { + "epoch": 0.39, + "grad_norm": 1.916331171989441, + "learning_rate": 1.2946666666666668e-05, + "loss": 2.3369, + "step": 975 + }, + { + "epoch": 0.4, + "grad_norm": 1.6720538139343262, + "learning_rate": 1.3280000000000002e-05, + "loss": 2.2789, + "step": 1000 + }, + { + "epoch": 0.41, + "grad_norm": 1.8012834787368774, + "learning_rate": 1.3613333333333334e-05, + "loss": 2.2818, + "step": 1025 + }, + { + "epoch": 0.42, + "grad_norm": 1.82932710647583, + "learning_rate": 1.3946666666666666e-05, + "loss": 2.3173, + "step": 1050 + }, + { + "epoch": 0.43, + "grad_norm": 1.8059712648391724, + "learning_rate": 1.4280000000000002e-05, + "loss": 2.266, + "step": 1075 + }, + { + "epoch": 0.44, + "grad_norm": 1.4668495655059814, + "learning_rate": 1.4613333333333335e-05, + "loss": 2.2639, + "step": 1100 + }, + { + "epoch": 0.45, + "grad_norm": 1.7411282062530518, + "learning_rate": 1.4946666666666667e-05, + "loss": 2.2816, + "step": 1125 + }, + { + "epoch": 0.46, + "grad_norm": 1.5545501708984375, + "learning_rate": 1.528e-05, + "loss": 2.2578, + "step": 1150 + }, + { + "epoch": 0.47, + "grad_norm": 1.6876237392425537, + "learning_rate": 1.5613333333333335e-05, + "loss": 2.3282, + "step": 1175 + }, + { + "epoch": 0.48, + "grad_norm": 1.6718631982803345, + "learning_rate": 1.5946666666666668e-05, + "loss": 2.2713, + "step": 1200 + }, + { + "epoch": 0.49, + "grad_norm": 1.628440499305725, + "learning_rate": 1.628e-05, + "loss": 2.2581, + "step": 1225 + }, + { + "epoch": 0.5, + "grad_norm": 1.8468382358551025, + "learning_rate": 1.6613333333333332e-05, + "loss": 2.3533, + "step": 1250 + }, + { + "epoch": 0.51, + "grad_norm": 1.8317992687225342, + "learning_rate": 1.6946666666666665e-05, + "loss": 2.2398, + "step": 1275 + }, + { + "epoch": 0.52, + "grad_norm": 1.673600673675537, + "learning_rate": 1.728e-05, + "loss": 2.2879, + "step": 1300 + }, + { + "epoch": 0.53, + "grad_norm": 1.5316749811172485, + "learning_rate": 1.7613333333333333e-05, + "loss": 2.2502, + "step": 1325 + }, + { + "epoch": 0.54, + "grad_norm": 1.6259863376617432, + "learning_rate": 1.794666666666667e-05, + "loss": 2.2399, + "step": 1350 + }, + { + "epoch": 0.55, + "grad_norm": 1.4696708917617798, + "learning_rate": 1.828e-05, + "loss": 2.233, + "step": 1375 + }, + { + "epoch": 0.56, + "grad_norm": 1.9751217365264893, + "learning_rate": 1.8613333333333337e-05, + "loss": 2.2855, + "step": 1400 + }, + { + "epoch": 0.57, + "grad_norm": 1.541063666343689, + "learning_rate": 1.894666666666667e-05, + "loss": 2.2782, + "step": 1425 + }, + { + "epoch": 0.58, + "grad_norm": 1.525913953781128, + "learning_rate": 1.9280000000000002e-05, + "loss": 2.238, + "step": 1450 + }, + { + "epoch": 0.59, + "grad_norm": 1.6467074155807495, + "learning_rate": 1.9613333333333334e-05, + "loss": 2.2921, + "step": 1475 + }, + { + "epoch": 0.6, + "grad_norm": 1.6107714176177979, + "learning_rate": 1.9946666666666667e-05, + "loss": 2.2359, + "step": 1500 + }, + { + "epoch": 0.61, + "grad_norm": 1.6940182447433472, + "learning_rate": 2.0280000000000002e-05, + "loss": 2.2137, + "step": 1525 + }, + { + "epoch": 0.62, + "grad_norm": 1.7330864667892456, + "learning_rate": 2.0613333333333335e-05, + "loss": 2.211, + "step": 1550 + }, + { + "epoch": 0.63, + "grad_norm": 1.847303032875061, + "learning_rate": 2.0946666666666667e-05, + "loss": 2.2547, + "step": 1575 + }, + { + "epoch": 0.64, + "grad_norm": 1.3495903015136719, + "learning_rate": 2.128e-05, + "loss": 2.217, + "step": 1600 + }, + { + "epoch": 0.65, + "grad_norm": 1.4377658367156982, + "learning_rate": 2.1613333333333335e-05, + "loss": 2.2179, + "step": 1625 + }, + { + "epoch": 0.66, + "grad_norm": 1.9955512285232544, + "learning_rate": 2.1946666666666668e-05, + "loss": 2.2818, + "step": 1650 + }, + { + "epoch": 0.67, + "grad_norm": 1.3620418310165405, + "learning_rate": 2.228e-05, + "loss": 2.2304, + "step": 1675 + }, + { + "epoch": 0.68, + "grad_norm": 1.4833818674087524, + "learning_rate": 2.2613333333333333e-05, + "loss": 2.2016, + "step": 1700 + }, + { + "epoch": 0.69, + "grad_norm": 1.4675543308258057, + "learning_rate": 2.294666666666667e-05, + "loss": 2.1752, + "step": 1725 + }, + { + "epoch": 0.7, + "grad_norm": 1.5722428560256958, + "learning_rate": 2.328e-05, + "loss": 2.2749, + "step": 1750 + }, + { + "epoch": 0.71, + "grad_norm": 1.711739182472229, + "learning_rate": 2.3613333333333333e-05, + "loss": 2.1855, + "step": 1775 + }, + { + "epoch": 0.72, + "grad_norm": 1.5259678363800049, + "learning_rate": 2.394666666666667e-05, + "loss": 2.1757, + "step": 1800 + }, + { + "epoch": 0.73, + "grad_norm": 1.4229743480682373, + "learning_rate": 2.428e-05, + "loss": 2.2109, + "step": 1825 + }, + { + "epoch": 0.74, + "grad_norm": 1.5702102184295654, + "learning_rate": 2.4613333333333337e-05, + "loss": 2.1032, + "step": 1850 + }, + { + "epoch": 0.75, + "grad_norm": 1.7006195783615112, + "learning_rate": 2.494666666666667e-05, + "loss": 2.1867, + "step": 1875 + }, + { + "epoch": 0.76, + "grad_norm": 1.5293747186660767, + "learning_rate": 2.5280000000000005e-05, + "loss": 2.2308, + "step": 1900 + }, + { + "epoch": 0.77, + "grad_norm": 2.1540281772613525, + "learning_rate": 2.5613333333333334e-05, + "loss": 2.184, + "step": 1925 + }, + { + "epoch": 0.78, + "grad_norm": 1.6256271600723267, + "learning_rate": 2.594666666666667e-05, + "loss": 2.2499, + "step": 1950 + }, + { + "epoch": 0.79, + "grad_norm": 1.394745111465454, + "learning_rate": 2.628e-05, + "loss": 2.1345, + "step": 1975 + }, + { + "epoch": 0.8, + "grad_norm": 1.5799375772476196, + "learning_rate": 2.6613333333333335e-05, + "loss": 2.1641, + "step": 2000 + }, + { + "epoch": 0.81, + "grad_norm": 1.4012045860290527, + "learning_rate": 2.694666666666667e-05, + "loss": 2.169, + "step": 2025 + }, + { + "epoch": 0.82, + "grad_norm": 1.5689955949783325, + "learning_rate": 2.728e-05, + "loss": 2.1793, + "step": 2050 + }, + { + "epoch": 0.83, + "grad_norm": 1.5047041177749634, + "learning_rate": 2.7613333333333335e-05, + "loss": 2.1984, + "step": 2075 + }, + { + "epoch": 0.84, + "grad_norm": 1.399196743965149, + "learning_rate": 2.7946666666666664e-05, + "loss": 2.1996, + "step": 2100 + }, + { + "epoch": 0.85, + "grad_norm": 1.7386200428009033, + "learning_rate": 2.828e-05, + "loss": 2.1799, + "step": 2125 + }, + { + "epoch": 0.86, + "grad_norm": 1.6265965700149536, + "learning_rate": 2.8613333333333336e-05, + "loss": 2.1946, + "step": 2150 + }, + { + "epoch": 0.87, + "grad_norm": 1.3947643041610718, + "learning_rate": 2.8946666666666665e-05, + "loss": 2.206, + "step": 2175 + }, + { + "epoch": 0.88, + "grad_norm": 1.4571387767791748, + "learning_rate": 2.928e-05, + "loss": 2.1895, + "step": 2200 + }, + { + "epoch": 0.89, + "grad_norm": 1.3659110069274902, + "learning_rate": 2.9613333333333337e-05, + "loss": 2.1662, + "step": 2225 + }, + { + "epoch": 0.9, + "grad_norm": 1.6103181838989258, + "learning_rate": 2.9946666666666666e-05, + "loss": 2.2149, + "step": 2250 + }, + { + "epoch": 0.91, + "grad_norm": 1.4823683500289917, + "learning_rate": 3.028e-05, + "loss": 2.2413, + "step": 2275 + }, + { + "epoch": 0.92, + "grad_norm": 1.4936267137527466, + "learning_rate": 3.0613333333333334e-05, + "loss": 2.1555, + "step": 2300 + }, + { + "epoch": 0.93, + "grad_norm": 1.487349271774292, + "learning_rate": 3.0946666666666666e-05, + "loss": 2.194, + "step": 2325 + }, + { + "epoch": 0.94, + "grad_norm": 1.626238226890564, + "learning_rate": 3.1280000000000005e-05, + "loss": 2.1924, + "step": 2350 + }, + { + "epoch": 0.95, + "grad_norm": 1.4786629676818848, + "learning_rate": 3.161333333333333e-05, + "loss": 2.1831, + "step": 2375 + }, + { + "epoch": 0.96, + "grad_norm": 3.430049419403076, + "learning_rate": 3.194666666666667e-05, + "loss": 2.1984, + "step": 2400 + }, + { + "epoch": 0.97, + "grad_norm": 1.4396685361862183, + "learning_rate": 3.2279999999999996e-05, + "loss": 2.1214, + "step": 2425 + }, + { + "epoch": 0.98, + "grad_norm": 1.5446324348449707, + "learning_rate": 3.2613333333333335e-05, + "loss": 2.1869, + "step": 2450 + }, + { + "epoch": 0.99, + "grad_norm": 1.591899037361145, + "learning_rate": 3.294666666666667e-05, + "loss": 2.166, + "step": 2475 + }, + { + "epoch": 1.0, + "grad_norm": 1.7222836017608643, + "learning_rate": 3.328e-05, + "loss": 2.1121, + "step": 2500 + }, + { + "epoch": 1.0, + "eval_gen_len": 13.0372, + "eval_loss": 2.038074016571045, + "eval_rouge1": 50.5825, + "eval_rouge2": 24.8269, + "eval_rougeL": 46.4789, + "eval_rougeLsum": 46.4745, + "eval_runtime": 194.3582, + "eval_samples_per_second": 102.903, + "eval_steps_per_second": 1.61, + "step": 2500 + }, + { + "epoch": 1.01, + "grad_norm": 1.6080988645553589, + "learning_rate": 3.361333333333333e-05, + "loss": 2.1864, + "step": 2525 + }, + { + "epoch": 1.02, + "grad_norm": 1.4214496612548828, + "learning_rate": 3.394666666666667e-05, + "loss": 2.1115, + "step": 2550 + }, + { + "epoch": 1.03, + "grad_norm": 1.4816479682922363, + "learning_rate": 3.4280000000000004e-05, + "loss": 2.1319, + "step": 2575 + }, + { + "epoch": 1.04, + "grad_norm": 1.5511168241500854, + "learning_rate": 3.4613333333333336e-05, + "loss": 2.1639, + "step": 2600 + }, + { + "epoch": 1.05, + "grad_norm": 1.5296896696090698, + "learning_rate": 3.494666666666667e-05, + "loss": 2.1616, + "step": 2625 + }, + { + "epoch": 1.06, + "grad_norm": 1.540246605873108, + "learning_rate": 3.528e-05, + "loss": 2.1357, + "step": 2650 + }, + { + "epoch": 1.07, + "grad_norm": 1.5819247961044312, + "learning_rate": 3.561333333333334e-05, + "loss": 2.1471, + "step": 2675 + }, + { + "epoch": 1.08, + "grad_norm": 1.4258027076721191, + "learning_rate": 3.5946666666666666e-05, + "loss": 2.113, + "step": 2700 + }, + { + "epoch": 1.09, + "grad_norm": 1.356101393699646, + "learning_rate": 3.6280000000000005e-05, + "loss": 2.1027, + "step": 2725 + }, + { + "epoch": 1.1, + "grad_norm": 1.754785418510437, + "learning_rate": 3.661333333333333e-05, + "loss": 2.1101, + "step": 2750 + }, + { + "epoch": 1.11, + "grad_norm": 4.111976146697998, + "learning_rate": 3.6933333333333334e-05, + "loss": 2.1733, + "step": 2775 + }, + { + "epoch": 1.12, + "grad_norm": 1.40675687789917, + "learning_rate": 3.726666666666667e-05, + "loss": 2.1599, + "step": 2800 + }, + { + "epoch": 1.13, + "grad_norm": 1.312162160873413, + "learning_rate": 3.76e-05, + "loss": 2.092, + "step": 2825 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 1.3412392139434814, + "learning_rate": 3.793333333333334e-05, + "loss": 2.1503, + "step": 2850 + }, + { + "epoch": 1.15, + "grad_norm": 1.3180032968521118, + "learning_rate": 3.8266666666666664e-05, + "loss": 2.1323, + "step": 2875 + }, + { + "epoch": 1.16, + "grad_norm": 1.4078686237335205, + "learning_rate": 3.86e-05, + "loss": 2.1663, + "step": 2900 + }, + { + "epoch": 1.17, + "grad_norm": 1.4193419218063354, + "learning_rate": 3.8933333333333336e-05, + "loss": 2.1407, + "step": 2925 + }, + { + "epoch": 1.18, + "grad_norm": 1.5237839221954346, + "learning_rate": 3.926666666666667e-05, + "loss": 2.1884, + "step": 2950 + }, + { + "epoch": 1.19, + "grad_norm": 1.3546311855316162, + "learning_rate": 3.960000000000001e-05, + "loss": 2.0769, + "step": 2975 + }, + { + "epoch": 1.2, + "grad_norm": 1.1709140539169312, + "learning_rate": 3.993333333333333e-05, + "loss": 1.9888, + "step": 3000 + }, + { + "epoch": 1.21, + "grad_norm": 1.3492522239685059, + "learning_rate": 4.026666666666667e-05, + "loss": 2.0922, + "step": 3025 + }, + { + "epoch": 1.22, + "grad_norm": 1.5517680644989014, + "learning_rate": 4.0600000000000004e-05, + "loss": 2.1161, + "step": 3050 + }, + { + "epoch": 1.23, + "grad_norm": 1.3444249629974365, + "learning_rate": 4.093333333333334e-05, + "loss": 2.1351, + "step": 3075 + }, + { + "epoch": 1.24, + "grad_norm": 1.4553472995758057, + "learning_rate": 4.126666666666667e-05, + "loss": 2.1889, + "step": 3100 + }, + { + "epoch": 1.25, + "grad_norm": 1.4636095762252808, + "learning_rate": 4.16e-05, + "loss": 2.1161, + "step": 3125 + }, + { + "epoch": 1.26, + "grad_norm": 1.3863240480422974, + "learning_rate": 4.1933333333333334e-05, + "loss": 2.0402, + "step": 3150 + }, + { + "epoch": 1.27, + "grad_norm": 1.2597577571868896, + "learning_rate": 4.226666666666667e-05, + "loss": 2.0852, + "step": 3175 + }, + { + "epoch": 1.28, + "grad_norm": 1.3540936708450317, + "learning_rate": 4.26e-05, + "loss": 2.1439, + "step": 3200 + }, + { + "epoch": 1.29, + "grad_norm": 1.4448434114456177, + "learning_rate": 4.293333333333334e-05, + "loss": 2.1271, + "step": 3225 + }, + { + "epoch": 1.3, + "grad_norm": 1.352243185043335, + "learning_rate": 4.3266666666666664e-05, + "loss": 2.1263, + "step": 3250 + }, + { + "epoch": 1.31, + "grad_norm": 1.5224635601043701, + "learning_rate": 4.36e-05, + "loss": 2.1373, + "step": 3275 + }, + { + "epoch": 1.32, + "grad_norm": 1.51206636428833, + "learning_rate": 4.3933333333333335e-05, + "loss": 2.1521, + "step": 3300 + }, + { + "epoch": 1.33, + "grad_norm": 1.4777268171310425, + "learning_rate": 4.426666666666667e-05, + "loss": 2.0391, + "step": 3325 + }, + { + "epoch": 1.34, + "grad_norm": 1.278085708618164, + "learning_rate": 4.46e-05, + "loss": 2.1313, + "step": 3350 + }, + { + "epoch": 1.35, + "grad_norm": 1.4700874090194702, + "learning_rate": 4.493333333333333e-05, + "loss": 2.0469, + "step": 3375 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.6795507669448853, + "learning_rate": 4.526666666666667e-05, + "loss": 2.1422, + "step": 3400 + }, + { + "epoch": 1.37, + "grad_norm": 1.496156096458435, + "learning_rate": 4.5600000000000004e-05, + "loss": 2.0771, + "step": 3425 + }, + { + "epoch": 1.38, + "grad_norm": 1.4133617877960205, + "learning_rate": 4.5933333333333336e-05, + "loss": 2.0397, + "step": 3450 + }, + { + "epoch": 1.3900000000000001, + "grad_norm": 1.3693889379501343, + "learning_rate": 4.626666666666667e-05, + "loss": 2.1713, + "step": 3475 + }, + { + "epoch": 1.4, + "grad_norm": 1.5390167236328125, + "learning_rate": 4.660000000000001e-05, + "loss": 2.1348, + "step": 3500 + }, + { + "epoch": 1.41, + "grad_norm": 1.7128788232803345, + "learning_rate": 4.6933333333333333e-05, + "loss": 2.0817, + "step": 3525 + }, + { + "epoch": 1.42, + "grad_norm": 1.4634878635406494, + "learning_rate": 4.726666666666667e-05, + "loss": 2.1121, + "step": 3550 + }, + { + "epoch": 1.43, + "grad_norm": 2.623915672302246, + "learning_rate": 4.76e-05, + "loss": 2.1968, + "step": 3575 + }, + { + "epoch": 1.44, + "grad_norm": 1.4591940641403198, + "learning_rate": 4.793333333333334e-05, + "loss": 2.0789, + "step": 3600 + }, + { + "epoch": 1.45, + "grad_norm": 1.2940136194229126, + "learning_rate": 4.826666666666667e-05, + "loss": 2.147, + "step": 3625 + }, + { + "epoch": 1.46, + "grad_norm": 1.283172845840454, + "learning_rate": 4.86e-05, + "loss": 2.0765, + "step": 3650 + }, + { + "epoch": 1.47, + "grad_norm": 1.3233940601348877, + "learning_rate": 4.8933333333333335e-05, + "loss": 2.1018, + "step": 3675 + }, + { + "epoch": 1.48, + "grad_norm": 1.6195733547210693, + "learning_rate": 4.926666666666667e-05, + "loss": 2.1608, + "step": 3700 + }, + { + "epoch": 1.49, + "grad_norm": 1.5443871021270752, + "learning_rate": 4.96e-05, + "loss": 2.0899, + "step": 3725 + }, + { + "epoch": 1.5, + "grad_norm": 1.3754618167877197, + "learning_rate": 4.993333333333334e-05, + "loss": 2.1288, + "step": 3750 + }, + { + "epoch": 1.51, + "grad_norm": 1.2181512117385864, + "learning_rate": 4.997037037037037e-05, + "loss": 2.1582, + "step": 3775 + }, + { + "epoch": 1.52, + "grad_norm": 1.4862068891525269, + "learning_rate": 4.993333333333334e-05, + "loss": 2.1475, + "step": 3800 + }, + { + "epoch": 1.53, + "grad_norm": 1.3613662719726562, + "learning_rate": 4.9896296296296293e-05, + "loss": 2.1625, + "step": 3825 + }, + { + "epoch": 1.54, + "grad_norm": 1.3887194395065308, + "learning_rate": 4.985925925925926e-05, + "loss": 2.1284, + "step": 3850 + }, + { + "epoch": 1.55, + "grad_norm": 1.468736171722412, + "learning_rate": 4.982222222222222e-05, + "loss": 2.0792, + "step": 3875 + }, + { + "epoch": 1.56, + "grad_norm": 1.4086352586746216, + "learning_rate": 4.978518518518519e-05, + "loss": 2.0997, + "step": 3900 + }, + { + "epoch": 1.5699999999999998, + "grad_norm": 1.3831346035003662, + "learning_rate": 4.9748148148148146e-05, + "loss": 2.0768, + "step": 3925 + }, + { + "epoch": 1.58, + "grad_norm": 1.2189925909042358, + "learning_rate": 4.9711111111111115e-05, + "loss": 2.0809, + "step": 3950 + }, + { + "epoch": 1.5899999999999999, + "grad_norm": 1.2166587114334106, + "learning_rate": 4.9674074074074076e-05, + "loss": 2.128, + "step": 3975 + }, + { + "epoch": 1.6, + "grad_norm": 1.5292308330535889, + "learning_rate": 4.963703703703704e-05, + "loss": 2.1093, + "step": 4000 + }, + { + "epoch": 1.6099999999999999, + "grad_norm": 1.492801308631897, + "learning_rate": 4.96e-05, + "loss": 2.116, + "step": 4025 + }, + { + "epoch": 1.62, + "grad_norm": 1.4455024003982544, + "learning_rate": 4.956296296296297e-05, + "loss": 2.1292, + "step": 4050 + }, + { + "epoch": 1.63, + "grad_norm": 1.3066167831420898, + "learning_rate": 4.952592592592592e-05, + "loss": 2.1011, + "step": 4075 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 1.496256947517395, + "learning_rate": 4.949037037037037e-05, + "loss": 2.1056, + "step": 4100 + }, + { + "epoch": 1.65, + "grad_norm": 1.2948390245437622, + "learning_rate": 4.9453333333333336e-05, + "loss": 2.0569, + "step": 4125 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 1.4984357357025146, + "learning_rate": 4.94162962962963e-05, + "loss": 2.1433, + "step": 4150 + }, + { + "epoch": 1.67, + "grad_norm": 1.7721185684204102, + "learning_rate": 4.9379259259259266e-05, + "loss": 2.1098, + "step": 4175 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.3611931800842285, + "learning_rate": 4.934222222222222e-05, + "loss": 2.0652, + "step": 4200 + }, + { + "epoch": 1.69, + "grad_norm": 1.3092522621154785, + "learning_rate": 4.930518518518519e-05, + "loss": 2.0888, + "step": 4225 + }, + { + "epoch": 1.7, + "grad_norm": 1.3880172967910767, + "learning_rate": 4.926814814814815e-05, + "loss": 2.0751, + "step": 4250 + }, + { + "epoch": 1.71, + "grad_norm": 1.361522912979126, + "learning_rate": 4.923111111111111e-05, + "loss": 2.1263, + "step": 4275 + }, + { + "epoch": 1.72, + "grad_norm": 1.357974886894226, + "learning_rate": 4.9194074074074074e-05, + "loss": 2.0967, + "step": 4300 + }, + { + "epoch": 1.73, + "grad_norm": 1.3499886989593506, + "learning_rate": 4.915703703703704e-05, + "loss": 2.0426, + "step": 4325 + }, + { + "epoch": 1.74, + "grad_norm": 1.4301838874816895, + "learning_rate": 4.9120000000000004e-05, + "loss": 2.0596, + "step": 4350 + }, + { + "epoch": 1.75, + "grad_norm": 1.4266048669815063, + "learning_rate": 4.9082962962962966e-05, + "loss": 2.0574, + "step": 4375 + }, + { + "epoch": 1.76, + "grad_norm": 1.4211664199829102, + "learning_rate": 4.904592592592593e-05, + "loss": 2.0839, + "step": 4400 + }, + { + "epoch": 1.77, + "grad_norm": 1.2608351707458496, + "learning_rate": 4.9008888888888896e-05, + "loss": 2.037, + "step": 4425 + }, + { + "epoch": 1.78, + "grad_norm": 1.35879385471344, + "learning_rate": 4.897185185185185e-05, + "loss": 2.0807, + "step": 4450 + }, + { + "epoch": 1.79, + "grad_norm": 1.4060136079788208, + "learning_rate": 4.893481481481482e-05, + "loss": 2.1474, + "step": 4475 + }, + { + "epoch": 1.8, + "grad_norm": 1.3391823768615723, + "learning_rate": 4.889777777777778e-05, + "loss": 2.115, + "step": 4500 + }, + { + "epoch": 1.81, + "grad_norm": 1.6455671787261963, + "learning_rate": 4.886074074074075e-05, + "loss": 2.0943, + "step": 4525 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 1.3512789011001587, + "learning_rate": 4.88237037037037e-05, + "loss": 2.0675, + "step": 4550 + }, + { + "epoch": 1.83, + "grad_norm": 1.3271268606185913, + "learning_rate": 4.878666666666667e-05, + "loss": 2.139, + "step": 4575 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 1.3180307149887085, + "learning_rate": 4.874962962962963e-05, + "loss": 2.0442, + "step": 4600 + }, + { + "epoch": 1.85, + "grad_norm": 1.322908878326416, + "learning_rate": 4.8712592592592595e-05, + "loss": 2.1164, + "step": 4625 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 1.2810012102127075, + "learning_rate": 4.8675555555555556e-05, + "loss": 2.0138, + "step": 4650 + }, + { + "epoch": 1.87, + "grad_norm": 1.3733242750167847, + "learning_rate": 4.8638518518518525e-05, + "loss": 2.0639, + "step": 4675 + }, + { + "epoch": 1.88, + "grad_norm": 1.3254153728485107, + "learning_rate": 4.860148148148148e-05, + "loss": 2.0502, + "step": 4700 + }, + { + "epoch": 1.8900000000000001, + "grad_norm": 1.3326213359832764, + "learning_rate": 4.856444444444445e-05, + "loss": 2.0394, + "step": 4725 + }, + { + "epoch": 1.9, + "grad_norm": 1.3088740110397339, + "learning_rate": 4.852740740740741e-05, + "loss": 2.1614, + "step": 4750 + }, + { + "epoch": 1.9100000000000001, + "grad_norm": 1.4830114841461182, + "learning_rate": 4.849037037037038e-05, + "loss": 2.0836, + "step": 4775 + }, + { + "epoch": 1.92, + "grad_norm": 1.5083235502243042, + "learning_rate": 4.845333333333333e-05, + "loss": 2.084, + "step": 4800 + }, + { + "epoch": 1.9300000000000002, + "grad_norm": 1.4989155530929565, + "learning_rate": 4.84162962962963e-05, + "loss": 2.1183, + "step": 4825 + }, + { + "epoch": 1.94, + "grad_norm": 1.2256449460983276, + "learning_rate": 4.837925925925926e-05, + "loss": 2.1002, + "step": 4850 + }, + { + "epoch": 1.95, + "grad_norm": 1.2824608087539673, + "learning_rate": 4.8342222222222224e-05, + "loss": 2.0369, + "step": 4875 + }, + { + "epoch": 1.96, + "grad_norm": 1.2830954790115356, + "learning_rate": 4.8305185185185185e-05, + "loss": 2.0361, + "step": 4900 + }, + { + "epoch": 1.97, + "grad_norm": 1.4407249689102173, + "learning_rate": 4.8268148148148154e-05, + "loss": 2.0535, + "step": 4925 + }, + { + "epoch": 1.98, + "grad_norm": 1.3975064754486084, + "learning_rate": 4.8231111111111115e-05, + "loss": 2.0805, + "step": 4950 + }, + { + "epoch": 1.99, + "grad_norm": 1.2193372249603271, + "learning_rate": 4.819407407407408e-05, + "loss": 2.0748, + "step": 4975 + }, + { + "epoch": 2.0, + "grad_norm": 1.4137203693389893, + "learning_rate": 4.815703703703704e-05, + "loss": 2.0812, + "step": 5000 + }, + { + "epoch": 2.0, + "eval_gen_len": 13.1104, + "eval_loss": 1.972951054573059, + "eval_rouge1": 51.236, + "eval_rouge2": 25.4764, + "eval_rougeL": 47.1309, + "eval_rougeLsum": 47.1388, + "eval_runtime": 195.1159, + "eval_samples_per_second": 102.503, + "eval_steps_per_second": 1.604, + "step": 5000 + }, + { + "epoch": 2.01, + "grad_norm": 1.292515754699707, + "learning_rate": 4.812000000000001e-05, + "loss": 1.9586, + "step": 5025 + }, + { + "epoch": 2.02, + "grad_norm": 1.329329490661621, + "learning_rate": 4.808296296296296e-05, + "loss": 2.0068, + "step": 5050 + }, + { + "epoch": 2.03, + "grad_norm": 1.5336360931396484, + "learning_rate": 4.804592592592593e-05, + "loss": 1.986, + "step": 5075 + }, + { + "epoch": 2.04, + "grad_norm": 1.313008427619934, + "learning_rate": 4.800888888888889e-05, + "loss": 2.0093, + "step": 5100 + }, + { + "epoch": 2.05, + "grad_norm": 1.390008807182312, + "learning_rate": 4.797185185185186e-05, + "loss": 2.0478, + "step": 5125 + }, + { + "epoch": 2.06, + "grad_norm": 1.4343863725662231, + "learning_rate": 4.7934814814814815e-05, + "loss": 2.0325, + "step": 5150 + }, + { + "epoch": 2.07, + "grad_norm": 1.3213175535202026, + "learning_rate": 4.789777777777778e-05, + "loss": 2.0362, + "step": 5175 + }, + { + "epoch": 2.08, + "grad_norm": 1.2570987939834595, + "learning_rate": 4.7860740740740745e-05, + "loss": 1.9476, + "step": 5200 + }, + { + "epoch": 2.09, + "grad_norm": 1.377868413925171, + "learning_rate": 4.7823703703703706e-05, + "loss": 1.9655, + "step": 5225 + }, + { + "epoch": 2.1, + "grad_norm": 1.1855210065841675, + "learning_rate": 4.778666666666667e-05, + "loss": 2.0382, + "step": 5250 + }, + { + "epoch": 2.11, + "grad_norm": 1.3284786939620972, + "learning_rate": 4.7749629629629636e-05, + "loss": 2.0808, + "step": 5275 + }, + { + "epoch": 2.12, + "grad_norm": 1.4279937744140625, + "learning_rate": 4.77125925925926e-05, + "loss": 2.0179, + "step": 5300 + }, + { + "epoch": 2.13, + "grad_norm": 1.5963534116744995, + "learning_rate": 4.767555555555556e-05, + "loss": 1.9902, + "step": 5325 + }, + { + "epoch": 2.14, + "grad_norm": 1.3769519329071045, + "learning_rate": 4.763851851851852e-05, + "loss": 1.9842, + "step": 5350 + }, + { + "epoch": 2.15, + "grad_norm": 1.3761203289031982, + "learning_rate": 4.760148148148149e-05, + "loss": 2.0256, + "step": 5375 + }, + { + "epoch": 2.16, + "grad_norm": 1.4661396741867065, + "learning_rate": 4.7564444444444444e-05, + "loss": 2.0704, + "step": 5400 + }, + { + "epoch": 2.17, + "grad_norm": 1.301453709602356, + "learning_rate": 4.752740740740741e-05, + "loss": 1.9957, + "step": 5425 + }, + { + "epoch": 2.18, + "grad_norm": 1.3096243143081665, + "learning_rate": 4.7490370370370374e-05, + "loss": 2.0023, + "step": 5450 + }, + { + "epoch": 2.19, + "grad_norm": 1.3345898389816284, + "learning_rate": 4.7453333333333335e-05, + "loss": 2.0472, + "step": 5475 + }, + { + "epoch": 2.2, + "grad_norm": 1.1880935430526733, + "learning_rate": 4.74162962962963e-05, + "loss": 1.9565, + "step": 5500 + }, + { + "epoch": 2.21, + "grad_norm": 1.2483596801757812, + "learning_rate": 4.7379259259259265e-05, + "loss": 2.0419, + "step": 5525 + }, + { + "epoch": 2.22, + "grad_norm": 1.3003907203674316, + "learning_rate": 4.734222222222223e-05, + "loss": 2.0411, + "step": 5550 + }, + { + "epoch": 2.23, + "grad_norm": 1.3677852153778076, + "learning_rate": 4.730518518518519e-05, + "loss": 2.0454, + "step": 5575 + }, + { + "epoch": 2.24, + "grad_norm": 1.3494675159454346, + "learning_rate": 4.726814814814815e-05, + "loss": 1.9652, + "step": 5600 + }, + { + "epoch": 2.25, + "grad_norm": 1.2623995542526245, + "learning_rate": 4.723111111111112e-05, + "loss": 2.0516, + "step": 5625 + }, + { + "epoch": 2.26, + "grad_norm": 1.4136385917663574, + "learning_rate": 4.719407407407407e-05, + "loss": 1.9735, + "step": 5650 + }, + { + "epoch": 2.27, + "grad_norm": 1.2677298784255981, + "learning_rate": 4.715703703703704e-05, + "loss": 1.9916, + "step": 5675 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 1.422560453414917, + "learning_rate": 4.712e-05, + "loss": 2.0216, + "step": 5700 + }, + { + "epoch": 2.29, + "grad_norm": 1.4796884059906006, + "learning_rate": 4.7082962962962964e-05, + "loss": 2.0134, + "step": 5725 + }, + { + "epoch": 2.3, + "grad_norm": 1.347684383392334, + "learning_rate": 4.7045925925925926e-05, + "loss": 2.0073, + "step": 5750 + }, + { + "epoch": 2.31, + "grad_norm": 1.238133192062378, + "learning_rate": 4.7008888888888894e-05, + "loss": 2.0002, + "step": 5775 + }, + { + "epoch": 2.32, + "grad_norm": 1.271411657333374, + "learning_rate": 4.6971851851851856e-05, + "loss": 1.9866, + "step": 5800 + }, + { + "epoch": 2.33, + "grad_norm": 1.4188841581344604, + "learning_rate": 4.693481481481482e-05, + "loss": 1.96, + "step": 5825 + }, + { + "epoch": 2.34, + "grad_norm": 1.3498203754425049, + "learning_rate": 4.689777777777778e-05, + "loss": 2.0338, + "step": 5850 + }, + { + "epoch": 2.35, + "grad_norm": 1.224448800086975, + "learning_rate": 4.686074074074074e-05, + "loss": 2.0572, + "step": 5875 + }, + { + "epoch": 2.36, + "grad_norm": 1.4224382638931274, + "learning_rate": 4.682370370370371e-05, + "loss": 2.0333, + "step": 5900 + }, + { + "epoch": 2.37, + "grad_norm": 1.553787350654602, + "learning_rate": 4.678666666666667e-05, + "loss": 1.9975, + "step": 5925 + }, + { + "epoch": 2.38, + "grad_norm": 1.377954125404358, + "learning_rate": 4.674962962962963e-05, + "loss": 2.0317, + "step": 5950 + }, + { + "epoch": 2.39, + "grad_norm": 1.4469361305236816, + "learning_rate": 4.6712592592592594e-05, + "loss": 2.0382, + "step": 5975 + }, + { + "epoch": 2.4, + "grad_norm": 1.3143268823623657, + "learning_rate": 4.6675555555555555e-05, + "loss": 2.0107, + "step": 6000 + }, + { + "epoch": 2.41, + "grad_norm": 1.322283148765564, + "learning_rate": 4.6638518518518523e-05, + "loss": 1.9968, + "step": 6025 + }, + { + "epoch": 2.42, + "grad_norm": 1.5277537107467651, + "learning_rate": 4.6601481481481485e-05, + "loss": 2.0148, + "step": 6050 + }, + { + "epoch": 2.43, + "grad_norm": 1.3555701971054077, + "learning_rate": 4.6564444444444447e-05, + "loss": 1.9547, + "step": 6075 + }, + { + "epoch": 2.44, + "grad_norm": 1.3030173778533936, + "learning_rate": 4.652740740740741e-05, + "loss": 2.0186, + "step": 6100 + }, + { + "epoch": 2.45, + "grad_norm": 1.1996219158172607, + "learning_rate": 4.6491851851851854e-05, + "loss": 1.9938, + "step": 6125 + }, + { + "epoch": 2.46, + "grad_norm": 1.2713743448257446, + "learning_rate": 4.6454814814814815e-05, + "loss": 2.0135, + "step": 6150 + }, + { + "epoch": 2.4699999999999998, + "grad_norm": 1.4466146230697632, + "learning_rate": 4.6417777777777784e-05, + "loss": 2.0027, + "step": 6175 + }, + { + "epoch": 2.48, + "grad_norm": 1.3012115955352783, + "learning_rate": 4.638074074074074e-05, + "loss": 2.0462, + "step": 6200 + }, + { + "epoch": 2.49, + "grad_norm": 1.4161940813064575, + "learning_rate": 4.634370370370371e-05, + "loss": 2.0416, + "step": 6225 + }, + { + "epoch": 2.5, + "grad_norm": 1.382034420967102, + "learning_rate": 4.630666666666667e-05, + "loss": 1.9795, + "step": 6250 + }, + { + "epoch": 2.51, + "grad_norm": 1.4618980884552002, + "learning_rate": 4.626962962962963e-05, + "loss": 2.0163, + "step": 6275 + }, + { + "epoch": 2.52, + "grad_norm": 1.304030179977417, + "learning_rate": 4.623259259259259e-05, + "loss": 1.9821, + "step": 6300 + }, + { + "epoch": 2.5300000000000002, + "grad_norm": 1.5101556777954102, + "learning_rate": 4.619555555555556e-05, + "loss": 1.9863, + "step": 6325 + }, + { + "epoch": 2.54, + "grad_norm": 1.3217978477478027, + "learning_rate": 4.615851851851852e-05, + "loss": 2.0009, + "step": 6350 + }, + { + "epoch": 2.55, + "grad_norm": 1.367722749710083, + "learning_rate": 4.612148148148148e-05, + "loss": 2.0459, + "step": 6375 + }, + { + "epoch": 2.56, + "grad_norm": 1.5014063119888306, + "learning_rate": 4.6084444444444444e-05, + "loss": 2.0129, + "step": 6400 + }, + { + "epoch": 2.57, + "grad_norm": 1.4557509422302246, + "learning_rate": 4.604740740740741e-05, + "loss": 1.9869, + "step": 6425 + }, + { + "epoch": 2.58, + "grad_norm": 1.3464524745941162, + "learning_rate": 4.601037037037037e-05, + "loss": 2.0133, + "step": 6450 + }, + { + "epoch": 2.59, + "grad_norm": 1.41274893283844, + "learning_rate": 4.5973333333333336e-05, + "loss": 1.996, + "step": 6475 + }, + { + "epoch": 2.6, + "grad_norm": 1.3834503889083862, + "learning_rate": 4.59362962962963e-05, + "loss": 2.0057, + "step": 6500 + }, + { + "epoch": 2.61, + "grad_norm": 1.4154237508773804, + "learning_rate": 4.5899259259259266e-05, + "loss": 2.0168, + "step": 6525 + }, + { + "epoch": 2.62, + "grad_norm": 1.2246060371398926, + "learning_rate": 4.586222222222222e-05, + "loss": 1.9689, + "step": 6550 + }, + { + "epoch": 2.63, + "grad_norm": 1.5142685174942017, + "learning_rate": 4.582518518518519e-05, + "loss": 2.0025, + "step": 6575 + }, + { + "epoch": 2.64, + "grad_norm": 1.6933802366256714, + "learning_rate": 4.578814814814815e-05, + "loss": 2.0064, + "step": 6600 + }, + { + "epoch": 2.65, + "grad_norm": 1.3439992666244507, + "learning_rate": 4.575111111111111e-05, + "loss": 1.9638, + "step": 6625 + }, + { + "epoch": 2.66, + "grad_norm": 1.4206079244613647, + "learning_rate": 4.5714074074074074e-05, + "loss": 1.9888, + "step": 6650 + }, + { + "epoch": 2.67, + "grad_norm": 1.3695896863937378, + "learning_rate": 4.567703703703704e-05, + "loss": 1.9735, + "step": 6675 + }, + { + "epoch": 2.68, + "grad_norm": 1.2460039854049683, + "learning_rate": 4.564e-05, + "loss": 2.0707, + "step": 6700 + }, + { + "epoch": 2.69, + "grad_norm": 1.2507898807525635, + "learning_rate": 4.5602962962962965e-05, + "loss": 2.0131, + "step": 6725 + }, + { + "epoch": 2.7, + "grad_norm": 1.2939226627349854, + "learning_rate": 4.5565925925925927e-05, + "loss": 1.9902, + "step": 6750 + }, + { + "epoch": 2.71, + "grad_norm": 1.4121019840240479, + "learning_rate": 4.5528888888888895e-05, + "loss": 1.9906, + "step": 6775 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 1.1183571815490723, + "learning_rate": 4.549185185185185e-05, + "loss": 2.0259, + "step": 6800 + }, + { + "epoch": 2.73, + "grad_norm": 1.4369187355041504, + "learning_rate": 4.545481481481482e-05, + "loss": 1.963, + "step": 6825 + }, + { + "epoch": 2.74, + "grad_norm": 1.1758921146392822, + "learning_rate": 4.541777777777778e-05, + "loss": 1.9743, + "step": 6850 + }, + { + "epoch": 2.75, + "grad_norm": 1.3526884317398071, + "learning_rate": 4.538074074074074e-05, + "loss": 2.0403, + "step": 6875 + }, + { + "epoch": 2.76, + "grad_norm": 1.2723559141159058, + "learning_rate": 4.53437037037037e-05, + "loss": 1.958, + "step": 6900 + }, + { + "epoch": 2.77, + "grad_norm": 1.4061169624328613, + "learning_rate": 4.530666666666667e-05, + "loss": 2.0213, + "step": 6925 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 1.292668104171753, + "learning_rate": 4.526962962962963e-05, + "loss": 2.0301, + "step": 6950 + }, + { + "epoch": 2.79, + "grad_norm": 1.2787526845932007, + "learning_rate": 4.5232592592592594e-05, + "loss": 1.9822, + "step": 6975 + }, + { + "epoch": 2.8, + "grad_norm": 1.2812767028808594, + "learning_rate": 4.5195555555555556e-05, + "loss": 1.9919, + "step": 7000 + }, + { + "epoch": 2.81, + "grad_norm": 1.2720341682434082, + "learning_rate": 4.5158518518518524e-05, + "loss": 2.0142, + "step": 7025 + }, + { + "epoch": 2.82, + "grad_norm": 1.3361945152282715, + "learning_rate": 4.512148148148148e-05, + "loss": 2.0006, + "step": 7050 + }, + { + "epoch": 2.83, + "grad_norm": 1.1397879123687744, + "learning_rate": 4.508444444444445e-05, + "loss": 1.9674, + "step": 7075 + }, + { + "epoch": 2.84, + "grad_norm": 1.3024892807006836, + "learning_rate": 4.504740740740741e-05, + "loss": 1.969, + "step": 7100 + }, + { + "epoch": 2.85, + "grad_norm": 1.2149879932403564, + "learning_rate": 4.501037037037038e-05, + "loss": 1.9873, + "step": 7125 + }, + { + "epoch": 2.86, + "grad_norm": 1.2626320123672485, + "learning_rate": 4.497333333333333e-05, + "loss": 1.9552, + "step": 7150 + }, + { + "epoch": 2.87, + "grad_norm": 1.3262602090835571, + "learning_rate": 4.49362962962963e-05, + "loss": 1.9485, + "step": 7175 + }, + { + "epoch": 2.88, + "grad_norm": 1.4364970922470093, + "learning_rate": 4.489925925925926e-05, + "loss": 2.0553, + "step": 7200 + }, + { + "epoch": 2.89, + "grad_norm": 1.2992265224456787, + "learning_rate": 4.486222222222222e-05, + "loss": 2.0324, + "step": 7225 + }, + { + "epoch": 2.9, + "grad_norm": 1.246061086654663, + "learning_rate": 4.4825185185185185e-05, + "loss": 1.9706, + "step": 7250 + }, + { + "epoch": 2.91, + "grad_norm": 1.625594139099121, + "learning_rate": 4.478814814814815e-05, + "loss": 1.9419, + "step": 7275 + }, + { + "epoch": 2.92, + "grad_norm": 1.2219802141189575, + "learning_rate": 4.4751111111111115e-05, + "loss": 2.0337, + "step": 7300 + }, + { + "epoch": 2.93, + "grad_norm": 1.436552882194519, + "learning_rate": 4.4714074074074076e-05, + "loss": 1.9975, + "step": 7325 + }, + { + "epoch": 2.94, + "grad_norm": 1.3120185136795044, + "learning_rate": 4.467703703703704e-05, + "loss": 2.0306, + "step": 7350 + }, + { + "epoch": 2.95, + "grad_norm": 1.2584844827651978, + "learning_rate": 4.4640000000000006e-05, + "loss": 2.0335, + "step": 7375 + }, + { + "epoch": 2.96, + "grad_norm": 1.4038913249969482, + "learning_rate": 4.460296296296296e-05, + "loss": 1.9072, + "step": 7400 + }, + { + "epoch": 2.9699999999999998, + "grad_norm": 1.2124029397964478, + "learning_rate": 4.456592592592593e-05, + "loss": 2.0269, + "step": 7425 + }, + { + "epoch": 2.98, + "grad_norm": 1.45619797706604, + "learning_rate": 4.452888888888889e-05, + "loss": 1.9908, + "step": 7450 + }, + { + "epoch": 2.99, + "grad_norm": 1.2446014881134033, + "learning_rate": 4.449185185185185e-05, + "loss": 2.0035, + "step": 7475 + }, + { + "epoch": 3.0, + "grad_norm": 1.4367069005966187, + "learning_rate": 4.4454814814814814e-05, + "loss": 2.0286, + "step": 7500 + }, + { + "epoch": 3.0, + "eval_gen_len": 13.1299, + "eval_loss": 1.9462145566940308, + "eval_rouge1": 51.2672, + "eval_rouge2": 25.5438, + "eval_rougeL": 47.1691, + "eval_rougeLsum": 47.1653, + "eval_runtime": 194.6381, + "eval_samples_per_second": 102.755, + "eval_steps_per_second": 1.608, + "step": 7500 + }, + { + "epoch": 3.01, + "grad_norm": 1.4803961515426636, + "learning_rate": 4.441777777777778e-05, + "loss": 1.8803, + "step": 7525 + }, + { + "epoch": 3.02, + "grad_norm": 1.3261098861694336, + "learning_rate": 4.4380740740740744e-05, + "loss": 1.9219, + "step": 7550 + }, + { + "epoch": 3.03, + "grad_norm": 1.2701349258422852, + "learning_rate": 4.4343703703703706e-05, + "loss": 1.9066, + "step": 7575 + }, + { + "epoch": 3.04, + "grad_norm": 1.4366850852966309, + "learning_rate": 4.430666666666667e-05, + "loss": 1.9186, + "step": 7600 + }, + { + "epoch": 3.05, + "grad_norm": 1.304621934890747, + "learning_rate": 4.4269629629629635e-05, + "loss": 1.9499, + "step": 7625 + }, + { + "epoch": 3.06, + "grad_norm": 1.4453222751617432, + "learning_rate": 4.423259259259259e-05, + "loss": 1.8863, + "step": 7650 + }, + { + "epoch": 3.07, + "grad_norm": 1.1800676584243774, + "learning_rate": 4.419555555555556e-05, + "loss": 1.8879, + "step": 7675 + }, + { + "epoch": 3.08, + "grad_norm": 1.5625019073486328, + "learning_rate": 4.415851851851852e-05, + "loss": 1.8945, + "step": 7700 + }, + { + "epoch": 3.09, + "grad_norm": 1.702916145324707, + "learning_rate": 4.412148148148149e-05, + "loss": 1.9361, + "step": 7725 + }, + { + "epoch": 3.1, + "grad_norm": 1.641830325126648, + "learning_rate": 4.408444444444444e-05, + "loss": 1.9031, + "step": 7750 + }, + { + "epoch": 3.11, + "grad_norm": 1.6276681423187256, + "learning_rate": 4.404740740740741e-05, + "loss": 1.9207, + "step": 7775 + }, + { + "epoch": 3.12, + "grad_norm": 1.4015640020370483, + "learning_rate": 4.401037037037037e-05, + "loss": 1.91, + "step": 7800 + }, + { + "epoch": 3.13, + "grad_norm": 1.1643470525741577, + "learning_rate": 4.3973333333333335e-05, + "loss": 1.94, + "step": 7825 + }, + { + "epoch": 3.14, + "grad_norm": 1.436637282371521, + "learning_rate": 4.393777777777778e-05, + "loss": 1.9152, + "step": 7850 + }, + { + "epoch": 3.15, + "grad_norm": 1.2453664541244507, + "learning_rate": 4.390074074074074e-05, + "loss": 1.8912, + "step": 7875 + }, + { + "epoch": 3.16, + "grad_norm": 1.507677674293518, + "learning_rate": 4.386370370370371e-05, + "loss": 1.9148, + "step": 7900 + }, + { + "epoch": 3.17, + "grad_norm": 1.3336275815963745, + "learning_rate": 4.382666666666667e-05, + "loss": 1.9583, + "step": 7925 + }, + { + "epoch": 3.18, + "grad_norm": 1.5578241348266602, + "learning_rate": 4.378962962962963e-05, + "loss": 1.9273, + "step": 7950 + }, + { + "epoch": 3.19, + "grad_norm": 1.299078345298767, + "learning_rate": 4.3752592592592595e-05, + "loss": 1.9137, + "step": 7975 + }, + { + "epoch": 3.2, + "grad_norm": 1.4711804389953613, + "learning_rate": 4.3715555555555556e-05, + "loss": 1.8916, + "step": 8000 + }, + { + "epoch": 3.21, + "grad_norm": 1.258434534072876, + "learning_rate": 4.367851851851852e-05, + "loss": 1.8674, + "step": 8025 + }, + { + "epoch": 3.22, + "grad_norm": 1.4030392169952393, + "learning_rate": 4.3641481481481486e-05, + "loss": 1.9207, + "step": 8050 + }, + { + "epoch": 3.23, + "grad_norm": 1.3217052221298218, + "learning_rate": 4.360444444444445e-05, + "loss": 1.9356, + "step": 8075 + }, + { + "epoch": 3.24, + "grad_norm": 1.4329230785369873, + "learning_rate": 4.356740740740741e-05, + "loss": 1.9849, + "step": 8100 + }, + { + "epoch": 3.25, + "grad_norm": 1.4400358200073242, + "learning_rate": 4.353037037037037e-05, + "loss": 1.966, + "step": 8125 + }, + { + "epoch": 3.26, + "grad_norm": 1.449614405632019, + "learning_rate": 4.349333333333334e-05, + "loss": 1.9351, + "step": 8150 + }, + { + "epoch": 3.27, + "grad_norm": 1.3323086500167847, + "learning_rate": 4.34562962962963e-05, + "loss": 1.9247, + "step": 8175 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 1.267240047454834, + "learning_rate": 4.341925925925926e-05, + "loss": 1.902, + "step": 8200 + }, + { + "epoch": 3.29, + "grad_norm": 1.276930332183838, + "learning_rate": 4.3382222222222224e-05, + "loss": 1.938, + "step": 8225 + }, + { + "epoch": 3.3, + "grad_norm": 1.3871114253997803, + "learning_rate": 4.3345185185185186e-05, + "loss": 1.8964, + "step": 8250 + }, + { + "epoch": 3.31, + "grad_norm": 1.4275935888290405, + "learning_rate": 4.330814814814815e-05, + "loss": 1.8912, + "step": 8275 + }, + { + "epoch": 3.32, + "grad_norm": 1.4508986473083496, + "learning_rate": 4.3271111111111115e-05, + "loss": 1.959, + "step": 8300 + }, + { + "epoch": 3.33, + "grad_norm": 1.4686108827590942, + "learning_rate": 4.323407407407408e-05, + "loss": 1.9674, + "step": 8325 + }, + { + "epoch": 3.34, + "grad_norm": 1.2445565462112427, + "learning_rate": 4.319703703703704e-05, + "loss": 1.9429, + "step": 8350 + }, + { + "epoch": 3.35, + "grad_norm": 1.3624173402786255, + "learning_rate": 4.316e-05, + "loss": 1.943, + "step": 8375 + }, + { + "epoch": 3.36, + "grad_norm": 1.9012771844863892, + "learning_rate": 4.312296296296296e-05, + "loss": 1.9657, + "step": 8400 + }, + { + "epoch": 3.37, + "grad_norm": 1.4263502359390259, + "learning_rate": 4.308592592592593e-05, + "loss": 1.9606, + "step": 8425 + }, + { + "epoch": 3.38, + "grad_norm": 1.3634259700775146, + "learning_rate": 4.304888888888889e-05, + "loss": 1.8931, + "step": 8450 + }, + { + "epoch": 3.39, + "grad_norm": 1.498611569404602, + "learning_rate": 4.301185185185185e-05, + "loss": 1.9311, + "step": 8475 + }, + { + "epoch": 3.4, + "grad_norm": 1.5958226919174194, + "learning_rate": 4.2974814814814815e-05, + "loss": 1.9499, + "step": 8500 + }, + { + "epoch": 3.41, + "grad_norm": 1.4740561246871948, + "learning_rate": 4.293777777777778e-05, + "loss": 1.9545, + "step": 8525 + }, + { + "epoch": 3.42, + "grad_norm": 1.3004882335662842, + "learning_rate": 4.2900740740740745e-05, + "loss": 1.9883, + "step": 8550 + }, + { + "epoch": 3.43, + "grad_norm": 1.7882790565490723, + "learning_rate": 4.2863703703703706e-05, + "loss": 1.9759, + "step": 8575 + }, + { + "epoch": 3.44, + "grad_norm": 1.3567942380905151, + "learning_rate": 4.282666666666667e-05, + "loss": 1.9219, + "step": 8600 + }, + { + "epoch": 3.45, + "grad_norm": 1.2835100889205933, + "learning_rate": 4.278962962962963e-05, + "loss": 1.9568, + "step": 8625 + }, + { + "epoch": 3.46, + "grad_norm": 1.3128582239151, + "learning_rate": 4.275259259259259e-05, + "loss": 1.9076, + "step": 8650 + }, + { + "epoch": 3.4699999999999998, + "grad_norm": 1.4423998594284058, + "learning_rate": 4.271555555555556e-05, + "loss": 1.953, + "step": 8675 + }, + { + "epoch": 3.48, + "grad_norm": 1.6110066175460815, + "learning_rate": 4.267851851851852e-05, + "loss": 1.9314, + "step": 8700 + }, + { + "epoch": 3.49, + "grad_norm": 1.235214114189148, + "learning_rate": 4.264148148148148e-05, + "loss": 1.8981, + "step": 8725 + }, + { + "epoch": 3.5, + "grad_norm": 1.223358392715454, + "learning_rate": 4.2604444444444444e-05, + "loss": 1.9607, + "step": 8750 + }, + { + "epoch": 3.51, + "grad_norm": 1.7454544305801392, + "learning_rate": 4.256740740740741e-05, + "loss": 1.9374, + "step": 8775 + }, + { + "epoch": 3.52, + "grad_norm": 1.6236810684204102, + "learning_rate": 4.253185185185186e-05, + "loss": 1.9503, + "step": 8800 + }, + { + "epoch": 3.5300000000000002, + "grad_norm": 1.454771637916565, + "learning_rate": 4.249481481481481e-05, + "loss": 1.9947, + "step": 8825 + }, + { + "epoch": 3.54, + "grad_norm": 1.4015847444534302, + "learning_rate": 4.245777777777778e-05, + "loss": 1.9912, + "step": 8850 + }, + { + "epoch": 3.55, + "grad_norm": 1.4323807954788208, + "learning_rate": 4.242074074074074e-05, + "loss": 1.9142, + "step": 8875 + }, + { + "epoch": 3.56, + "grad_norm": 1.426339864730835, + "learning_rate": 4.2383703703703704e-05, + "loss": 1.9208, + "step": 8900 + }, + { + "epoch": 3.57, + "grad_norm": 1.2626959085464478, + "learning_rate": 4.2346666666666666e-05, + "loss": 1.9237, + "step": 8925 + }, + { + "epoch": 3.58, + "grad_norm": 1.3355377912521362, + "learning_rate": 4.2309629629629634e-05, + "loss": 1.9616, + "step": 8950 + }, + { + "epoch": 3.59, + "grad_norm": 1.3246771097183228, + "learning_rate": 4.2272592592592595e-05, + "loss": 1.9595, + "step": 8975 + }, + { + "epoch": 3.6, + "grad_norm": 1.3291839361190796, + "learning_rate": 4.223555555555556e-05, + "loss": 1.9718, + "step": 9000 + }, + { + "epoch": 3.61, + "grad_norm": 1.4041240215301514, + "learning_rate": 4.219851851851852e-05, + "loss": 1.9147, + "step": 9025 + }, + { + "epoch": 3.62, + "grad_norm": 1.5576542615890503, + "learning_rate": 4.216148148148149e-05, + "loss": 1.9467, + "step": 9050 + }, + { + "epoch": 3.63, + "grad_norm": 1.355878472328186, + "learning_rate": 4.212444444444444e-05, + "loss": 1.932, + "step": 9075 + }, + { + "epoch": 3.64, + "grad_norm": 1.3441921472549438, + "learning_rate": 4.208740740740741e-05, + "loss": 1.9606, + "step": 9100 + }, + { + "epoch": 3.65, + "grad_norm": 1.5347744226455688, + "learning_rate": 4.205037037037037e-05, + "loss": 1.9081, + "step": 9125 + }, + { + "epoch": 3.66, + "grad_norm": 1.4844331741333008, + "learning_rate": 4.201333333333334e-05, + "loss": 1.847, + "step": 9150 + }, + { + "epoch": 3.67, + "grad_norm": 1.4049714803695679, + "learning_rate": 4.1976296296296295e-05, + "loss": 1.9042, + "step": 9175 + }, + { + "epoch": 3.68, + "grad_norm": 1.4935232400894165, + "learning_rate": 4.193925925925926e-05, + "loss": 1.9575, + "step": 9200 + }, + { + "epoch": 3.69, + "grad_norm": 1.6459118127822876, + "learning_rate": 4.1902222222222225e-05, + "loss": 1.9779, + "step": 9225 + }, + { + "epoch": 3.7, + "grad_norm": 1.6347599029541016, + "learning_rate": 4.1865185185185186e-05, + "loss": 1.9265, + "step": 9250 + }, + { + "epoch": 3.71, + "grad_norm": 1.5071406364440918, + "learning_rate": 4.182814814814815e-05, + "loss": 1.9869, + "step": 9275 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 1.521414041519165, + "learning_rate": 4.1791111111111116e-05, + "loss": 1.9607, + "step": 9300 + }, + { + "epoch": 3.73, + "grad_norm": 1.6524264812469482, + "learning_rate": 4.175407407407408e-05, + "loss": 1.9068, + "step": 9325 + }, + { + "epoch": 3.74, + "grad_norm": 1.5275657176971436, + "learning_rate": 4.171703703703704e-05, + "loss": 1.9225, + "step": 9350 + }, + { + "epoch": 3.75, + "grad_norm": 1.2827988862991333, + "learning_rate": 4.168e-05, + "loss": 1.9199, + "step": 9375 + }, + { + "epoch": 3.76, + "grad_norm": 1.5955214500427246, + "learning_rate": 4.164296296296297e-05, + "loss": 1.8854, + "step": 9400 + }, + { + "epoch": 3.77, + "grad_norm": 1.4321446418762207, + "learning_rate": 4.1605925925925924e-05, + "loss": 1.9488, + "step": 9425 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 1.3754669427871704, + "learning_rate": 4.156888888888889e-05, + "loss": 1.9209, + "step": 9450 + }, + { + "epoch": 3.79, + "grad_norm": 1.1604045629501343, + "learning_rate": 4.1531851851851854e-05, + "loss": 1.9515, + "step": 9475 + }, + { + "epoch": 3.8, + "grad_norm": 1.3413041830062866, + "learning_rate": 4.1494814814814815e-05, + "loss": 1.9457, + "step": 9500 + }, + { + "epoch": 3.81, + "grad_norm": 1.349363088607788, + "learning_rate": 4.145777777777778e-05, + "loss": 1.9789, + "step": 9525 + }, + { + "epoch": 3.82, + "grad_norm": 1.3260554075241089, + "learning_rate": 4.1420740740740745e-05, + "loss": 1.8999, + "step": 9550 + }, + { + "epoch": 3.83, + "grad_norm": 1.289332628250122, + "learning_rate": 4.138370370370371e-05, + "loss": 1.9856, + "step": 9575 + }, + { + "epoch": 3.84, + "grad_norm": 1.4211158752441406, + "learning_rate": 4.134666666666667e-05, + "loss": 1.919, + "step": 9600 + }, + { + "epoch": 3.85, + "grad_norm": 1.4039026498794556, + "learning_rate": 4.130962962962963e-05, + "loss": 1.9073, + "step": 9625 + }, + { + "epoch": 3.86, + "grad_norm": 1.2093830108642578, + "learning_rate": 4.12725925925926e-05, + "loss": 1.9501, + "step": 9650 + }, + { + "epoch": 3.87, + "grad_norm": 1.323862075805664, + "learning_rate": 4.123555555555555e-05, + "loss": 1.8516, + "step": 9675 + }, + { + "epoch": 3.88, + "grad_norm": 1.4265086650848389, + "learning_rate": 4.119851851851852e-05, + "loss": 1.9498, + "step": 9700 + }, + { + "epoch": 3.89, + "grad_norm": 1.4013090133666992, + "learning_rate": 4.116148148148148e-05, + "loss": 1.9302, + "step": 9725 + }, + { + "epoch": 3.9, + "grad_norm": 1.44480562210083, + "learning_rate": 4.112444444444445e-05, + "loss": 1.9059, + "step": 9750 + }, + { + "epoch": 3.91, + "grad_norm": 1.6620755195617676, + "learning_rate": 4.1087407407407406e-05, + "loss": 1.9037, + "step": 9775 + }, + { + "epoch": 3.92, + "grad_norm": 1.4004356861114502, + "learning_rate": 4.1050370370370374e-05, + "loss": 1.8987, + "step": 9800 + }, + { + "epoch": 3.93, + "grad_norm": 1.536922812461853, + "learning_rate": 4.1013333333333336e-05, + "loss": 2.0043, + "step": 9825 + }, + { + "epoch": 3.94, + "grad_norm": 1.4472166299819946, + "learning_rate": 4.09762962962963e-05, + "loss": 1.9224, + "step": 9850 + }, + { + "epoch": 3.95, + "grad_norm": 1.3354836702346802, + "learning_rate": 4.093925925925926e-05, + "loss": 1.934, + "step": 9875 + }, + { + "epoch": 3.96, + "grad_norm": 1.3124749660491943, + "learning_rate": 4.090222222222223e-05, + "loss": 1.9879, + "step": 9900 + }, + { + "epoch": 3.9699999999999998, + "grad_norm": 1.283579707145691, + "learning_rate": 4.086518518518519e-05, + "loss": 1.8905, + "step": 9925 + }, + { + "epoch": 3.98, + "grad_norm": 1.672229290008545, + "learning_rate": 4.082814814814815e-05, + "loss": 1.8554, + "step": 9950 + }, + { + "epoch": 3.99, + "grad_norm": 1.5786038637161255, + "learning_rate": 4.079111111111111e-05, + "loss": 1.8784, + "step": 9975 + }, + { + "epoch": 4.0, + "grad_norm": 1.361163854598999, + "learning_rate": 4.075407407407408e-05, + "loss": 1.9669, + "step": 10000 + }, + { + "epoch": 4.0, + "eval_gen_len": 13.1328, + "eval_loss": 1.9336023330688477, + "eval_rouge1": 51.5525, + "eval_rouge2": 25.7439, + "eval_rougeL": 47.3861, + "eval_rougeLsum": 47.3849, + "eval_runtime": 193.2674, + "eval_samples_per_second": 103.484, + "eval_steps_per_second": 1.62, + "step": 10000 + }, + { + "epoch": 4.01, + "grad_norm": 1.6996824741363525, + "learning_rate": 4.0717037037037035e-05, + "loss": 1.8428, + "step": 10025 + }, + { + "epoch": 4.02, + "grad_norm": 1.6396452188491821, + "learning_rate": 4.0680000000000004e-05, + "loss": 1.9144, + "step": 10050 + }, + { + "epoch": 4.03, + "grad_norm": 1.508521556854248, + "learning_rate": 4.0642962962962965e-05, + "loss": 1.8983, + "step": 10075 + }, + { + "epoch": 4.04, + "grad_norm": 1.3133175373077393, + "learning_rate": 4.0605925925925933e-05, + "loss": 1.8159, + "step": 10100 + }, + { + "epoch": 4.05, + "grad_norm": 1.4404352903366089, + "learning_rate": 4.056888888888889e-05, + "loss": 1.8468, + "step": 10125 + }, + { + "epoch": 4.06, + "grad_norm": 1.410372257232666, + "learning_rate": 4.0531851851851857e-05, + "loss": 1.9628, + "step": 10150 + }, + { + "epoch": 4.07, + "grad_norm": 1.509523630142212, + "learning_rate": 4.049481481481482e-05, + "loss": 1.8723, + "step": 10175 + }, + { + "epoch": 4.08, + "grad_norm": 1.488282561302185, + "learning_rate": 4.045777777777778e-05, + "loss": 1.8414, + "step": 10200 + }, + { + "epoch": 4.09, + "grad_norm": 1.3858217000961304, + "learning_rate": 4.042074074074074e-05, + "loss": 1.8241, + "step": 10225 + }, + { + "epoch": 4.1, + "grad_norm": 1.5078978538513184, + "learning_rate": 4.038370370370371e-05, + "loss": 1.9011, + "step": 10250 + }, + { + "epoch": 4.11, + "grad_norm": 1.340116262435913, + "learning_rate": 4.0346666666666664e-05, + "loss": 1.8832, + "step": 10275 + }, + { + "epoch": 4.12, + "grad_norm": 1.4269579648971558, + "learning_rate": 4.030962962962963e-05, + "loss": 1.8314, + "step": 10300 + }, + { + "epoch": 4.13, + "grad_norm": 1.3984671831130981, + "learning_rate": 4.0272592592592594e-05, + "loss": 1.8422, + "step": 10325 + }, + { + "epoch": 4.14, + "grad_norm": 1.5651813745498657, + "learning_rate": 4.023555555555556e-05, + "loss": 1.8475, + "step": 10350 + }, + { + "epoch": 4.15, + "grad_norm": 1.5041767358779907, + "learning_rate": 4.019851851851852e-05, + "loss": 1.8392, + "step": 10375 + }, + { + "epoch": 4.16, + "grad_norm": 1.6004717350006104, + "learning_rate": 4.0161481481481486e-05, + "loss": 1.8364, + "step": 10400 + }, + { + "epoch": 4.17, + "grad_norm": 1.3751362562179565, + "learning_rate": 4.012444444444445e-05, + "loss": 1.8457, + "step": 10425 + }, + { + "epoch": 4.18, + "grad_norm": 1.4664337635040283, + "learning_rate": 4.008740740740741e-05, + "loss": 1.9372, + "step": 10450 + }, + { + "epoch": 4.19, + "grad_norm": 1.3487550020217896, + "learning_rate": 4.005037037037037e-05, + "loss": 1.9049, + "step": 10475 + }, + { + "epoch": 4.2, + "grad_norm": 1.2509174346923828, + "learning_rate": 4.001333333333334e-05, + "loss": 1.8987, + "step": 10500 + }, + { + "epoch": 4.21, + "grad_norm": 1.5854827165603638, + "learning_rate": 3.99762962962963e-05, + "loss": 1.8423, + "step": 10525 + }, + { + "epoch": 4.22, + "grad_norm": 1.4388275146484375, + "learning_rate": 3.993925925925926e-05, + "loss": 1.8349, + "step": 10550 + }, + { + "epoch": 4.23, + "grad_norm": 1.4520167112350464, + "learning_rate": 3.9902222222222223e-05, + "loss": 1.8283, + "step": 10575 + }, + { + "epoch": 4.24, + "grad_norm": 1.44745671749115, + "learning_rate": 3.986518518518519e-05, + "loss": 1.9163, + "step": 10600 + }, + { + "epoch": 4.25, + "grad_norm": 1.4306520223617554, + "learning_rate": 3.9828148148148147e-05, + "loss": 1.8803, + "step": 10625 + }, + { + "epoch": 4.26, + "grad_norm": 1.3885631561279297, + "learning_rate": 3.9791111111111115e-05, + "loss": 1.8763, + "step": 10650 + }, + { + "epoch": 4.27, + "grad_norm": 1.3905619382858276, + "learning_rate": 3.9754074074074076e-05, + "loss": 1.8828, + "step": 10675 + }, + { + "epoch": 4.28, + "grad_norm": 1.3806442022323608, + "learning_rate": 3.9717037037037045e-05, + "loss": 1.908, + "step": 10700 + }, + { + "epoch": 4.29, + "grad_norm": 1.4566986560821533, + "learning_rate": 3.968e-05, + "loss": 1.8479, + "step": 10725 + }, + { + "epoch": 4.3, + "grad_norm": 1.4372694492340088, + "learning_rate": 3.964296296296297e-05, + "loss": 1.9098, + "step": 10750 + }, + { + "epoch": 4.31, + "grad_norm": 1.5970664024353027, + "learning_rate": 3.960592592592593e-05, + "loss": 1.9132, + "step": 10775 + }, + { + "epoch": 4.32, + "grad_norm": 1.3332023620605469, + "learning_rate": 3.956888888888889e-05, + "loss": 1.8479, + "step": 10800 + }, + { + "epoch": 4.33, + "grad_norm": 1.4356377124786377, + "learning_rate": 3.953185185185185e-05, + "loss": 1.8216, + "step": 10825 + }, + { + "epoch": 4.34, + "grad_norm": 1.3846675157546997, + "learning_rate": 3.949481481481482e-05, + "loss": 1.9176, + "step": 10850 + }, + { + "epoch": 4.35, + "grad_norm": 1.4137978553771973, + "learning_rate": 3.945777777777778e-05, + "loss": 1.8499, + "step": 10875 + }, + { + "epoch": 4.36, + "grad_norm": 1.456305980682373, + "learning_rate": 3.9420740740740744e-05, + "loss": 1.8801, + "step": 10900 + }, + { + "epoch": 4.37, + "grad_norm": 1.4478751420974731, + "learning_rate": 3.9383703703703706e-05, + "loss": 1.8548, + "step": 10925 + }, + { + "epoch": 4.38, + "grad_norm": 1.2786507606506348, + "learning_rate": 3.9346666666666674e-05, + "loss": 1.8149, + "step": 10950 + }, + { + "epoch": 4.39, + "grad_norm": 1.5575779676437378, + "learning_rate": 3.930962962962963e-05, + "loss": 1.8687, + "step": 10975 + }, + { + "epoch": 4.4, + "grad_norm": 1.6050180196762085, + "learning_rate": 3.92725925925926e-05, + "loss": 1.8918, + "step": 11000 + }, + { + "epoch": 4.41, + "grad_norm": 1.4834775924682617, + "learning_rate": 3.923555555555556e-05, + "loss": 1.9042, + "step": 11025 + }, + { + "epoch": 4.42, + "grad_norm": 1.3932939767837524, + "learning_rate": 3.919851851851852e-05, + "loss": 1.8931, + "step": 11050 + }, + { + "epoch": 4.43, + "grad_norm": 1.4138563871383667, + "learning_rate": 3.916148148148148e-05, + "loss": 1.8471, + "step": 11075 + }, + { + "epoch": 4.44, + "grad_norm": 1.5365270376205444, + "learning_rate": 3.912444444444445e-05, + "loss": 1.8746, + "step": 11100 + }, + { + "epoch": 4.45, + "grad_norm": 1.2787660360336304, + "learning_rate": 3.908740740740741e-05, + "loss": 1.857, + "step": 11125 + }, + { + "epoch": 4.46, + "grad_norm": 1.3813387155532837, + "learning_rate": 3.905037037037037e-05, + "loss": 1.8436, + "step": 11150 + }, + { + "epoch": 4.47, + "grad_norm": 1.3845306634902954, + "learning_rate": 3.9013333333333335e-05, + "loss": 1.8494, + "step": 11175 + }, + { + "epoch": 4.48, + "grad_norm": 1.4810049533843994, + "learning_rate": 3.8976296296296296e-05, + "loss": 1.9129, + "step": 11200 + }, + { + "epoch": 4.49, + "grad_norm": 1.5942555665969849, + "learning_rate": 3.893925925925926e-05, + "loss": 1.8596, + "step": 11225 + }, + { + "epoch": 4.5, + "grad_norm": 1.3388746976852417, + "learning_rate": 3.8902222222222226e-05, + "loss": 1.8152, + "step": 11250 + }, + { + "epoch": 4.51, + "grad_norm": 1.2811378240585327, + "learning_rate": 3.886518518518519e-05, + "loss": 1.8035, + "step": 11275 + }, + { + "epoch": 4.52, + "grad_norm": 1.4130557775497437, + "learning_rate": 3.882814814814815e-05, + "loss": 1.8349, + "step": 11300 + }, + { + "epoch": 4.53, + "grad_norm": 1.3981435298919678, + "learning_rate": 3.879111111111111e-05, + "loss": 1.9117, + "step": 11325 + }, + { + "epoch": 4.54, + "grad_norm": 1.4052165746688843, + "learning_rate": 3.875407407407408e-05, + "loss": 1.9413, + "step": 11350 + }, + { + "epoch": 4.55, + "grad_norm": 1.462985634803772, + "learning_rate": 3.871703703703704e-05, + "loss": 1.8346, + "step": 11375 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 1.417001485824585, + "learning_rate": 3.868e-05, + "loss": 1.8613, + "step": 11400 + }, + { + "epoch": 4.57, + "grad_norm": 1.3185068368911743, + "learning_rate": 3.8642962962962964e-05, + "loss": 1.8832, + "step": 11425 + }, + { + "epoch": 4.58, + "grad_norm": 1.4845627546310425, + "learning_rate": 3.8605925925925925e-05, + "loss": 1.8526, + "step": 11450 + }, + { + "epoch": 4.59, + "grad_norm": 1.5781909227371216, + "learning_rate": 3.8568888888888894e-05, + "loss": 1.9009, + "step": 11475 + }, + { + "epoch": 4.6, + "grad_norm": 1.4118759632110596, + "learning_rate": 3.8531851851851855e-05, + "loss": 1.8705, + "step": 11500 + }, + { + "epoch": 4.61, + "grad_norm": 1.604973316192627, + "learning_rate": 3.849481481481482e-05, + "loss": 1.8933, + "step": 11525 + }, + { + "epoch": 4.62, + "grad_norm": 1.2688302993774414, + "learning_rate": 3.845777777777778e-05, + "loss": 1.8558, + "step": 11550 + }, + { + "epoch": 4.63, + "grad_norm": 1.3213862180709839, + "learning_rate": 3.842074074074074e-05, + "loss": 1.8655, + "step": 11575 + }, + { + "epoch": 4.64, + "grad_norm": 1.5168383121490479, + "learning_rate": 3.838370370370371e-05, + "loss": 1.8862, + "step": 11600 + }, + { + "epoch": 4.65, + "grad_norm": 1.30950927734375, + "learning_rate": 3.834666666666667e-05, + "loss": 1.8862, + "step": 11625 + }, + { + "epoch": 4.66, + "grad_norm": 1.4738622903823853, + "learning_rate": 3.830962962962963e-05, + "loss": 1.8185, + "step": 11650 + }, + { + "epoch": 4.67, + "grad_norm": 1.4453907012939453, + "learning_rate": 3.827259259259259e-05, + "loss": 1.8367, + "step": 11675 + }, + { + "epoch": 4.68, + "grad_norm": 1.5001963376998901, + "learning_rate": 3.8235555555555555e-05, + "loss": 1.9168, + "step": 11700 + }, + { + "epoch": 4.6899999999999995, + "grad_norm": 1.4097387790679932, + "learning_rate": 3.819851851851852e-05, + "loss": 1.8582, + "step": 11725 + }, + { + "epoch": 4.7, + "grad_norm": 1.5029624700546265, + "learning_rate": 3.8161481481481485e-05, + "loss": 1.87, + "step": 11750 + }, + { + "epoch": 4.71, + "grad_norm": 1.4353328943252563, + "learning_rate": 3.8124444444444446e-05, + "loss": 1.8595, + "step": 11775 + }, + { + "epoch": 4.72, + "grad_norm": 1.6048336029052734, + "learning_rate": 3.808740740740741e-05, + "loss": 1.9087, + "step": 11800 + }, + { + "epoch": 4.73, + "grad_norm": 1.3120425939559937, + "learning_rate": 3.805037037037037e-05, + "loss": 1.8873, + "step": 11825 + }, + { + "epoch": 4.74, + "grad_norm": 1.6301329135894775, + "learning_rate": 3.801333333333333e-05, + "loss": 1.8895, + "step": 11850 + }, + { + "epoch": 4.75, + "grad_norm": 1.480413556098938, + "learning_rate": 3.79762962962963e-05, + "loss": 1.8713, + "step": 11875 + }, + { + "epoch": 4.76, + "grad_norm": 1.2750059366226196, + "learning_rate": 3.793925925925926e-05, + "loss": 1.8706, + "step": 11900 + }, + { + "epoch": 4.77, + "grad_norm": 1.5305309295654297, + "learning_rate": 3.790222222222222e-05, + "loss": 1.9087, + "step": 11925 + }, + { + "epoch": 4.78, + "grad_norm": 1.3535616397857666, + "learning_rate": 3.7865185185185184e-05, + "loss": 1.8658, + "step": 11950 + }, + { + "epoch": 4.79, + "grad_norm": 1.5504143238067627, + "learning_rate": 3.782814814814815e-05, + "loss": 1.8742, + "step": 11975 + }, + { + "epoch": 4.8, + "grad_norm": 1.7946292161941528, + "learning_rate": 3.7791111111111114e-05, + "loss": 1.8456, + "step": 12000 + }, + { + "epoch": 4.8100000000000005, + "grad_norm": 1.395272135734558, + "learning_rate": 3.7754074074074075e-05, + "loss": 1.931, + "step": 12025 + }, + { + "epoch": 4.82, + "grad_norm": 1.4061551094055176, + "learning_rate": 3.771703703703704e-05, + "loss": 1.8836, + "step": 12050 + }, + { + "epoch": 4.83, + "grad_norm": 1.3979151248931885, + "learning_rate": 3.7680000000000005e-05, + "loss": 1.8869, + "step": 12075 + }, + { + "epoch": 4.84, + "grad_norm": 1.4107104539871216, + "learning_rate": 3.764296296296296e-05, + "loss": 1.8736, + "step": 12100 + }, + { + "epoch": 4.85, + "grad_norm": 1.321219801902771, + "learning_rate": 3.760592592592593e-05, + "loss": 1.8761, + "step": 12125 + }, + { + "epoch": 4.86, + "grad_norm": 1.5014673471450806, + "learning_rate": 3.756888888888889e-05, + "loss": 1.8445, + "step": 12150 + }, + { + "epoch": 4.87, + "grad_norm": 1.4317951202392578, + "learning_rate": 3.753185185185185e-05, + "loss": 1.848, + "step": 12175 + }, + { + "epoch": 4.88, + "grad_norm": 1.4096531867980957, + "learning_rate": 3.749481481481481e-05, + "loss": 1.8933, + "step": 12200 + }, + { + "epoch": 4.89, + "grad_norm": 1.2559773921966553, + "learning_rate": 3.745777777777778e-05, + "loss": 1.8033, + "step": 12225 + }, + { + "epoch": 4.9, + "grad_norm": 1.3311185836791992, + "learning_rate": 3.742074074074074e-05, + "loss": 1.8796, + "step": 12250 + }, + { + "epoch": 4.91, + "grad_norm": 1.4260166883468628, + "learning_rate": 3.7383703703703704e-05, + "loss": 1.8633, + "step": 12275 + }, + { + "epoch": 4.92, + "grad_norm": 1.3146923780441284, + "learning_rate": 3.7346666666666666e-05, + "loss": 1.888, + "step": 12300 + }, + { + "epoch": 4.93, + "grad_norm": 1.3045905828475952, + "learning_rate": 3.7309629629629634e-05, + "loss": 1.8274, + "step": 12325 + }, + { + "epoch": 4.9399999999999995, + "grad_norm": 1.3906166553497314, + "learning_rate": 3.727259259259259e-05, + "loss": 1.9098, + "step": 12350 + }, + { + "epoch": 4.95, + "grad_norm": 1.518526554107666, + "learning_rate": 3.723555555555556e-05, + "loss": 1.9287, + "step": 12375 + }, + { + "epoch": 4.96, + "grad_norm": 1.3113497495651245, + "learning_rate": 3.719851851851852e-05, + "loss": 1.8677, + "step": 12400 + }, + { + "epoch": 4.97, + "grad_norm": 1.6457515954971313, + "learning_rate": 3.716148148148149e-05, + "loss": 1.9183, + "step": 12425 + }, + { + "epoch": 4.98, + "grad_norm": 1.5027391910552979, + "learning_rate": 3.712444444444444e-05, + "loss": 1.8613, + "step": 12450 + }, + { + "epoch": 4.99, + "grad_norm": 1.561333179473877, + "learning_rate": 3.708740740740741e-05, + "loss": 1.9073, + "step": 12475 + }, + { + "epoch": 5.0, + "grad_norm": 1.6902166604995728, + "learning_rate": 3.705037037037037e-05, + "loss": 1.8617, + "step": 12500 + }, + { + "epoch": 5.0, + "eval_gen_len": 13.1348, + "eval_loss": 1.9302258491516113, + "eval_rouge1": 51.4633, + "eval_rouge2": 25.7335, + "eval_rougeL": 47.3103, + "eval_rougeLsum": 47.305, + "eval_runtime": 194.3038, + "eval_samples_per_second": 102.932, + "eval_steps_per_second": 1.611, + "step": 12500 + }, + { + "epoch": 5.01, + "grad_norm": 1.3787955045700073, + "learning_rate": 3.7013333333333334e-05, + "loss": 1.763, + "step": 12525 + }, + { + "epoch": 5.02, + "grad_norm": 1.3770140409469604, + "learning_rate": 3.6976296296296295e-05, + "loss": 1.8189, + "step": 12550 + }, + { + "epoch": 5.03, + "grad_norm": 1.486107587814331, + "learning_rate": 3.6939259259259263e-05, + "loss": 1.8043, + "step": 12575 + }, + { + "epoch": 5.04, + "grad_norm": 1.3387417793273926, + "learning_rate": 3.690222222222222e-05, + "loss": 1.8575, + "step": 12600 + }, + { + "epoch": 5.05, + "grad_norm": 1.340170979499817, + "learning_rate": 3.6865185185185187e-05, + "loss": 1.8634, + "step": 12625 + }, + { + "epoch": 5.06, + "grad_norm": 1.2771445512771606, + "learning_rate": 3.682814814814815e-05, + "loss": 1.8065, + "step": 12650 + }, + { + "epoch": 5.07, + "grad_norm": 1.3724429607391357, + "learning_rate": 3.6791111111111116e-05, + "loss": 1.8152, + "step": 12675 + }, + { + "epoch": 5.08, + "grad_norm": 1.2865967750549316, + "learning_rate": 3.675407407407407e-05, + "loss": 1.8625, + "step": 12700 + }, + { + "epoch": 5.09, + "grad_norm": 1.4663174152374268, + "learning_rate": 3.671703703703704e-05, + "loss": 1.7772, + "step": 12725 + }, + { + "epoch": 5.1, + "grad_norm": 1.6799187660217285, + "learning_rate": 3.668e-05, + "loss": 1.8082, + "step": 12750 + }, + { + "epoch": 5.11, + "grad_norm": 1.3404431343078613, + "learning_rate": 3.664296296296296e-05, + "loss": 1.8315, + "step": 12775 + }, + { + "epoch": 5.12, + "grad_norm": 1.4227215051651, + "learning_rate": 3.6605925925925924e-05, + "loss": 1.7863, + "step": 12800 + }, + { + "epoch": 5.13, + "grad_norm": 1.4327188730239868, + "learning_rate": 3.656888888888889e-05, + "loss": 1.8599, + "step": 12825 + }, + { + "epoch": 5.14, + "grad_norm": 1.4590907096862793, + "learning_rate": 3.6531851851851854e-05, + "loss": 1.8372, + "step": 12850 + }, + { + "epoch": 5.15, + "grad_norm": 1.4968684911727905, + "learning_rate": 3.6494814814814816e-05, + "loss": 1.8266, + "step": 12875 + }, + { + "epoch": 5.16, + "grad_norm": 1.3817492723464966, + "learning_rate": 3.645925925925926e-05, + "loss": 1.8532, + "step": 12900 + }, + { + "epoch": 5.17, + "grad_norm": 1.5503273010253906, + "learning_rate": 3.642222222222222e-05, + "loss": 1.8062, + "step": 12925 + }, + { + "epoch": 5.18, + "grad_norm": 1.5043991804122925, + "learning_rate": 3.638518518518519e-05, + "loss": 1.8428, + "step": 12950 + }, + { + "epoch": 5.19, + "grad_norm": 1.3269332647323608, + "learning_rate": 3.6348148148148146e-05, + "loss": 1.7959, + "step": 12975 + }, + { + "epoch": 5.2, + "grad_norm": 1.3784312009811401, + "learning_rate": 3.6311111111111114e-05, + "loss": 1.8489, + "step": 13000 + }, + { + "epoch": 5.21, + "grad_norm": 1.557144284248352, + "learning_rate": 3.6274074074074076e-05, + "loss": 1.8175, + "step": 13025 + }, + { + "epoch": 5.22, + "grad_norm": 1.7027719020843506, + "learning_rate": 3.623703703703704e-05, + "loss": 1.8363, + "step": 13050 + }, + { + "epoch": 5.23, + "grad_norm": 1.415287733078003, + "learning_rate": 3.62e-05, + "loss": 1.8242, + "step": 13075 + }, + { + "epoch": 5.24, + "grad_norm": 1.4795790910720825, + "learning_rate": 3.616296296296297e-05, + "loss": 1.7947, + "step": 13100 + }, + { + "epoch": 5.25, + "grad_norm": 1.703626036643982, + "learning_rate": 3.612592592592593e-05, + "loss": 1.8156, + "step": 13125 + }, + { + "epoch": 5.26, + "grad_norm": 1.5055326223373413, + "learning_rate": 3.608888888888889e-05, + "loss": 1.8225, + "step": 13150 + }, + { + "epoch": 5.27, + "grad_norm": 1.5992326736450195, + "learning_rate": 3.605185185185185e-05, + "loss": 1.8556, + "step": 13175 + }, + { + "epoch": 5.28, + "grad_norm": 1.630657434463501, + "learning_rate": 3.601481481481482e-05, + "loss": 1.8139, + "step": 13200 + }, + { + "epoch": 5.29, + "grad_norm": 1.4624261856079102, + "learning_rate": 3.5977777777777775e-05, + "loss": 1.8808, + "step": 13225 + }, + { + "epoch": 5.3, + "grad_norm": 1.6484681367874146, + "learning_rate": 3.5940740740740743e-05, + "loss": 1.7941, + "step": 13250 + }, + { + "epoch": 5.31, + "grad_norm": 1.5216662883758545, + "learning_rate": 3.5903703703703705e-05, + "loss": 1.7918, + "step": 13275 + }, + { + "epoch": 5.32, + "grad_norm": 1.578412413597107, + "learning_rate": 3.586666666666667e-05, + "loss": 1.8681, + "step": 13300 + }, + { + "epoch": 5.33, + "grad_norm": 1.547946572303772, + "learning_rate": 3.582962962962963e-05, + "loss": 1.8343, + "step": 13325 + }, + { + "epoch": 5.34, + "grad_norm": 1.491708517074585, + "learning_rate": 3.5792592592592596e-05, + "loss": 1.7636, + "step": 13350 + }, + { + "epoch": 5.35, + "grad_norm": 1.4895257949829102, + "learning_rate": 3.575555555555556e-05, + "loss": 1.8088, + "step": 13375 + }, + { + "epoch": 5.36, + "grad_norm": 1.5237321853637695, + "learning_rate": 3.571851851851852e-05, + "loss": 1.8399, + "step": 13400 + }, + { + "epoch": 5.37, + "grad_norm": 1.5580129623413086, + "learning_rate": 3.568148148148148e-05, + "loss": 1.8515, + "step": 13425 + }, + { + "epoch": 5.38, + "grad_norm": 1.3289860486984253, + "learning_rate": 3.564444444444445e-05, + "loss": 1.7867, + "step": 13450 + }, + { + "epoch": 5.39, + "grad_norm": 1.3806110620498657, + "learning_rate": 3.560740740740741e-05, + "loss": 1.8141, + "step": 13475 + }, + { + "epoch": 5.4, + "grad_norm": 1.393282175064087, + "learning_rate": 3.557037037037037e-05, + "loss": 1.7835, + "step": 13500 + }, + { + "epoch": 5.41, + "grad_norm": 1.3642027378082275, + "learning_rate": 3.5533333333333334e-05, + "loss": 1.797, + "step": 13525 + }, + { + "epoch": 5.42, + "grad_norm": 1.378496766090393, + "learning_rate": 3.54962962962963e-05, + "loss": 1.8129, + "step": 13550 + }, + { + "epoch": 5.43, + "grad_norm": 1.7297312021255493, + "learning_rate": 3.545925925925926e-05, + "loss": 1.8175, + "step": 13575 + }, + { + "epoch": 5.44, + "grad_norm": 1.3795442581176758, + "learning_rate": 3.5422222222222226e-05, + "loss": 1.8089, + "step": 13600 + }, + { + "epoch": 5.45, + "grad_norm": 1.577501654624939, + "learning_rate": 3.538518518518519e-05, + "loss": 1.8813, + "step": 13625 + }, + { + "epoch": 5.46, + "grad_norm": 1.478531002998352, + "learning_rate": 3.5348148148148156e-05, + "loss": 1.7873, + "step": 13650 + }, + { + "epoch": 5.47, + "grad_norm": 1.4849592447280884, + "learning_rate": 3.531111111111111e-05, + "loss": 1.8269, + "step": 13675 + }, + { + "epoch": 5.48, + "grad_norm": 1.413120985031128, + "learning_rate": 3.527407407407408e-05, + "loss": 1.8449, + "step": 13700 + }, + { + "epoch": 5.49, + "grad_norm": 1.6044095754623413, + "learning_rate": 3.523703703703704e-05, + "loss": 1.7789, + "step": 13725 + }, + { + "epoch": 5.5, + "grad_norm": 1.4970237016677856, + "learning_rate": 3.52e-05, + "loss": 1.8088, + "step": 13750 + }, + { + "epoch": 5.51, + "grad_norm": 1.735145926475525, + "learning_rate": 3.516296296296296e-05, + "loss": 1.8242, + "step": 13775 + }, + { + "epoch": 5.52, + "grad_norm": 1.592650055885315, + "learning_rate": 3.512592592592593e-05, + "loss": 1.8115, + "step": 13800 + }, + { + "epoch": 5.53, + "grad_norm": 1.3803372383117676, + "learning_rate": 3.5088888888888886e-05, + "loss": 1.7575, + "step": 13825 + }, + { + "epoch": 5.54, + "grad_norm": 1.5937862396240234, + "learning_rate": 3.5051851851851855e-05, + "loss": 1.7927, + "step": 13850 + }, + { + "epoch": 5.55, + "grad_norm": 1.6011449098587036, + "learning_rate": 3.5014814814814816e-05, + "loss": 1.8037, + "step": 13875 + }, + { + "epoch": 5.5600000000000005, + "grad_norm": 1.4640209674835205, + "learning_rate": 3.4977777777777785e-05, + "loss": 1.7909, + "step": 13900 + }, + { + "epoch": 5.57, + "grad_norm": 1.577725887298584, + "learning_rate": 3.494074074074074e-05, + "loss": 1.8111, + "step": 13925 + }, + { + "epoch": 5.58, + "grad_norm": 1.4513524770736694, + "learning_rate": 3.490370370370371e-05, + "loss": 1.7605, + "step": 13950 + }, + { + "epoch": 5.59, + "grad_norm": 1.6748623847961426, + "learning_rate": 3.486666666666667e-05, + "loss": 1.8199, + "step": 13975 + }, + { + "epoch": 5.6, + "grad_norm": 1.5236202478408813, + "learning_rate": 3.482962962962963e-05, + "loss": 1.8162, + "step": 14000 + }, + { + "epoch": 5.61, + "grad_norm": 1.5020064115524292, + "learning_rate": 3.479259259259259e-05, + "loss": 1.8282, + "step": 14025 + }, + { + "epoch": 5.62, + "grad_norm": 1.5203158855438232, + "learning_rate": 3.475555555555556e-05, + "loss": 1.8871, + "step": 14050 + }, + { + "epoch": 5.63, + "grad_norm": 1.4594610929489136, + "learning_rate": 3.471851851851852e-05, + "loss": 1.818, + "step": 14075 + }, + { + "epoch": 5.64, + "grad_norm": 1.5922948122024536, + "learning_rate": 3.4681481481481484e-05, + "loss": 1.8806, + "step": 14100 + }, + { + "epoch": 5.65, + "grad_norm": 1.5797369480133057, + "learning_rate": 3.4644444444444446e-05, + "loss": 1.8307, + "step": 14125 + }, + { + "epoch": 5.66, + "grad_norm": 1.5268616676330566, + "learning_rate": 3.4607407407407414e-05, + "loss": 1.8021, + "step": 14150 + }, + { + "epoch": 5.67, + "grad_norm": 1.6109580993652344, + "learning_rate": 3.457037037037037e-05, + "loss": 1.8541, + "step": 14175 + }, + { + "epoch": 5.68, + "grad_norm": 1.4869205951690674, + "learning_rate": 3.453333333333334e-05, + "loss": 1.8282, + "step": 14200 + }, + { + "epoch": 5.6899999999999995, + "grad_norm": 1.5621397495269775, + "learning_rate": 3.44962962962963e-05, + "loss": 1.8665, + "step": 14225 + }, + { + "epoch": 5.7, + "grad_norm": 1.6197333335876465, + "learning_rate": 3.445925925925926e-05, + "loss": 1.9229, + "step": 14250 + }, + { + "epoch": 5.71, + "grad_norm": 1.4683600664138794, + "learning_rate": 3.442222222222222e-05, + "loss": 1.8309, + "step": 14275 + }, + { + "epoch": 5.72, + "grad_norm": 1.4849534034729004, + "learning_rate": 3.438518518518519e-05, + "loss": 1.8315, + "step": 14300 + }, + { + "epoch": 5.73, + "grad_norm": 1.2944457530975342, + "learning_rate": 3.434814814814815e-05, + "loss": 1.7922, + "step": 14325 + }, + { + "epoch": 5.74, + "grad_norm": 1.4732681512832642, + "learning_rate": 3.431111111111111e-05, + "loss": 1.8181, + "step": 14350 + }, + { + "epoch": 5.75, + "grad_norm": 1.362735629081726, + "learning_rate": 3.4274074074074075e-05, + "loss": 1.8194, + "step": 14375 + }, + { + "epoch": 5.76, + "grad_norm": 1.5724767446517944, + "learning_rate": 3.423703703703704e-05, + "loss": 1.9212, + "step": 14400 + }, + { + "epoch": 5.77, + "grad_norm": 1.430247187614441, + "learning_rate": 3.4200000000000005e-05, + "loss": 1.8141, + "step": 14425 + }, + { + "epoch": 5.78, + "grad_norm": 1.4439537525177002, + "learning_rate": 3.4162962962962966e-05, + "loss": 1.8393, + "step": 14450 + }, + { + "epoch": 5.79, + "grad_norm": 1.6095616817474365, + "learning_rate": 3.412592592592593e-05, + "loss": 1.8514, + "step": 14475 + }, + { + "epoch": 5.8, + "grad_norm": 1.3736265897750854, + "learning_rate": 3.408888888888889e-05, + "loss": 1.8419, + "step": 14500 + }, + { + "epoch": 5.8100000000000005, + "grad_norm": 1.5470609664916992, + "learning_rate": 3.405185185185185e-05, + "loss": 1.8313, + "step": 14525 + }, + { + "epoch": 5.82, + "grad_norm": 1.4805546998977661, + "learning_rate": 3.401481481481482e-05, + "loss": 1.8075, + "step": 14550 + }, + { + "epoch": 5.83, + "grad_norm": 1.3933120965957642, + "learning_rate": 3.397777777777778e-05, + "loss": 1.7556, + "step": 14575 + }, + { + "epoch": 5.84, + "grad_norm": 1.3401881456375122, + "learning_rate": 3.394074074074074e-05, + "loss": 1.7676, + "step": 14600 + }, + { + "epoch": 5.85, + "grad_norm": 1.713220238685608, + "learning_rate": 3.3903703703703704e-05, + "loss": 1.8106, + "step": 14625 + }, + { + "epoch": 5.86, + "grad_norm": 1.4203695058822632, + "learning_rate": 3.3866666666666665e-05, + "loss": 1.8415, + "step": 14650 + }, + { + "epoch": 5.87, + "grad_norm": 1.406247854232788, + "learning_rate": 3.3829629629629634e-05, + "loss": 1.7898, + "step": 14675 + }, + { + "epoch": 5.88, + "grad_norm": 1.422446846961975, + "learning_rate": 3.3792592592592595e-05, + "loss": 1.7958, + "step": 14700 + }, + { + "epoch": 5.89, + "grad_norm": 1.5542668104171753, + "learning_rate": 3.375555555555556e-05, + "loss": 1.8299, + "step": 14725 + }, + { + "epoch": 5.9, + "grad_norm": 1.426229476928711, + "learning_rate": 3.371851851851852e-05, + "loss": 1.8498, + "step": 14750 + }, + { + "epoch": 5.91, + "grad_norm": 1.4984149932861328, + "learning_rate": 3.368148148148148e-05, + "loss": 1.8642, + "step": 14775 + }, + { + "epoch": 5.92, + "grad_norm": 1.347084879875183, + "learning_rate": 3.364444444444445e-05, + "loss": 1.8321, + "step": 14800 + }, + { + "epoch": 5.93, + "grad_norm": 1.4737646579742432, + "learning_rate": 3.360740740740741e-05, + "loss": 1.9252, + "step": 14825 + }, + { + "epoch": 5.9399999999999995, + "grad_norm": 1.3631387948989868, + "learning_rate": 3.357037037037037e-05, + "loss": 1.8539, + "step": 14850 + }, + { + "epoch": 5.95, + "grad_norm": 1.3909265995025635, + "learning_rate": 3.353333333333333e-05, + "loss": 1.8415, + "step": 14875 + }, + { + "epoch": 5.96, + "grad_norm": 1.398703932762146, + "learning_rate": 3.3496296296296295e-05, + "loss": 1.823, + "step": 14900 + }, + { + "epoch": 5.97, + "grad_norm": 1.5495917797088623, + "learning_rate": 3.345925925925926e-05, + "loss": 1.7733, + "step": 14925 + }, + { + "epoch": 5.98, + "grad_norm": 1.7243595123291016, + "learning_rate": 3.3422222222222224e-05, + "loss": 1.8468, + "step": 14950 + }, + { + "epoch": 5.99, + "grad_norm": 1.5244386196136475, + "learning_rate": 3.3385185185185186e-05, + "loss": 1.8229, + "step": 14975 + }, + { + "epoch": 6.0, + "grad_norm": 1.4378917217254639, + "learning_rate": 3.334814814814815e-05, + "loss": 1.7693, + "step": 15000 + }, + { + "epoch": 6.0, + "eval_gen_len": 13.2205, + "eval_loss": 1.9290603399276733, + "eval_rouge1": 51.6382, + "eval_rouge2": 25.9065, + "eval_rougeL": 47.4714, + "eval_rougeLsum": 47.4661, + "eval_runtime": 195.9112, + "eval_samples_per_second": 102.087, + "eval_steps_per_second": 1.598, + "step": 15000 + }, + { + "epoch": 6.01, + "grad_norm": 1.421264886856079, + "learning_rate": 3.3311111111111116e-05, + "loss": 1.7225, + "step": 15025 + }, + { + "epoch": 6.02, + "grad_norm": 1.5342766046524048, + "learning_rate": 3.327407407407408e-05, + "loss": 1.8321, + "step": 15050 + }, + { + "epoch": 6.03, + "grad_norm": 1.4197121858596802, + "learning_rate": 3.323703703703704e-05, + "loss": 1.7555, + "step": 15075 + }, + { + "epoch": 6.04, + "grad_norm": 1.6157759428024292, + "learning_rate": 3.32e-05, + "loss": 1.7768, + "step": 15100 + }, + { + "epoch": 6.05, + "grad_norm": 1.5571355819702148, + "learning_rate": 3.316296296296296e-05, + "loss": 1.7692, + "step": 15125 + }, + { + "epoch": 6.06, + "grad_norm": 1.5974923372268677, + "learning_rate": 3.3125925925925924e-05, + "loss": 1.7249, + "step": 15150 + }, + { + "epoch": 6.07, + "grad_norm": 1.3685996532440186, + "learning_rate": 3.308888888888889e-05, + "loss": 1.7672, + "step": 15175 + }, + { + "epoch": 6.08, + "grad_norm": 1.4658602476119995, + "learning_rate": 3.3051851851851854e-05, + "loss": 1.7717, + "step": 15200 + }, + { + "epoch": 6.09, + "grad_norm": 1.399776577949524, + "learning_rate": 3.3014814814814815e-05, + "loss": 1.7609, + "step": 15225 + }, + { + "epoch": 6.1, + "grad_norm": 1.6905499696731567, + "learning_rate": 3.297777777777778e-05, + "loss": 1.7796, + "step": 15250 + }, + { + "epoch": 6.11, + "grad_norm": 1.7223976850509644, + "learning_rate": 3.2940740740740745e-05, + "loss": 1.7806, + "step": 15275 + }, + { + "epoch": 6.12, + "grad_norm": 1.4597574472427368, + "learning_rate": 3.29037037037037e-05, + "loss": 1.7689, + "step": 15300 + }, + { + "epoch": 6.13, + "grad_norm": 1.7513933181762695, + "learning_rate": 3.286666666666667e-05, + "loss": 1.8005, + "step": 15325 + }, + { + "epoch": 6.14, + "grad_norm": 1.7440515756607056, + "learning_rate": 3.282962962962963e-05, + "loss": 1.7965, + "step": 15350 + }, + { + "epoch": 6.15, + "grad_norm": 1.6327168941497803, + "learning_rate": 3.279259259259259e-05, + "loss": 1.7875, + "step": 15375 + }, + { + "epoch": 6.16, + "grad_norm": 1.5745080709457397, + "learning_rate": 3.275555555555555e-05, + "loss": 1.7659, + "step": 15400 + }, + { + "epoch": 6.17, + "grad_norm": 1.5755482912063599, + "learning_rate": 3.271851851851852e-05, + "loss": 1.7724, + "step": 15425 + }, + { + "epoch": 6.18, + "grad_norm": 1.6102346181869507, + "learning_rate": 3.268148148148148e-05, + "loss": 1.7862, + "step": 15450 + }, + { + "epoch": 6.19, + "grad_norm": 1.5103446245193481, + "learning_rate": 3.2644444444444444e-05, + "loss": 1.7739, + "step": 15475 + }, + { + "epoch": 6.2, + "grad_norm": 1.4357705116271973, + "learning_rate": 3.2607407407407406e-05, + "loss": 1.7596, + "step": 15500 + }, + { + "epoch": 6.21, + "grad_norm": 1.739065170288086, + "learning_rate": 3.2570370370370374e-05, + "loss": 1.8472, + "step": 15525 + }, + { + "epoch": 6.22, + "grad_norm": 1.59120512008667, + "learning_rate": 3.253333333333333e-05, + "loss": 1.8353, + "step": 15550 + }, + { + "epoch": 6.23, + "grad_norm": 1.7250970602035522, + "learning_rate": 3.24962962962963e-05, + "loss": 1.7692, + "step": 15575 + }, + { + "epoch": 6.24, + "grad_norm": 1.6642422676086426, + "learning_rate": 3.245925925925926e-05, + "loss": 1.7367, + "step": 15600 + }, + { + "epoch": 6.25, + "grad_norm": 1.4755923748016357, + "learning_rate": 3.242222222222223e-05, + "loss": 1.7742, + "step": 15625 + }, + { + "epoch": 6.26, + "grad_norm": 1.5769152641296387, + "learning_rate": 3.238518518518518e-05, + "loss": 1.7956, + "step": 15650 + }, + { + "epoch": 6.27, + "grad_norm": 1.437385082244873, + "learning_rate": 3.234814814814815e-05, + "loss": 1.7039, + "step": 15675 + }, + { + "epoch": 6.28, + "grad_norm": 1.4990898370742798, + "learning_rate": 3.231111111111111e-05, + "loss": 1.7824, + "step": 15700 + }, + { + "epoch": 6.29, + "grad_norm": 1.3943312168121338, + "learning_rate": 3.2274074074074074e-05, + "loss": 1.7593, + "step": 15725 + }, + { + "epoch": 6.3, + "grad_norm": 1.6160963773727417, + "learning_rate": 3.2237037037037035e-05, + "loss": 1.7873, + "step": 15750 + }, + { + "epoch": 6.31, + "grad_norm": 1.569349765777588, + "learning_rate": 3.2200000000000003e-05, + "loss": 1.8264, + "step": 15775 + }, + { + "epoch": 6.32, + "grad_norm": 1.4803683757781982, + "learning_rate": 3.216444444444445e-05, + "loss": 1.7451, + "step": 15800 + }, + { + "epoch": 6.33, + "grad_norm": 1.760981798171997, + "learning_rate": 3.212740740740741e-05, + "loss": 1.791, + "step": 15825 + }, + { + "epoch": 6.34, + "grad_norm": 1.3866006135940552, + "learning_rate": 3.209037037037037e-05, + "loss": 1.7865, + "step": 15850 + }, + { + "epoch": 6.35, + "grad_norm": 1.6246627569198608, + "learning_rate": 3.2053333333333334e-05, + "loss": 1.8029, + "step": 15875 + }, + { + "epoch": 6.36, + "grad_norm": 1.4544792175292969, + "learning_rate": 3.20162962962963e-05, + "loss": 1.7549, + "step": 15900 + }, + { + "epoch": 6.37, + "grad_norm": 1.4172948598861694, + "learning_rate": 3.197925925925926e-05, + "loss": 1.7956, + "step": 15925 + }, + { + "epoch": 6.38, + "grad_norm": 1.4404555559158325, + "learning_rate": 3.1942222222222225e-05, + "loss": 1.7294, + "step": 15950 + }, + { + "epoch": 6.39, + "grad_norm": 1.4792706966400146, + "learning_rate": 3.190518518518519e-05, + "loss": 1.7118, + "step": 15975 + }, + { + "epoch": 6.4, + "grad_norm": 1.5171838998794556, + "learning_rate": 3.186814814814815e-05, + "loss": 1.7392, + "step": 16000 + }, + { + "epoch": 6.41, + "grad_norm": 1.5134046077728271, + "learning_rate": 3.183111111111111e-05, + "loss": 1.7854, + "step": 16025 + }, + { + "epoch": 6.42, + "grad_norm": 1.504233479499817, + "learning_rate": 3.179407407407408e-05, + "loss": 1.7571, + "step": 16050 + }, + { + "epoch": 6.43, + "grad_norm": 1.272483468055725, + "learning_rate": 3.175703703703704e-05, + "loss": 1.7626, + "step": 16075 + }, + { + "epoch": 6.44, + "grad_norm": 1.471863031387329, + "learning_rate": 3.172e-05, + "loss": 1.7756, + "step": 16100 + }, + { + "epoch": 6.45, + "grad_norm": 1.3983781337738037, + "learning_rate": 3.168296296296296e-05, + "loss": 1.8251, + "step": 16125 + }, + { + "epoch": 6.46, + "grad_norm": 1.3490697145462036, + "learning_rate": 3.164592592592593e-05, + "loss": 1.7751, + "step": 16150 + }, + { + "epoch": 6.47, + "grad_norm": 1.5252445936203003, + "learning_rate": 3.1608888888888886e-05, + "loss": 1.7565, + "step": 16175 + }, + { + "epoch": 6.48, + "grad_norm": 1.688368558883667, + "learning_rate": 3.1571851851851854e-05, + "loss": 1.8372, + "step": 16200 + }, + { + "epoch": 6.49, + "grad_norm": 1.3352307081222534, + "learning_rate": 3.1534814814814816e-05, + "loss": 1.7646, + "step": 16225 + }, + { + "epoch": 6.5, + "grad_norm": 1.5939990282058716, + "learning_rate": 3.1497777777777784e-05, + "loss": 1.7409, + "step": 16250 + }, + { + "epoch": 6.51, + "grad_norm": 1.3315916061401367, + "learning_rate": 3.146074074074074e-05, + "loss": 1.767, + "step": 16275 + }, + { + "epoch": 6.52, + "grad_norm": 1.367167592048645, + "learning_rate": 3.142370370370371e-05, + "loss": 1.7998, + "step": 16300 + }, + { + "epoch": 6.53, + "grad_norm": 1.7223109006881714, + "learning_rate": 3.138666666666667e-05, + "loss": 1.8021, + "step": 16325 + }, + { + "epoch": 6.54, + "grad_norm": 1.4448398351669312, + "learning_rate": 3.134962962962963e-05, + "loss": 1.8232, + "step": 16350 + }, + { + "epoch": 6.55, + "grad_norm": 1.257012128829956, + "learning_rate": 3.131259259259259e-05, + "loss": 1.7962, + "step": 16375 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 1.2631241083145142, + "learning_rate": 3.127555555555556e-05, + "loss": 1.7465, + "step": 16400 + }, + { + "epoch": 6.57, + "grad_norm": 1.5594916343688965, + "learning_rate": 3.123851851851852e-05, + "loss": 1.7723, + "step": 16425 + }, + { + "epoch": 6.58, + "grad_norm": 1.6267319917678833, + "learning_rate": 3.1201481481481483e-05, + "loss": 1.8153, + "step": 16450 + }, + { + "epoch": 6.59, + "grad_norm": 1.3760591745376587, + "learning_rate": 3.1164444444444445e-05, + "loss": 1.7863, + "step": 16475 + }, + { + "epoch": 6.6, + "grad_norm": 1.565403699874878, + "learning_rate": 3.112740740740741e-05, + "loss": 1.7395, + "step": 16500 + }, + { + "epoch": 6.61, + "grad_norm": 1.562738060951233, + "learning_rate": 3.109037037037037e-05, + "loss": 1.7866, + "step": 16525 + }, + { + "epoch": 6.62, + "grad_norm": 1.7263091802597046, + "learning_rate": 3.1053333333333336e-05, + "loss": 1.7532, + "step": 16550 + }, + { + "epoch": 6.63, + "grad_norm": 1.4510164260864258, + "learning_rate": 3.10162962962963e-05, + "loss": 1.7455, + "step": 16575 + }, + { + "epoch": 6.64, + "grad_norm": 1.474307894706726, + "learning_rate": 3.0979259259259266e-05, + "loss": 1.7317, + "step": 16600 + }, + { + "epoch": 6.65, + "grad_norm": 1.6146974563598633, + "learning_rate": 3.094222222222222e-05, + "loss": 1.8102, + "step": 16625 + }, + { + "epoch": 6.66, + "grad_norm": 1.4847644567489624, + "learning_rate": 3.090518518518519e-05, + "loss": 1.774, + "step": 16650 + }, + { + "epoch": 6.67, + "grad_norm": 1.7083697319030762, + "learning_rate": 3.086814814814815e-05, + "loss": 1.8375, + "step": 16675 + }, + { + "epoch": 6.68, + "grad_norm": 1.3661868572235107, + "learning_rate": 3.083111111111111e-05, + "loss": 1.8694, + "step": 16700 + }, + { + "epoch": 6.6899999999999995, + "grad_norm": 1.7693723440170288, + "learning_rate": 3.0794074074074074e-05, + "loss": 1.8175, + "step": 16725 + }, + { + "epoch": 6.7, + "grad_norm": 1.5230166912078857, + "learning_rate": 3.075703703703704e-05, + "loss": 1.7785, + "step": 16750 + }, + { + "epoch": 6.71, + "grad_norm": 1.4941860437393188, + "learning_rate": 3.072e-05, + "loss": 1.837, + "step": 16775 + }, + { + "epoch": 6.72, + "grad_norm": 1.4451031684875488, + "learning_rate": 3.0682962962962966e-05, + "loss": 1.7919, + "step": 16800 + }, + { + "epoch": 6.73, + "grad_norm": 1.6617428064346313, + "learning_rate": 3.064592592592593e-05, + "loss": 1.8227, + "step": 16825 + }, + { + "epoch": 6.74, + "grad_norm": 1.5577620267868042, + "learning_rate": 3.0608888888888895e-05, + "loss": 1.7545, + "step": 16850 + }, + { + "epoch": 6.75, + "grad_norm": 1.6712673902511597, + "learning_rate": 3.057185185185185e-05, + "loss": 1.799, + "step": 16875 + }, + { + "epoch": 6.76, + "grad_norm": 1.879231572151184, + "learning_rate": 3.053481481481482e-05, + "loss": 1.7737, + "step": 16900 + }, + { + "epoch": 6.77, + "grad_norm": 1.4019575119018555, + "learning_rate": 3.049777777777778e-05, + "loss": 1.7626, + "step": 16925 + }, + { + "epoch": 6.78, + "grad_norm": 1.3805930614471436, + "learning_rate": 3.046074074074074e-05, + "loss": 1.7682, + "step": 16950 + }, + { + "epoch": 6.79, + "grad_norm": 1.5736984014511108, + "learning_rate": 3.0423703703703703e-05, + "loss": 1.8448, + "step": 16975 + }, + { + "epoch": 6.8, + "grad_norm": 1.5094330310821533, + "learning_rate": 3.0386666666666668e-05, + "loss": 1.8033, + "step": 17000 + }, + { + "epoch": 6.8100000000000005, + "grad_norm": 1.7193862199783325, + "learning_rate": 3.0349629629629633e-05, + "loss": 1.7892, + "step": 17025 + }, + { + "epoch": 6.82, + "grad_norm": 1.7383610010147095, + "learning_rate": 3.031259259259259e-05, + "loss": 1.7517, + "step": 17050 + }, + { + "epoch": 6.83, + "grad_norm": 1.5743800401687622, + "learning_rate": 3.0275555555555556e-05, + "loss": 1.8145, + "step": 17075 + }, + { + "epoch": 6.84, + "grad_norm": 1.6040315628051758, + "learning_rate": 3.023851851851852e-05, + "loss": 1.7887, + "step": 17100 + }, + { + "epoch": 6.85, + "grad_norm": 1.7475184202194214, + "learning_rate": 3.020148148148148e-05, + "loss": 1.7959, + "step": 17125 + }, + { + "epoch": 6.86, + "grad_norm": 1.4680596590042114, + "learning_rate": 3.0164444444444444e-05, + "loss": 1.7865, + "step": 17150 + }, + { + "epoch": 6.87, + "grad_norm": 1.5900845527648926, + "learning_rate": 3.012740740740741e-05, + "loss": 1.8171, + "step": 17175 + }, + { + "epoch": 6.88, + "grad_norm": 1.5023775100708008, + "learning_rate": 3.0090370370370374e-05, + "loss": 1.7596, + "step": 17200 + }, + { + "epoch": 6.89, + "grad_norm": 1.5605974197387695, + "learning_rate": 3.0053333333333332e-05, + "loss": 1.7812, + "step": 17225 + }, + { + "epoch": 6.9, + "grad_norm": 1.612432837486267, + "learning_rate": 3.0016296296296297e-05, + "loss": 1.8062, + "step": 17250 + }, + { + "epoch": 6.91, + "grad_norm": 1.664870262145996, + "learning_rate": 2.9979259259259262e-05, + "loss": 1.7445, + "step": 17275 + }, + { + "epoch": 6.92, + "grad_norm": 1.4798489809036255, + "learning_rate": 2.9943703703703708e-05, + "loss": 1.7788, + "step": 17300 + }, + { + "epoch": 6.93, + "grad_norm": 1.5232810974121094, + "learning_rate": 2.9906666666666666e-05, + "loss": 1.8383, + "step": 17325 + }, + { + "epoch": 6.9399999999999995, + "grad_norm": 1.4982616901397705, + "learning_rate": 2.986962962962963e-05, + "loss": 1.7302, + "step": 17350 + }, + { + "epoch": 6.95, + "grad_norm": 1.4312171936035156, + "learning_rate": 2.9832592592592596e-05, + "loss": 1.7573, + "step": 17375 + }, + { + "epoch": 6.96, + "grad_norm": 1.3876653909683228, + "learning_rate": 2.9795555555555554e-05, + "loss": 1.7684, + "step": 17400 + }, + { + "epoch": 6.97, + "grad_norm": 1.6542433500289917, + "learning_rate": 2.975851851851852e-05, + "loss": 1.7529, + "step": 17425 + }, + { + "epoch": 6.98, + "grad_norm": 1.4732959270477295, + "learning_rate": 2.9721481481481484e-05, + "loss": 1.8128, + "step": 17450 + }, + { + "epoch": 6.99, + "grad_norm": 1.4124960899353027, + "learning_rate": 2.968444444444445e-05, + "loss": 1.8079, + "step": 17475 + }, + { + "epoch": 7.0, + "grad_norm": 1.466747760772705, + "learning_rate": 2.9647407407407407e-05, + "loss": 1.8214, + "step": 17500 + }, + { + "epoch": 7.0, + "eval_gen_len": 13.125, + "eval_loss": 1.9262617826461792, + "eval_rouge1": 51.5538, + "eval_rouge2": 25.7728, + "eval_rougeL": 47.3812, + "eval_rougeLsum": 47.3727, + "eval_runtime": 196.7996, + "eval_samples_per_second": 101.626, + "eval_steps_per_second": 1.59, + "step": 17500 + } + ], + "logging_steps": 25, + "max_steps": 37500, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5719030035841024e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}