{ "best_metric": 1.9262617826461792, "best_model_checkpoint": "Paraphrase-v3/checkpoint-17500", "epoch": 7.0, "eval_steps": 500, "global_step": 17500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 11.671796798706055, "learning_rate": 3.2e-07, "loss": 3.3655, "step": 25 }, { "epoch": 0.02, "grad_norm": 9.20463752746582, "learning_rate": 6.533333333333334e-07, "loss": 3.2816, "step": 50 }, { "epoch": 0.03, "grad_norm": 10.489435195922852, "learning_rate": 9.733333333333335e-07, "loss": 3.2049, "step": 75 }, { "epoch": 0.04, "grad_norm": 8.335665702819824, "learning_rate": 1.2933333333333334e-06, "loss": 3.2663, "step": 100 }, { "epoch": 0.05, "grad_norm": 7.564672470092773, "learning_rate": 1.6133333333333333e-06, "loss": 3.0361, "step": 125 }, { "epoch": 0.06, "grad_norm": 6.445804595947266, "learning_rate": 1.946666666666667e-06, "loss": 3.0074, "step": 150 }, { "epoch": 0.07, "grad_norm": 5.889507293701172, "learning_rate": 2.28e-06, "loss": 2.9401, "step": 175 }, { "epoch": 0.08, "grad_norm": 5.881171226501465, "learning_rate": 2.6133333333333334e-06, "loss": 2.7827, "step": 200 }, { "epoch": 0.09, "grad_norm": 2.8032336235046387, "learning_rate": 2.9466666666666667e-06, "loss": 2.7671, "step": 225 }, { "epoch": 0.1, "grad_norm": 2.653895616531372, "learning_rate": 3.2800000000000004e-06, "loss": 2.6844, "step": 250 }, { "epoch": 0.11, "grad_norm": 6.168716907501221, "learning_rate": 3.613333333333334e-06, "loss": 2.5707, "step": 275 }, { "epoch": 0.12, "grad_norm": 6.516740322113037, "learning_rate": 3.9466666666666664e-06, "loss": 2.5428, "step": 300 }, { "epoch": 0.13, "grad_norm": 2.4983417987823486, "learning_rate": 4.28e-06, "loss": 2.57, "step": 325 }, { "epoch": 0.14, "grad_norm": 2.452354669570923, "learning_rate": 4.613333333333334e-06, "loss": 2.5167, "step": 350 }, { "epoch": 0.15, "grad_norm": 2.4193527698516846, "learning_rate": 4.946666666666667e-06, "loss": 2.4788, "step": 375 }, { "epoch": 0.16, "grad_norm": 2.7596585750579834, "learning_rate": 5.28e-06, "loss": 2.4994, "step": 400 }, { "epoch": 0.17, "grad_norm": 2.1304233074188232, "learning_rate": 5.6133333333333335e-06, "loss": 2.4324, "step": 425 }, { "epoch": 0.18, "grad_norm": 1.9708311557769775, "learning_rate": 5.946666666666667e-06, "loss": 2.4117, "step": 450 }, { "epoch": 0.19, "grad_norm": 2.2388558387756348, "learning_rate": 6.28e-06, "loss": 2.3727, "step": 475 }, { "epoch": 0.2, "grad_norm": 2.3298180103302, "learning_rate": 6.613333333333334e-06, "loss": 2.4732, "step": 500 }, { "epoch": 0.21, "grad_norm": 2.0654354095458984, "learning_rate": 6.9466666666666665e-06, "loss": 2.4922, "step": 525 }, { "epoch": 0.22, "grad_norm": 1.9771661758422852, "learning_rate": 7.280000000000001e-06, "loss": 2.3858, "step": 550 }, { "epoch": 0.23, "grad_norm": 2.3070411682128906, "learning_rate": 7.613333333333334e-06, "loss": 2.3785, "step": 575 }, { "epoch": 0.24, "grad_norm": 2.0033750534057617, "learning_rate": 7.946666666666668e-06, "loss": 2.369, "step": 600 }, { "epoch": 0.25, "grad_norm": 1.960777997970581, "learning_rate": 8.28e-06, "loss": 2.4246, "step": 625 }, { "epoch": 0.26, "grad_norm": 1.8081718683242798, "learning_rate": 8.613333333333334e-06, "loss": 2.3665, "step": 650 }, { "epoch": 0.27, "grad_norm": 1.8548564910888672, "learning_rate": 8.946666666666667e-06, "loss": 2.3306, "step": 675 }, { "epoch": 0.28, "grad_norm": 2.222216844558716, "learning_rate": 9.28e-06, "loss": 2.3112, "step": 700 }, { "epoch": 0.29, "grad_norm": 1.7288415431976318, "learning_rate": 9.613333333333333e-06, "loss": 2.3098, "step": 725 }, { "epoch": 0.3, "grad_norm": 2.0263378620147705, "learning_rate": 9.946666666666667e-06, "loss": 2.3304, "step": 750 }, { "epoch": 0.31, "grad_norm": 1.5405957698822021, "learning_rate": 1.0280000000000002e-05, "loss": 2.3302, "step": 775 }, { "epoch": 0.32, "grad_norm": 1.8870110511779785, "learning_rate": 1.0613333333333334e-05, "loss": 2.3241, "step": 800 }, { "epoch": 0.33, "grad_norm": 1.8393750190734863, "learning_rate": 1.0946666666666668e-05, "loss": 2.3361, "step": 825 }, { "epoch": 0.34, "grad_norm": 1.870672583580017, "learning_rate": 1.128e-05, "loss": 2.3178, "step": 850 }, { "epoch": 0.35, "grad_norm": 1.7924401760101318, "learning_rate": 1.1613333333333335e-05, "loss": 2.2558, "step": 875 }, { "epoch": 0.36, "grad_norm": 1.6780457496643066, "learning_rate": 1.1946666666666667e-05, "loss": 2.2928, "step": 900 }, { "epoch": 0.37, "grad_norm": 1.8860801458358765, "learning_rate": 1.2280000000000001e-05, "loss": 2.2523, "step": 925 }, { "epoch": 0.38, "grad_norm": 1.7925626039505005, "learning_rate": 1.2613333333333332e-05, "loss": 2.2885, "step": 950 }, { "epoch": 0.39, "grad_norm": 1.916331171989441, "learning_rate": 1.2946666666666668e-05, "loss": 2.3369, "step": 975 }, { "epoch": 0.4, "grad_norm": 1.6720538139343262, "learning_rate": 1.3280000000000002e-05, "loss": 2.2789, "step": 1000 }, { "epoch": 0.41, "grad_norm": 1.8012834787368774, "learning_rate": 1.3613333333333334e-05, "loss": 2.2818, "step": 1025 }, { "epoch": 0.42, "grad_norm": 1.82932710647583, "learning_rate": 1.3946666666666666e-05, "loss": 2.3173, "step": 1050 }, { "epoch": 0.43, "grad_norm": 1.8059712648391724, "learning_rate": 1.4280000000000002e-05, "loss": 2.266, "step": 1075 }, { "epoch": 0.44, "grad_norm": 1.4668495655059814, "learning_rate": 1.4613333333333335e-05, "loss": 2.2639, "step": 1100 }, { "epoch": 0.45, "grad_norm": 1.7411282062530518, "learning_rate": 1.4946666666666667e-05, "loss": 2.2816, "step": 1125 }, { "epoch": 0.46, "grad_norm": 1.5545501708984375, "learning_rate": 1.528e-05, "loss": 2.2578, "step": 1150 }, { "epoch": 0.47, "grad_norm": 1.6876237392425537, "learning_rate": 1.5613333333333335e-05, "loss": 2.3282, "step": 1175 }, { "epoch": 0.48, "grad_norm": 1.6718631982803345, "learning_rate": 1.5946666666666668e-05, "loss": 2.2713, "step": 1200 }, { "epoch": 0.49, "grad_norm": 1.628440499305725, "learning_rate": 1.628e-05, "loss": 2.2581, "step": 1225 }, { "epoch": 0.5, "grad_norm": 1.8468382358551025, "learning_rate": 1.6613333333333332e-05, "loss": 2.3533, "step": 1250 }, { "epoch": 0.51, "grad_norm": 1.8317992687225342, "learning_rate": 1.6946666666666665e-05, "loss": 2.2398, "step": 1275 }, { "epoch": 0.52, "grad_norm": 1.673600673675537, "learning_rate": 1.728e-05, "loss": 2.2879, "step": 1300 }, { "epoch": 0.53, "grad_norm": 1.5316749811172485, "learning_rate": 1.7613333333333333e-05, "loss": 2.2502, "step": 1325 }, { "epoch": 0.54, "grad_norm": 1.6259863376617432, "learning_rate": 1.794666666666667e-05, "loss": 2.2399, "step": 1350 }, { "epoch": 0.55, "grad_norm": 1.4696708917617798, "learning_rate": 1.828e-05, "loss": 2.233, "step": 1375 }, { "epoch": 0.56, "grad_norm": 1.9751217365264893, "learning_rate": 1.8613333333333337e-05, "loss": 2.2855, "step": 1400 }, { "epoch": 0.57, "grad_norm": 1.541063666343689, "learning_rate": 1.894666666666667e-05, "loss": 2.2782, "step": 1425 }, { "epoch": 0.58, "grad_norm": 1.525913953781128, "learning_rate": 1.9280000000000002e-05, "loss": 2.238, "step": 1450 }, { "epoch": 0.59, "grad_norm": 1.6467074155807495, "learning_rate": 1.9613333333333334e-05, "loss": 2.2921, "step": 1475 }, { "epoch": 0.6, "grad_norm": 1.6107714176177979, "learning_rate": 1.9946666666666667e-05, "loss": 2.2359, "step": 1500 }, { "epoch": 0.61, "grad_norm": 1.6940182447433472, "learning_rate": 2.0280000000000002e-05, "loss": 2.2137, "step": 1525 }, { "epoch": 0.62, "grad_norm": 1.7330864667892456, "learning_rate": 2.0613333333333335e-05, "loss": 2.211, "step": 1550 }, { "epoch": 0.63, "grad_norm": 1.847303032875061, "learning_rate": 2.0946666666666667e-05, "loss": 2.2547, "step": 1575 }, { "epoch": 0.64, "grad_norm": 1.3495903015136719, "learning_rate": 2.128e-05, "loss": 2.217, "step": 1600 }, { "epoch": 0.65, "grad_norm": 1.4377658367156982, "learning_rate": 2.1613333333333335e-05, "loss": 2.2179, "step": 1625 }, { "epoch": 0.66, "grad_norm": 1.9955512285232544, "learning_rate": 2.1946666666666668e-05, "loss": 2.2818, "step": 1650 }, { "epoch": 0.67, "grad_norm": 1.3620418310165405, "learning_rate": 2.228e-05, "loss": 2.2304, "step": 1675 }, { "epoch": 0.68, "grad_norm": 1.4833818674087524, "learning_rate": 2.2613333333333333e-05, "loss": 2.2016, "step": 1700 }, { "epoch": 0.69, "grad_norm": 1.4675543308258057, "learning_rate": 2.294666666666667e-05, "loss": 2.1752, "step": 1725 }, { "epoch": 0.7, "grad_norm": 1.5722428560256958, "learning_rate": 2.328e-05, "loss": 2.2749, "step": 1750 }, { "epoch": 0.71, "grad_norm": 1.711739182472229, "learning_rate": 2.3613333333333333e-05, "loss": 2.1855, "step": 1775 }, { "epoch": 0.72, "grad_norm": 1.5259678363800049, "learning_rate": 2.394666666666667e-05, "loss": 2.1757, "step": 1800 }, { "epoch": 0.73, "grad_norm": 1.4229743480682373, "learning_rate": 2.428e-05, "loss": 2.2109, "step": 1825 }, { "epoch": 0.74, "grad_norm": 1.5702102184295654, "learning_rate": 2.4613333333333337e-05, "loss": 2.1032, "step": 1850 }, { "epoch": 0.75, "grad_norm": 1.7006195783615112, "learning_rate": 2.494666666666667e-05, "loss": 2.1867, "step": 1875 }, { "epoch": 0.76, "grad_norm": 1.5293747186660767, "learning_rate": 2.5280000000000005e-05, "loss": 2.2308, "step": 1900 }, { "epoch": 0.77, "grad_norm": 2.1540281772613525, "learning_rate": 2.5613333333333334e-05, "loss": 2.184, "step": 1925 }, { "epoch": 0.78, "grad_norm": 1.6256271600723267, "learning_rate": 2.594666666666667e-05, "loss": 2.2499, "step": 1950 }, { "epoch": 0.79, "grad_norm": 1.394745111465454, "learning_rate": 2.628e-05, "loss": 2.1345, "step": 1975 }, { "epoch": 0.8, "grad_norm": 1.5799375772476196, "learning_rate": 2.6613333333333335e-05, "loss": 2.1641, "step": 2000 }, { "epoch": 0.81, "grad_norm": 1.4012045860290527, "learning_rate": 2.694666666666667e-05, "loss": 2.169, "step": 2025 }, { "epoch": 0.82, "grad_norm": 1.5689955949783325, "learning_rate": 2.728e-05, "loss": 2.1793, "step": 2050 }, { "epoch": 0.83, "grad_norm": 1.5047041177749634, "learning_rate": 2.7613333333333335e-05, "loss": 2.1984, "step": 2075 }, { "epoch": 0.84, "grad_norm": 1.399196743965149, "learning_rate": 2.7946666666666664e-05, "loss": 2.1996, "step": 2100 }, { "epoch": 0.85, "grad_norm": 1.7386200428009033, "learning_rate": 2.828e-05, "loss": 2.1799, "step": 2125 }, { "epoch": 0.86, "grad_norm": 1.6265965700149536, "learning_rate": 2.8613333333333336e-05, "loss": 2.1946, "step": 2150 }, { "epoch": 0.87, "grad_norm": 1.3947643041610718, "learning_rate": 2.8946666666666665e-05, "loss": 2.206, "step": 2175 }, { "epoch": 0.88, "grad_norm": 1.4571387767791748, "learning_rate": 2.928e-05, "loss": 2.1895, "step": 2200 }, { "epoch": 0.89, "grad_norm": 1.3659110069274902, "learning_rate": 2.9613333333333337e-05, "loss": 2.1662, "step": 2225 }, { "epoch": 0.9, "grad_norm": 1.6103181838989258, "learning_rate": 2.9946666666666666e-05, "loss": 2.2149, "step": 2250 }, { "epoch": 0.91, "grad_norm": 1.4823683500289917, "learning_rate": 3.028e-05, "loss": 2.2413, "step": 2275 }, { "epoch": 0.92, "grad_norm": 1.4936267137527466, "learning_rate": 3.0613333333333334e-05, "loss": 2.1555, "step": 2300 }, { "epoch": 0.93, "grad_norm": 1.487349271774292, "learning_rate": 3.0946666666666666e-05, "loss": 2.194, "step": 2325 }, { "epoch": 0.94, "grad_norm": 1.626238226890564, "learning_rate": 3.1280000000000005e-05, "loss": 2.1924, "step": 2350 }, { "epoch": 0.95, "grad_norm": 1.4786629676818848, "learning_rate": 3.161333333333333e-05, "loss": 2.1831, "step": 2375 }, { "epoch": 0.96, "grad_norm": 3.430049419403076, "learning_rate": 3.194666666666667e-05, "loss": 2.1984, "step": 2400 }, { "epoch": 0.97, "grad_norm": 1.4396685361862183, "learning_rate": 3.2279999999999996e-05, "loss": 2.1214, "step": 2425 }, { "epoch": 0.98, "grad_norm": 1.5446324348449707, "learning_rate": 3.2613333333333335e-05, "loss": 2.1869, "step": 2450 }, { "epoch": 0.99, "grad_norm": 1.591899037361145, "learning_rate": 3.294666666666667e-05, "loss": 2.166, "step": 2475 }, { "epoch": 1.0, "grad_norm": 1.7222836017608643, "learning_rate": 3.328e-05, "loss": 2.1121, "step": 2500 }, { "epoch": 1.0, "eval_gen_len": 13.0372, "eval_loss": 2.038074016571045, "eval_rouge1": 50.5825, "eval_rouge2": 24.8269, "eval_rougeL": 46.4789, "eval_rougeLsum": 46.4745, "eval_runtime": 194.3582, "eval_samples_per_second": 102.903, "eval_steps_per_second": 1.61, "step": 2500 }, { "epoch": 1.01, "grad_norm": 1.6080988645553589, "learning_rate": 3.361333333333333e-05, "loss": 2.1864, "step": 2525 }, { "epoch": 1.02, "grad_norm": 1.4214496612548828, "learning_rate": 3.394666666666667e-05, "loss": 2.1115, "step": 2550 }, { "epoch": 1.03, "grad_norm": 1.4816479682922363, "learning_rate": 3.4280000000000004e-05, "loss": 2.1319, "step": 2575 }, { "epoch": 1.04, "grad_norm": 1.5511168241500854, "learning_rate": 3.4613333333333336e-05, "loss": 2.1639, "step": 2600 }, { "epoch": 1.05, "grad_norm": 1.5296896696090698, "learning_rate": 3.494666666666667e-05, "loss": 2.1616, "step": 2625 }, { "epoch": 1.06, "grad_norm": 1.540246605873108, "learning_rate": 3.528e-05, "loss": 2.1357, "step": 2650 }, { "epoch": 1.07, "grad_norm": 1.5819247961044312, "learning_rate": 3.561333333333334e-05, "loss": 2.1471, "step": 2675 }, { "epoch": 1.08, "grad_norm": 1.4258027076721191, "learning_rate": 3.5946666666666666e-05, "loss": 2.113, "step": 2700 }, { "epoch": 1.09, "grad_norm": 1.356101393699646, "learning_rate": 3.6280000000000005e-05, "loss": 2.1027, "step": 2725 }, { "epoch": 1.1, "grad_norm": 1.754785418510437, "learning_rate": 3.661333333333333e-05, "loss": 2.1101, "step": 2750 }, { "epoch": 1.11, "grad_norm": 4.111976146697998, "learning_rate": 3.6933333333333334e-05, "loss": 2.1733, "step": 2775 }, { "epoch": 1.12, "grad_norm": 1.40675687789917, "learning_rate": 3.726666666666667e-05, "loss": 2.1599, "step": 2800 }, { "epoch": 1.13, "grad_norm": 1.312162160873413, "learning_rate": 3.76e-05, "loss": 2.092, "step": 2825 }, { "epoch": 1.1400000000000001, "grad_norm": 1.3412392139434814, "learning_rate": 3.793333333333334e-05, "loss": 2.1503, "step": 2850 }, { "epoch": 1.15, "grad_norm": 1.3180032968521118, "learning_rate": 3.8266666666666664e-05, "loss": 2.1323, "step": 2875 }, { "epoch": 1.16, "grad_norm": 1.4078686237335205, "learning_rate": 3.86e-05, "loss": 2.1663, "step": 2900 }, { "epoch": 1.17, "grad_norm": 1.4193419218063354, "learning_rate": 3.8933333333333336e-05, "loss": 2.1407, "step": 2925 }, { "epoch": 1.18, "grad_norm": 1.5237839221954346, "learning_rate": 3.926666666666667e-05, "loss": 2.1884, "step": 2950 }, { "epoch": 1.19, "grad_norm": 1.3546311855316162, "learning_rate": 3.960000000000001e-05, "loss": 2.0769, "step": 2975 }, { "epoch": 1.2, "grad_norm": 1.1709140539169312, "learning_rate": 3.993333333333333e-05, "loss": 1.9888, "step": 3000 }, { "epoch": 1.21, "grad_norm": 1.3492522239685059, "learning_rate": 4.026666666666667e-05, "loss": 2.0922, "step": 3025 }, { "epoch": 1.22, "grad_norm": 1.5517680644989014, "learning_rate": 4.0600000000000004e-05, "loss": 2.1161, "step": 3050 }, { "epoch": 1.23, "grad_norm": 1.3444249629974365, "learning_rate": 4.093333333333334e-05, "loss": 2.1351, "step": 3075 }, { "epoch": 1.24, "grad_norm": 1.4553472995758057, "learning_rate": 4.126666666666667e-05, "loss": 2.1889, "step": 3100 }, { "epoch": 1.25, "grad_norm": 1.4636095762252808, "learning_rate": 4.16e-05, "loss": 2.1161, "step": 3125 }, { "epoch": 1.26, "grad_norm": 1.3863240480422974, "learning_rate": 4.1933333333333334e-05, "loss": 2.0402, "step": 3150 }, { "epoch": 1.27, "grad_norm": 1.2597577571868896, "learning_rate": 4.226666666666667e-05, "loss": 2.0852, "step": 3175 }, { "epoch": 1.28, "grad_norm": 1.3540936708450317, "learning_rate": 4.26e-05, "loss": 2.1439, "step": 3200 }, { "epoch": 1.29, "grad_norm": 1.4448434114456177, "learning_rate": 4.293333333333334e-05, "loss": 2.1271, "step": 3225 }, { "epoch": 1.3, "grad_norm": 1.352243185043335, "learning_rate": 4.3266666666666664e-05, "loss": 2.1263, "step": 3250 }, { "epoch": 1.31, "grad_norm": 1.5224635601043701, "learning_rate": 4.36e-05, "loss": 2.1373, "step": 3275 }, { "epoch": 1.32, "grad_norm": 1.51206636428833, "learning_rate": 4.3933333333333335e-05, "loss": 2.1521, "step": 3300 }, { "epoch": 1.33, "grad_norm": 1.4777268171310425, "learning_rate": 4.426666666666667e-05, "loss": 2.0391, "step": 3325 }, { "epoch": 1.34, "grad_norm": 1.278085708618164, "learning_rate": 4.46e-05, "loss": 2.1313, "step": 3350 }, { "epoch": 1.35, "grad_norm": 1.4700874090194702, "learning_rate": 4.493333333333333e-05, "loss": 2.0469, "step": 3375 }, { "epoch": 1.3599999999999999, "grad_norm": 1.6795507669448853, "learning_rate": 4.526666666666667e-05, "loss": 2.1422, "step": 3400 }, { "epoch": 1.37, "grad_norm": 1.496156096458435, "learning_rate": 4.5600000000000004e-05, "loss": 2.0771, "step": 3425 }, { "epoch": 1.38, "grad_norm": 1.4133617877960205, "learning_rate": 4.5933333333333336e-05, "loss": 2.0397, "step": 3450 }, { "epoch": 1.3900000000000001, "grad_norm": 1.3693889379501343, "learning_rate": 4.626666666666667e-05, "loss": 2.1713, "step": 3475 }, { "epoch": 1.4, "grad_norm": 1.5390167236328125, "learning_rate": 4.660000000000001e-05, "loss": 2.1348, "step": 3500 }, { "epoch": 1.41, "grad_norm": 1.7128788232803345, "learning_rate": 4.6933333333333333e-05, "loss": 2.0817, "step": 3525 }, { "epoch": 1.42, "grad_norm": 1.4634878635406494, "learning_rate": 4.726666666666667e-05, "loss": 2.1121, "step": 3550 }, { "epoch": 1.43, "grad_norm": 2.623915672302246, "learning_rate": 4.76e-05, "loss": 2.1968, "step": 3575 }, { "epoch": 1.44, "grad_norm": 1.4591940641403198, "learning_rate": 4.793333333333334e-05, "loss": 2.0789, "step": 3600 }, { "epoch": 1.45, "grad_norm": 1.2940136194229126, "learning_rate": 4.826666666666667e-05, "loss": 2.147, "step": 3625 }, { "epoch": 1.46, "grad_norm": 1.283172845840454, "learning_rate": 4.86e-05, "loss": 2.0765, "step": 3650 }, { "epoch": 1.47, "grad_norm": 1.3233940601348877, "learning_rate": 4.8933333333333335e-05, "loss": 2.1018, "step": 3675 }, { "epoch": 1.48, "grad_norm": 1.6195733547210693, "learning_rate": 4.926666666666667e-05, "loss": 2.1608, "step": 3700 }, { "epoch": 1.49, "grad_norm": 1.5443871021270752, "learning_rate": 4.96e-05, "loss": 2.0899, "step": 3725 }, { "epoch": 1.5, "grad_norm": 1.3754618167877197, "learning_rate": 4.993333333333334e-05, "loss": 2.1288, "step": 3750 }, { "epoch": 1.51, "grad_norm": 1.2181512117385864, "learning_rate": 4.997037037037037e-05, "loss": 2.1582, "step": 3775 }, { "epoch": 1.52, "grad_norm": 1.4862068891525269, "learning_rate": 4.993333333333334e-05, "loss": 2.1475, "step": 3800 }, { "epoch": 1.53, "grad_norm": 1.3613662719726562, "learning_rate": 4.9896296296296293e-05, "loss": 2.1625, "step": 3825 }, { "epoch": 1.54, "grad_norm": 1.3887194395065308, "learning_rate": 4.985925925925926e-05, "loss": 2.1284, "step": 3850 }, { "epoch": 1.55, "grad_norm": 1.468736171722412, "learning_rate": 4.982222222222222e-05, "loss": 2.0792, "step": 3875 }, { "epoch": 1.56, "grad_norm": 1.4086352586746216, "learning_rate": 4.978518518518519e-05, "loss": 2.0997, "step": 3900 }, { "epoch": 1.5699999999999998, "grad_norm": 1.3831346035003662, "learning_rate": 4.9748148148148146e-05, "loss": 2.0768, "step": 3925 }, { "epoch": 1.58, "grad_norm": 1.2189925909042358, "learning_rate": 4.9711111111111115e-05, "loss": 2.0809, "step": 3950 }, { "epoch": 1.5899999999999999, "grad_norm": 1.2166587114334106, "learning_rate": 4.9674074074074076e-05, "loss": 2.128, "step": 3975 }, { "epoch": 1.6, "grad_norm": 1.5292308330535889, "learning_rate": 4.963703703703704e-05, "loss": 2.1093, "step": 4000 }, { "epoch": 1.6099999999999999, "grad_norm": 1.492801308631897, "learning_rate": 4.96e-05, "loss": 2.116, "step": 4025 }, { "epoch": 1.62, "grad_norm": 1.4455024003982544, "learning_rate": 4.956296296296297e-05, "loss": 2.1292, "step": 4050 }, { "epoch": 1.63, "grad_norm": 1.3066167831420898, "learning_rate": 4.952592592592592e-05, "loss": 2.1011, "step": 4075 }, { "epoch": 1.6400000000000001, "grad_norm": 1.496256947517395, "learning_rate": 4.949037037037037e-05, "loss": 2.1056, "step": 4100 }, { "epoch": 1.65, "grad_norm": 1.2948390245437622, "learning_rate": 4.9453333333333336e-05, "loss": 2.0569, "step": 4125 }, { "epoch": 1.6600000000000001, "grad_norm": 1.4984357357025146, "learning_rate": 4.94162962962963e-05, "loss": 2.1433, "step": 4150 }, { "epoch": 1.67, "grad_norm": 1.7721185684204102, "learning_rate": 4.9379259259259266e-05, "loss": 2.1098, "step": 4175 }, { "epoch": 1.6800000000000002, "grad_norm": 1.3611931800842285, "learning_rate": 4.934222222222222e-05, "loss": 2.0652, "step": 4200 }, { "epoch": 1.69, "grad_norm": 1.3092522621154785, "learning_rate": 4.930518518518519e-05, "loss": 2.0888, "step": 4225 }, { "epoch": 1.7, "grad_norm": 1.3880172967910767, "learning_rate": 4.926814814814815e-05, "loss": 2.0751, "step": 4250 }, { "epoch": 1.71, "grad_norm": 1.361522912979126, "learning_rate": 4.923111111111111e-05, "loss": 2.1263, "step": 4275 }, { "epoch": 1.72, "grad_norm": 1.357974886894226, "learning_rate": 4.9194074074074074e-05, "loss": 2.0967, "step": 4300 }, { "epoch": 1.73, "grad_norm": 1.3499886989593506, "learning_rate": 4.915703703703704e-05, "loss": 2.0426, "step": 4325 }, { "epoch": 1.74, "grad_norm": 1.4301838874816895, "learning_rate": 4.9120000000000004e-05, "loss": 2.0596, "step": 4350 }, { "epoch": 1.75, "grad_norm": 1.4266048669815063, "learning_rate": 4.9082962962962966e-05, "loss": 2.0574, "step": 4375 }, { "epoch": 1.76, "grad_norm": 1.4211664199829102, "learning_rate": 4.904592592592593e-05, "loss": 2.0839, "step": 4400 }, { "epoch": 1.77, "grad_norm": 1.2608351707458496, "learning_rate": 4.9008888888888896e-05, "loss": 2.037, "step": 4425 }, { "epoch": 1.78, "grad_norm": 1.35879385471344, "learning_rate": 4.897185185185185e-05, "loss": 2.0807, "step": 4450 }, { "epoch": 1.79, "grad_norm": 1.4060136079788208, "learning_rate": 4.893481481481482e-05, "loss": 2.1474, "step": 4475 }, { "epoch": 1.8, "grad_norm": 1.3391823768615723, "learning_rate": 4.889777777777778e-05, "loss": 2.115, "step": 4500 }, { "epoch": 1.81, "grad_norm": 1.6455671787261963, "learning_rate": 4.886074074074075e-05, "loss": 2.0943, "step": 4525 }, { "epoch": 1.8199999999999998, "grad_norm": 1.3512789011001587, "learning_rate": 4.88237037037037e-05, "loss": 2.0675, "step": 4550 }, { "epoch": 1.83, "grad_norm": 1.3271268606185913, "learning_rate": 4.878666666666667e-05, "loss": 2.139, "step": 4575 }, { "epoch": 1.8399999999999999, "grad_norm": 1.3180307149887085, "learning_rate": 4.874962962962963e-05, "loss": 2.0442, "step": 4600 }, { "epoch": 1.85, "grad_norm": 1.322908878326416, "learning_rate": 4.8712592592592595e-05, "loss": 2.1164, "step": 4625 }, { "epoch": 1.8599999999999999, "grad_norm": 1.2810012102127075, "learning_rate": 4.8675555555555556e-05, "loss": 2.0138, "step": 4650 }, { "epoch": 1.87, "grad_norm": 1.3733242750167847, "learning_rate": 4.8638518518518525e-05, "loss": 2.0639, "step": 4675 }, { "epoch": 1.88, "grad_norm": 1.3254153728485107, "learning_rate": 4.860148148148148e-05, "loss": 2.0502, "step": 4700 }, { "epoch": 1.8900000000000001, "grad_norm": 1.3326213359832764, "learning_rate": 4.856444444444445e-05, "loss": 2.0394, "step": 4725 }, { "epoch": 1.9, "grad_norm": 1.3088740110397339, "learning_rate": 4.852740740740741e-05, "loss": 2.1614, "step": 4750 }, { "epoch": 1.9100000000000001, "grad_norm": 1.4830114841461182, "learning_rate": 4.849037037037038e-05, "loss": 2.0836, "step": 4775 }, { "epoch": 1.92, "grad_norm": 1.5083235502243042, "learning_rate": 4.845333333333333e-05, "loss": 2.084, "step": 4800 }, { "epoch": 1.9300000000000002, "grad_norm": 1.4989155530929565, "learning_rate": 4.84162962962963e-05, "loss": 2.1183, "step": 4825 }, { "epoch": 1.94, "grad_norm": 1.2256449460983276, "learning_rate": 4.837925925925926e-05, "loss": 2.1002, "step": 4850 }, { "epoch": 1.95, "grad_norm": 1.2824608087539673, "learning_rate": 4.8342222222222224e-05, "loss": 2.0369, "step": 4875 }, { "epoch": 1.96, "grad_norm": 1.2830954790115356, "learning_rate": 4.8305185185185185e-05, "loss": 2.0361, "step": 4900 }, { "epoch": 1.97, "grad_norm": 1.4407249689102173, "learning_rate": 4.8268148148148154e-05, "loss": 2.0535, "step": 4925 }, { "epoch": 1.98, "grad_norm": 1.3975064754486084, "learning_rate": 4.8231111111111115e-05, "loss": 2.0805, "step": 4950 }, { "epoch": 1.99, "grad_norm": 1.2193372249603271, "learning_rate": 4.819407407407408e-05, "loss": 2.0748, "step": 4975 }, { "epoch": 2.0, "grad_norm": 1.4137203693389893, "learning_rate": 4.815703703703704e-05, "loss": 2.0812, "step": 5000 }, { "epoch": 2.0, "eval_gen_len": 13.1104, "eval_loss": 1.972951054573059, "eval_rouge1": 51.236, "eval_rouge2": 25.4764, "eval_rougeL": 47.1309, "eval_rougeLsum": 47.1388, "eval_runtime": 195.1159, "eval_samples_per_second": 102.503, "eval_steps_per_second": 1.604, "step": 5000 }, { "epoch": 2.01, "grad_norm": 1.292515754699707, "learning_rate": 4.812000000000001e-05, "loss": 1.9586, "step": 5025 }, { "epoch": 2.02, "grad_norm": 1.329329490661621, "learning_rate": 4.808296296296296e-05, "loss": 2.0068, "step": 5050 }, { "epoch": 2.03, "grad_norm": 1.5336360931396484, "learning_rate": 4.804592592592593e-05, "loss": 1.986, "step": 5075 }, { "epoch": 2.04, "grad_norm": 1.313008427619934, "learning_rate": 4.800888888888889e-05, "loss": 2.0093, "step": 5100 }, { "epoch": 2.05, "grad_norm": 1.390008807182312, "learning_rate": 4.797185185185186e-05, "loss": 2.0478, "step": 5125 }, { "epoch": 2.06, "grad_norm": 1.4343863725662231, "learning_rate": 4.7934814814814815e-05, "loss": 2.0325, "step": 5150 }, { "epoch": 2.07, "grad_norm": 1.3213175535202026, "learning_rate": 4.789777777777778e-05, "loss": 2.0362, "step": 5175 }, { "epoch": 2.08, "grad_norm": 1.2570987939834595, "learning_rate": 4.7860740740740745e-05, "loss": 1.9476, "step": 5200 }, { "epoch": 2.09, "grad_norm": 1.377868413925171, "learning_rate": 4.7823703703703706e-05, "loss": 1.9655, "step": 5225 }, { "epoch": 2.1, "grad_norm": 1.1855210065841675, "learning_rate": 4.778666666666667e-05, "loss": 2.0382, "step": 5250 }, { "epoch": 2.11, "grad_norm": 1.3284786939620972, "learning_rate": 4.7749629629629636e-05, "loss": 2.0808, "step": 5275 }, { "epoch": 2.12, "grad_norm": 1.4279937744140625, "learning_rate": 4.77125925925926e-05, "loss": 2.0179, "step": 5300 }, { "epoch": 2.13, "grad_norm": 1.5963534116744995, "learning_rate": 4.767555555555556e-05, "loss": 1.9902, "step": 5325 }, { "epoch": 2.14, "grad_norm": 1.3769519329071045, "learning_rate": 4.763851851851852e-05, "loss": 1.9842, "step": 5350 }, { "epoch": 2.15, "grad_norm": 1.3761203289031982, "learning_rate": 4.760148148148149e-05, "loss": 2.0256, "step": 5375 }, { "epoch": 2.16, "grad_norm": 1.4661396741867065, "learning_rate": 4.7564444444444444e-05, "loss": 2.0704, "step": 5400 }, { "epoch": 2.17, "grad_norm": 1.301453709602356, "learning_rate": 4.752740740740741e-05, "loss": 1.9957, "step": 5425 }, { "epoch": 2.18, "grad_norm": 1.3096243143081665, "learning_rate": 4.7490370370370374e-05, "loss": 2.0023, "step": 5450 }, { "epoch": 2.19, "grad_norm": 1.3345898389816284, "learning_rate": 4.7453333333333335e-05, "loss": 2.0472, "step": 5475 }, { "epoch": 2.2, "grad_norm": 1.1880935430526733, "learning_rate": 4.74162962962963e-05, "loss": 1.9565, "step": 5500 }, { "epoch": 2.21, "grad_norm": 1.2483596801757812, "learning_rate": 4.7379259259259265e-05, "loss": 2.0419, "step": 5525 }, { "epoch": 2.22, "grad_norm": 1.3003907203674316, "learning_rate": 4.734222222222223e-05, "loss": 2.0411, "step": 5550 }, { "epoch": 2.23, "grad_norm": 1.3677852153778076, "learning_rate": 4.730518518518519e-05, "loss": 2.0454, "step": 5575 }, { "epoch": 2.24, "grad_norm": 1.3494675159454346, "learning_rate": 4.726814814814815e-05, "loss": 1.9652, "step": 5600 }, { "epoch": 2.25, "grad_norm": 1.2623995542526245, "learning_rate": 4.723111111111112e-05, "loss": 2.0516, "step": 5625 }, { "epoch": 2.26, "grad_norm": 1.4136385917663574, "learning_rate": 4.719407407407407e-05, "loss": 1.9735, "step": 5650 }, { "epoch": 2.27, "grad_norm": 1.2677298784255981, "learning_rate": 4.715703703703704e-05, "loss": 1.9916, "step": 5675 }, { "epoch": 2.2800000000000002, "grad_norm": 1.422560453414917, "learning_rate": 4.712e-05, "loss": 2.0216, "step": 5700 }, { "epoch": 2.29, "grad_norm": 1.4796884059906006, "learning_rate": 4.7082962962962964e-05, "loss": 2.0134, "step": 5725 }, { "epoch": 2.3, "grad_norm": 1.347684383392334, "learning_rate": 4.7045925925925926e-05, "loss": 2.0073, "step": 5750 }, { "epoch": 2.31, "grad_norm": 1.238133192062378, "learning_rate": 4.7008888888888894e-05, "loss": 2.0002, "step": 5775 }, { "epoch": 2.32, "grad_norm": 1.271411657333374, "learning_rate": 4.6971851851851856e-05, "loss": 1.9866, "step": 5800 }, { "epoch": 2.33, "grad_norm": 1.4188841581344604, "learning_rate": 4.693481481481482e-05, "loss": 1.96, "step": 5825 }, { "epoch": 2.34, "grad_norm": 1.3498203754425049, "learning_rate": 4.689777777777778e-05, "loss": 2.0338, "step": 5850 }, { "epoch": 2.35, "grad_norm": 1.224448800086975, "learning_rate": 4.686074074074074e-05, "loss": 2.0572, "step": 5875 }, { "epoch": 2.36, "grad_norm": 1.4224382638931274, "learning_rate": 4.682370370370371e-05, "loss": 2.0333, "step": 5900 }, { "epoch": 2.37, "grad_norm": 1.553787350654602, "learning_rate": 4.678666666666667e-05, "loss": 1.9975, "step": 5925 }, { "epoch": 2.38, "grad_norm": 1.377954125404358, "learning_rate": 4.674962962962963e-05, "loss": 2.0317, "step": 5950 }, { "epoch": 2.39, "grad_norm": 1.4469361305236816, "learning_rate": 4.6712592592592594e-05, "loss": 2.0382, "step": 5975 }, { "epoch": 2.4, "grad_norm": 1.3143268823623657, "learning_rate": 4.6675555555555555e-05, "loss": 2.0107, "step": 6000 }, { "epoch": 2.41, "grad_norm": 1.322283148765564, "learning_rate": 4.6638518518518523e-05, "loss": 1.9968, "step": 6025 }, { "epoch": 2.42, "grad_norm": 1.5277537107467651, "learning_rate": 4.6601481481481485e-05, "loss": 2.0148, "step": 6050 }, { "epoch": 2.43, "grad_norm": 1.3555701971054077, "learning_rate": 4.6564444444444447e-05, "loss": 1.9547, "step": 6075 }, { "epoch": 2.44, "grad_norm": 1.3030173778533936, "learning_rate": 4.652740740740741e-05, "loss": 2.0186, "step": 6100 }, { "epoch": 2.45, "grad_norm": 1.1996219158172607, "learning_rate": 4.6491851851851854e-05, "loss": 1.9938, "step": 6125 }, { "epoch": 2.46, "grad_norm": 1.2713743448257446, "learning_rate": 4.6454814814814815e-05, "loss": 2.0135, "step": 6150 }, { "epoch": 2.4699999999999998, "grad_norm": 1.4466146230697632, "learning_rate": 4.6417777777777784e-05, "loss": 2.0027, "step": 6175 }, { "epoch": 2.48, "grad_norm": 1.3012115955352783, "learning_rate": 4.638074074074074e-05, "loss": 2.0462, "step": 6200 }, { "epoch": 2.49, "grad_norm": 1.4161940813064575, "learning_rate": 4.634370370370371e-05, "loss": 2.0416, "step": 6225 }, { "epoch": 2.5, "grad_norm": 1.382034420967102, "learning_rate": 4.630666666666667e-05, "loss": 1.9795, "step": 6250 }, { "epoch": 2.51, "grad_norm": 1.4618980884552002, "learning_rate": 4.626962962962963e-05, "loss": 2.0163, "step": 6275 }, { "epoch": 2.52, "grad_norm": 1.304030179977417, "learning_rate": 4.623259259259259e-05, "loss": 1.9821, "step": 6300 }, { "epoch": 2.5300000000000002, "grad_norm": 1.5101556777954102, "learning_rate": 4.619555555555556e-05, "loss": 1.9863, "step": 6325 }, { "epoch": 2.54, "grad_norm": 1.3217978477478027, "learning_rate": 4.615851851851852e-05, "loss": 2.0009, "step": 6350 }, { "epoch": 2.55, "grad_norm": 1.367722749710083, "learning_rate": 4.612148148148148e-05, "loss": 2.0459, "step": 6375 }, { "epoch": 2.56, "grad_norm": 1.5014063119888306, "learning_rate": 4.6084444444444444e-05, "loss": 2.0129, "step": 6400 }, { "epoch": 2.57, "grad_norm": 1.4557509422302246, "learning_rate": 4.604740740740741e-05, "loss": 1.9869, "step": 6425 }, { "epoch": 2.58, "grad_norm": 1.3464524745941162, "learning_rate": 4.601037037037037e-05, "loss": 2.0133, "step": 6450 }, { "epoch": 2.59, "grad_norm": 1.41274893283844, "learning_rate": 4.5973333333333336e-05, "loss": 1.996, "step": 6475 }, { "epoch": 2.6, "grad_norm": 1.3834503889083862, "learning_rate": 4.59362962962963e-05, "loss": 2.0057, "step": 6500 }, { "epoch": 2.61, "grad_norm": 1.4154237508773804, "learning_rate": 4.5899259259259266e-05, "loss": 2.0168, "step": 6525 }, { "epoch": 2.62, "grad_norm": 1.2246060371398926, "learning_rate": 4.586222222222222e-05, "loss": 1.9689, "step": 6550 }, { "epoch": 2.63, "grad_norm": 1.5142685174942017, "learning_rate": 4.582518518518519e-05, "loss": 2.0025, "step": 6575 }, { "epoch": 2.64, "grad_norm": 1.6933802366256714, "learning_rate": 4.578814814814815e-05, "loss": 2.0064, "step": 6600 }, { "epoch": 2.65, "grad_norm": 1.3439992666244507, "learning_rate": 4.575111111111111e-05, "loss": 1.9638, "step": 6625 }, { "epoch": 2.66, "grad_norm": 1.4206079244613647, "learning_rate": 4.5714074074074074e-05, "loss": 1.9888, "step": 6650 }, { "epoch": 2.67, "grad_norm": 1.3695896863937378, "learning_rate": 4.567703703703704e-05, "loss": 1.9735, "step": 6675 }, { "epoch": 2.68, "grad_norm": 1.2460039854049683, "learning_rate": 4.564e-05, "loss": 2.0707, "step": 6700 }, { "epoch": 2.69, "grad_norm": 1.2507898807525635, "learning_rate": 4.5602962962962965e-05, "loss": 2.0131, "step": 6725 }, { "epoch": 2.7, "grad_norm": 1.2939226627349854, "learning_rate": 4.5565925925925927e-05, "loss": 1.9902, "step": 6750 }, { "epoch": 2.71, "grad_norm": 1.4121019840240479, "learning_rate": 4.5528888888888895e-05, "loss": 1.9906, "step": 6775 }, { "epoch": 2.7199999999999998, "grad_norm": 1.1183571815490723, "learning_rate": 4.549185185185185e-05, "loss": 2.0259, "step": 6800 }, { "epoch": 2.73, "grad_norm": 1.4369187355041504, "learning_rate": 4.545481481481482e-05, "loss": 1.963, "step": 6825 }, { "epoch": 2.74, "grad_norm": 1.1758921146392822, "learning_rate": 4.541777777777778e-05, "loss": 1.9743, "step": 6850 }, { "epoch": 2.75, "grad_norm": 1.3526884317398071, "learning_rate": 4.538074074074074e-05, "loss": 2.0403, "step": 6875 }, { "epoch": 2.76, "grad_norm": 1.2723559141159058, "learning_rate": 4.53437037037037e-05, "loss": 1.958, "step": 6900 }, { "epoch": 2.77, "grad_norm": 1.4061169624328613, "learning_rate": 4.530666666666667e-05, "loss": 2.0213, "step": 6925 }, { "epoch": 2.7800000000000002, "grad_norm": 1.292668104171753, "learning_rate": 4.526962962962963e-05, "loss": 2.0301, "step": 6950 }, { "epoch": 2.79, "grad_norm": 1.2787526845932007, "learning_rate": 4.5232592592592594e-05, "loss": 1.9822, "step": 6975 }, { "epoch": 2.8, "grad_norm": 1.2812767028808594, "learning_rate": 4.5195555555555556e-05, "loss": 1.9919, "step": 7000 }, { "epoch": 2.81, "grad_norm": 1.2720341682434082, "learning_rate": 4.5158518518518524e-05, "loss": 2.0142, "step": 7025 }, { "epoch": 2.82, "grad_norm": 1.3361945152282715, "learning_rate": 4.512148148148148e-05, "loss": 2.0006, "step": 7050 }, { "epoch": 2.83, "grad_norm": 1.1397879123687744, "learning_rate": 4.508444444444445e-05, "loss": 1.9674, "step": 7075 }, { "epoch": 2.84, "grad_norm": 1.3024892807006836, "learning_rate": 4.504740740740741e-05, "loss": 1.969, "step": 7100 }, { "epoch": 2.85, "grad_norm": 1.2149879932403564, "learning_rate": 4.501037037037038e-05, "loss": 1.9873, "step": 7125 }, { "epoch": 2.86, "grad_norm": 1.2626320123672485, "learning_rate": 4.497333333333333e-05, "loss": 1.9552, "step": 7150 }, { "epoch": 2.87, "grad_norm": 1.3262602090835571, "learning_rate": 4.49362962962963e-05, "loss": 1.9485, "step": 7175 }, { "epoch": 2.88, "grad_norm": 1.4364970922470093, "learning_rate": 4.489925925925926e-05, "loss": 2.0553, "step": 7200 }, { "epoch": 2.89, "grad_norm": 1.2992265224456787, "learning_rate": 4.486222222222222e-05, "loss": 2.0324, "step": 7225 }, { "epoch": 2.9, "grad_norm": 1.246061086654663, "learning_rate": 4.4825185185185185e-05, "loss": 1.9706, "step": 7250 }, { "epoch": 2.91, "grad_norm": 1.625594139099121, "learning_rate": 4.478814814814815e-05, "loss": 1.9419, "step": 7275 }, { "epoch": 2.92, "grad_norm": 1.2219802141189575, "learning_rate": 4.4751111111111115e-05, "loss": 2.0337, "step": 7300 }, { "epoch": 2.93, "grad_norm": 1.436552882194519, "learning_rate": 4.4714074074074076e-05, "loss": 1.9975, "step": 7325 }, { "epoch": 2.94, "grad_norm": 1.3120185136795044, "learning_rate": 4.467703703703704e-05, "loss": 2.0306, "step": 7350 }, { "epoch": 2.95, "grad_norm": 1.2584844827651978, "learning_rate": 4.4640000000000006e-05, "loss": 2.0335, "step": 7375 }, { "epoch": 2.96, "grad_norm": 1.4038913249969482, "learning_rate": 4.460296296296296e-05, "loss": 1.9072, "step": 7400 }, { "epoch": 2.9699999999999998, "grad_norm": 1.2124029397964478, "learning_rate": 4.456592592592593e-05, "loss": 2.0269, "step": 7425 }, { "epoch": 2.98, "grad_norm": 1.45619797706604, "learning_rate": 4.452888888888889e-05, "loss": 1.9908, "step": 7450 }, { "epoch": 2.99, "grad_norm": 1.2446014881134033, "learning_rate": 4.449185185185185e-05, "loss": 2.0035, "step": 7475 }, { "epoch": 3.0, "grad_norm": 1.4367069005966187, "learning_rate": 4.4454814814814814e-05, "loss": 2.0286, "step": 7500 }, { "epoch": 3.0, "eval_gen_len": 13.1299, "eval_loss": 1.9462145566940308, "eval_rouge1": 51.2672, "eval_rouge2": 25.5438, "eval_rougeL": 47.1691, "eval_rougeLsum": 47.1653, "eval_runtime": 194.6381, "eval_samples_per_second": 102.755, "eval_steps_per_second": 1.608, "step": 7500 }, { "epoch": 3.01, "grad_norm": 1.4803961515426636, "learning_rate": 4.441777777777778e-05, "loss": 1.8803, "step": 7525 }, { "epoch": 3.02, "grad_norm": 1.3261098861694336, "learning_rate": 4.4380740740740744e-05, "loss": 1.9219, "step": 7550 }, { "epoch": 3.03, "grad_norm": 1.2701349258422852, "learning_rate": 4.4343703703703706e-05, "loss": 1.9066, "step": 7575 }, { "epoch": 3.04, "grad_norm": 1.4366850852966309, "learning_rate": 4.430666666666667e-05, "loss": 1.9186, "step": 7600 }, { "epoch": 3.05, "grad_norm": 1.304621934890747, "learning_rate": 4.4269629629629635e-05, "loss": 1.9499, "step": 7625 }, { "epoch": 3.06, "grad_norm": 1.4453222751617432, "learning_rate": 4.423259259259259e-05, "loss": 1.8863, "step": 7650 }, { "epoch": 3.07, "grad_norm": 1.1800676584243774, "learning_rate": 4.419555555555556e-05, "loss": 1.8879, "step": 7675 }, { "epoch": 3.08, "grad_norm": 1.5625019073486328, "learning_rate": 4.415851851851852e-05, "loss": 1.8945, "step": 7700 }, { "epoch": 3.09, "grad_norm": 1.702916145324707, "learning_rate": 4.412148148148149e-05, "loss": 1.9361, "step": 7725 }, { "epoch": 3.1, "grad_norm": 1.641830325126648, "learning_rate": 4.408444444444444e-05, "loss": 1.9031, "step": 7750 }, { "epoch": 3.11, "grad_norm": 1.6276681423187256, "learning_rate": 4.404740740740741e-05, "loss": 1.9207, "step": 7775 }, { "epoch": 3.12, "grad_norm": 1.4015640020370483, "learning_rate": 4.401037037037037e-05, "loss": 1.91, "step": 7800 }, { "epoch": 3.13, "grad_norm": 1.1643470525741577, "learning_rate": 4.3973333333333335e-05, "loss": 1.94, "step": 7825 }, { "epoch": 3.14, "grad_norm": 1.436637282371521, "learning_rate": 4.393777777777778e-05, "loss": 1.9152, "step": 7850 }, { "epoch": 3.15, "grad_norm": 1.2453664541244507, "learning_rate": 4.390074074074074e-05, "loss": 1.8912, "step": 7875 }, { "epoch": 3.16, "grad_norm": 1.507677674293518, "learning_rate": 4.386370370370371e-05, "loss": 1.9148, "step": 7900 }, { "epoch": 3.17, "grad_norm": 1.3336275815963745, "learning_rate": 4.382666666666667e-05, "loss": 1.9583, "step": 7925 }, { "epoch": 3.18, "grad_norm": 1.5578241348266602, "learning_rate": 4.378962962962963e-05, "loss": 1.9273, "step": 7950 }, { "epoch": 3.19, "grad_norm": 1.299078345298767, "learning_rate": 4.3752592592592595e-05, "loss": 1.9137, "step": 7975 }, { "epoch": 3.2, "grad_norm": 1.4711804389953613, "learning_rate": 4.3715555555555556e-05, "loss": 1.8916, "step": 8000 }, { "epoch": 3.21, "grad_norm": 1.258434534072876, "learning_rate": 4.367851851851852e-05, "loss": 1.8674, "step": 8025 }, { "epoch": 3.22, "grad_norm": 1.4030392169952393, "learning_rate": 4.3641481481481486e-05, "loss": 1.9207, "step": 8050 }, { "epoch": 3.23, "grad_norm": 1.3217052221298218, "learning_rate": 4.360444444444445e-05, "loss": 1.9356, "step": 8075 }, { "epoch": 3.24, "grad_norm": 1.4329230785369873, "learning_rate": 4.356740740740741e-05, "loss": 1.9849, "step": 8100 }, { "epoch": 3.25, "grad_norm": 1.4400358200073242, "learning_rate": 4.353037037037037e-05, "loss": 1.966, "step": 8125 }, { "epoch": 3.26, "grad_norm": 1.449614405632019, "learning_rate": 4.349333333333334e-05, "loss": 1.9351, "step": 8150 }, { "epoch": 3.27, "grad_norm": 1.3323086500167847, "learning_rate": 4.34562962962963e-05, "loss": 1.9247, "step": 8175 }, { "epoch": 3.2800000000000002, "grad_norm": 1.267240047454834, "learning_rate": 4.341925925925926e-05, "loss": 1.902, "step": 8200 }, { "epoch": 3.29, "grad_norm": 1.276930332183838, "learning_rate": 4.3382222222222224e-05, "loss": 1.938, "step": 8225 }, { "epoch": 3.3, "grad_norm": 1.3871114253997803, "learning_rate": 4.3345185185185186e-05, "loss": 1.8964, "step": 8250 }, { "epoch": 3.31, "grad_norm": 1.4275935888290405, "learning_rate": 4.330814814814815e-05, "loss": 1.8912, "step": 8275 }, { "epoch": 3.32, "grad_norm": 1.4508986473083496, "learning_rate": 4.3271111111111115e-05, "loss": 1.959, "step": 8300 }, { "epoch": 3.33, "grad_norm": 1.4686108827590942, "learning_rate": 4.323407407407408e-05, "loss": 1.9674, "step": 8325 }, { "epoch": 3.34, "grad_norm": 1.2445565462112427, "learning_rate": 4.319703703703704e-05, "loss": 1.9429, "step": 8350 }, { "epoch": 3.35, "grad_norm": 1.3624173402786255, "learning_rate": 4.316e-05, "loss": 1.943, "step": 8375 }, { "epoch": 3.36, "grad_norm": 1.9012771844863892, "learning_rate": 4.312296296296296e-05, "loss": 1.9657, "step": 8400 }, { "epoch": 3.37, "grad_norm": 1.4263502359390259, "learning_rate": 4.308592592592593e-05, "loss": 1.9606, "step": 8425 }, { "epoch": 3.38, "grad_norm": 1.3634259700775146, "learning_rate": 4.304888888888889e-05, "loss": 1.8931, "step": 8450 }, { "epoch": 3.39, "grad_norm": 1.498611569404602, "learning_rate": 4.301185185185185e-05, "loss": 1.9311, "step": 8475 }, { "epoch": 3.4, "grad_norm": 1.5958226919174194, "learning_rate": 4.2974814814814815e-05, "loss": 1.9499, "step": 8500 }, { "epoch": 3.41, "grad_norm": 1.4740561246871948, "learning_rate": 4.293777777777778e-05, "loss": 1.9545, "step": 8525 }, { "epoch": 3.42, "grad_norm": 1.3004882335662842, "learning_rate": 4.2900740740740745e-05, "loss": 1.9883, "step": 8550 }, { "epoch": 3.43, "grad_norm": 1.7882790565490723, "learning_rate": 4.2863703703703706e-05, "loss": 1.9759, "step": 8575 }, { "epoch": 3.44, "grad_norm": 1.3567942380905151, "learning_rate": 4.282666666666667e-05, "loss": 1.9219, "step": 8600 }, { "epoch": 3.45, "grad_norm": 1.2835100889205933, "learning_rate": 4.278962962962963e-05, "loss": 1.9568, "step": 8625 }, { "epoch": 3.46, "grad_norm": 1.3128582239151, "learning_rate": 4.275259259259259e-05, "loss": 1.9076, "step": 8650 }, { "epoch": 3.4699999999999998, "grad_norm": 1.4423998594284058, "learning_rate": 4.271555555555556e-05, "loss": 1.953, "step": 8675 }, { "epoch": 3.48, "grad_norm": 1.6110066175460815, "learning_rate": 4.267851851851852e-05, "loss": 1.9314, "step": 8700 }, { "epoch": 3.49, "grad_norm": 1.235214114189148, "learning_rate": 4.264148148148148e-05, "loss": 1.8981, "step": 8725 }, { "epoch": 3.5, "grad_norm": 1.223358392715454, "learning_rate": 4.2604444444444444e-05, "loss": 1.9607, "step": 8750 }, { "epoch": 3.51, "grad_norm": 1.7454544305801392, "learning_rate": 4.256740740740741e-05, "loss": 1.9374, "step": 8775 }, { "epoch": 3.52, "grad_norm": 1.6236810684204102, "learning_rate": 4.253185185185186e-05, "loss": 1.9503, "step": 8800 }, { "epoch": 3.5300000000000002, "grad_norm": 1.454771637916565, "learning_rate": 4.249481481481481e-05, "loss": 1.9947, "step": 8825 }, { "epoch": 3.54, "grad_norm": 1.4015847444534302, "learning_rate": 4.245777777777778e-05, "loss": 1.9912, "step": 8850 }, { "epoch": 3.55, "grad_norm": 1.4323807954788208, "learning_rate": 4.242074074074074e-05, "loss": 1.9142, "step": 8875 }, { "epoch": 3.56, "grad_norm": 1.426339864730835, "learning_rate": 4.2383703703703704e-05, "loss": 1.9208, "step": 8900 }, { "epoch": 3.57, "grad_norm": 1.2626959085464478, "learning_rate": 4.2346666666666666e-05, "loss": 1.9237, "step": 8925 }, { "epoch": 3.58, "grad_norm": 1.3355377912521362, "learning_rate": 4.2309629629629634e-05, "loss": 1.9616, "step": 8950 }, { "epoch": 3.59, "grad_norm": 1.3246771097183228, "learning_rate": 4.2272592592592595e-05, "loss": 1.9595, "step": 8975 }, { "epoch": 3.6, "grad_norm": 1.3291839361190796, "learning_rate": 4.223555555555556e-05, "loss": 1.9718, "step": 9000 }, { "epoch": 3.61, "grad_norm": 1.4041240215301514, "learning_rate": 4.219851851851852e-05, "loss": 1.9147, "step": 9025 }, { "epoch": 3.62, "grad_norm": 1.5576542615890503, "learning_rate": 4.216148148148149e-05, "loss": 1.9467, "step": 9050 }, { "epoch": 3.63, "grad_norm": 1.355878472328186, "learning_rate": 4.212444444444444e-05, "loss": 1.932, "step": 9075 }, { "epoch": 3.64, "grad_norm": 1.3441921472549438, "learning_rate": 4.208740740740741e-05, "loss": 1.9606, "step": 9100 }, { "epoch": 3.65, "grad_norm": 1.5347744226455688, "learning_rate": 4.205037037037037e-05, "loss": 1.9081, "step": 9125 }, { "epoch": 3.66, "grad_norm": 1.4844331741333008, "learning_rate": 4.201333333333334e-05, "loss": 1.847, "step": 9150 }, { "epoch": 3.67, "grad_norm": 1.4049714803695679, "learning_rate": 4.1976296296296295e-05, "loss": 1.9042, "step": 9175 }, { "epoch": 3.68, "grad_norm": 1.4935232400894165, "learning_rate": 4.193925925925926e-05, "loss": 1.9575, "step": 9200 }, { "epoch": 3.69, "grad_norm": 1.6459118127822876, "learning_rate": 4.1902222222222225e-05, "loss": 1.9779, "step": 9225 }, { "epoch": 3.7, "grad_norm": 1.6347599029541016, "learning_rate": 4.1865185185185186e-05, "loss": 1.9265, "step": 9250 }, { "epoch": 3.71, "grad_norm": 1.5071406364440918, "learning_rate": 4.182814814814815e-05, "loss": 1.9869, "step": 9275 }, { "epoch": 3.7199999999999998, "grad_norm": 1.521414041519165, "learning_rate": 4.1791111111111116e-05, "loss": 1.9607, "step": 9300 }, { "epoch": 3.73, "grad_norm": 1.6524264812469482, "learning_rate": 4.175407407407408e-05, "loss": 1.9068, "step": 9325 }, { "epoch": 3.74, "grad_norm": 1.5275657176971436, "learning_rate": 4.171703703703704e-05, "loss": 1.9225, "step": 9350 }, { "epoch": 3.75, "grad_norm": 1.2827988862991333, "learning_rate": 4.168e-05, "loss": 1.9199, "step": 9375 }, { "epoch": 3.76, "grad_norm": 1.5955214500427246, "learning_rate": 4.164296296296297e-05, "loss": 1.8854, "step": 9400 }, { "epoch": 3.77, "grad_norm": 1.4321446418762207, "learning_rate": 4.1605925925925924e-05, "loss": 1.9488, "step": 9425 }, { "epoch": 3.7800000000000002, "grad_norm": 1.3754669427871704, "learning_rate": 4.156888888888889e-05, "loss": 1.9209, "step": 9450 }, { "epoch": 3.79, "grad_norm": 1.1604045629501343, "learning_rate": 4.1531851851851854e-05, "loss": 1.9515, "step": 9475 }, { "epoch": 3.8, "grad_norm": 1.3413041830062866, "learning_rate": 4.1494814814814815e-05, "loss": 1.9457, "step": 9500 }, { "epoch": 3.81, "grad_norm": 1.349363088607788, "learning_rate": 4.145777777777778e-05, "loss": 1.9789, "step": 9525 }, { "epoch": 3.82, "grad_norm": 1.3260554075241089, "learning_rate": 4.1420740740740745e-05, "loss": 1.8999, "step": 9550 }, { "epoch": 3.83, "grad_norm": 1.289332628250122, "learning_rate": 4.138370370370371e-05, "loss": 1.9856, "step": 9575 }, { "epoch": 3.84, "grad_norm": 1.4211158752441406, "learning_rate": 4.134666666666667e-05, "loss": 1.919, "step": 9600 }, { "epoch": 3.85, "grad_norm": 1.4039026498794556, "learning_rate": 4.130962962962963e-05, "loss": 1.9073, "step": 9625 }, { "epoch": 3.86, "grad_norm": 1.2093830108642578, "learning_rate": 4.12725925925926e-05, "loss": 1.9501, "step": 9650 }, { "epoch": 3.87, "grad_norm": 1.323862075805664, "learning_rate": 4.123555555555555e-05, "loss": 1.8516, "step": 9675 }, { "epoch": 3.88, "grad_norm": 1.4265086650848389, "learning_rate": 4.119851851851852e-05, "loss": 1.9498, "step": 9700 }, { "epoch": 3.89, "grad_norm": 1.4013090133666992, "learning_rate": 4.116148148148148e-05, "loss": 1.9302, "step": 9725 }, { "epoch": 3.9, "grad_norm": 1.44480562210083, "learning_rate": 4.112444444444445e-05, "loss": 1.9059, "step": 9750 }, { "epoch": 3.91, "grad_norm": 1.6620755195617676, "learning_rate": 4.1087407407407406e-05, "loss": 1.9037, "step": 9775 }, { "epoch": 3.92, "grad_norm": 1.4004356861114502, "learning_rate": 4.1050370370370374e-05, "loss": 1.8987, "step": 9800 }, { "epoch": 3.93, "grad_norm": 1.536922812461853, "learning_rate": 4.1013333333333336e-05, "loss": 2.0043, "step": 9825 }, { "epoch": 3.94, "grad_norm": 1.4472166299819946, "learning_rate": 4.09762962962963e-05, "loss": 1.9224, "step": 9850 }, { "epoch": 3.95, "grad_norm": 1.3354836702346802, "learning_rate": 4.093925925925926e-05, "loss": 1.934, "step": 9875 }, { "epoch": 3.96, "grad_norm": 1.3124749660491943, "learning_rate": 4.090222222222223e-05, "loss": 1.9879, "step": 9900 }, { "epoch": 3.9699999999999998, "grad_norm": 1.283579707145691, "learning_rate": 4.086518518518519e-05, "loss": 1.8905, "step": 9925 }, { "epoch": 3.98, "grad_norm": 1.672229290008545, "learning_rate": 4.082814814814815e-05, "loss": 1.8554, "step": 9950 }, { "epoch": 3.99, "grad_norm": 1.5786038637161255, "learning_rate": 4.079111111111111e-05, "loss": 1.8784, "step": 9975 }, { "epoch": 4.0, "grad_norm": 1.361163854598999, "learning_rate": 4.075407407407408e-05, "loss": 1.9669, "step": 10000 }, { "epoch": 4.0, "eval_gen_len": 13.1328, "eval_loss": 1.9336023330688477, "eval_rouge1": 51.5525, "eval_rouge2": 25.7439, "eval_rougeL": 47.3861, "eval_rougeLsum": 47.3849, "eval_runtime": 193.2674, "eval_samples_per_second": 103.484, "eval_steps_per_second": 1.62, "step": 10000 }, { "epoch": 4.01, "grad_norm": 1.6996824741363525, "learning_rate": 4.0717037037037035e-05, "loss": 1.8428, "step": 10025 }, { "epoch": 4.02, "grad_norm": 1.6396452188491821, "learning_rate": 4.0680000000000004e-05, "loss": 1.9144, "step": 10050 }, { "epoch": 4.03, "grad_norm": 1.508521556854248, "learning_rate": 4.0642962962962965e-05, "loss": 1.8983, "step": 10075 }, { "epoch": 4.04, "grad_norm": 1.3133175373077393, "learning_rate": 4.0605925925925933e-05, "loss": 1.8159, "step": 10100 }, { "epoch": 4.05, "grad_norm": 1.4404352903366089, "learning_rate": 4.056888888888889e-05, "loss": 1.8468, "step": 10125 }, { "epoch": 4.06, "grad_norm": 1.410372257232666, "learning_rate": 4.0531851851851857e-05, "loss": 1.9628, "step": 10150 }, { "epoch": 4.07, "grad_norm": 1.509523630142212, "learning_rate": 4.049481481481482e-05, "loss": 1.8723, "step": 10175 }, { "epoch": 4.08, "grad_norm": 1.488282561302185, "learning_rate": 4.045777777777778e-05, "loss": 1.8414, "step": 10200 }, { "epoch": 4.09, "grad_norm": 1.3858217000961304, "learning_rate": 4.042074074074074e-05, "loss": 1.8241, "step": 10225 }, { "epoch": 4.1, "grad_norm": 1.5078978538513184, "learning_rate": 4.038370370370371e-05, "loss": 1.9011, "step": 10250 }, { "epoch": 4.11, "grad_norm": 1.340116262435913, "learning_rate": 4.0346666666666664e-05, "loss": 1.8832, "step": 10275 }, { "epoch": 4.12, "grad_norm": 1.4269579648971558, "learning_rate": 4.030962962962963e-05, "loss": 1.8314, "step": 10300 }, { "epoch": 4.13, "grad_norm": 1.3984671831130981, "learning_rate": 4.0272592592592594e-05, "loss": 1.8422, "step": 10325 }, { "epoch": 4.14, "grad_norm": 1.5651813745498657, "learning_rate": 4.023555555555556e-05, "loss": 1.8475, "step": 10350 }, { "epoch": 4.15, "grad_norm": 1.5041767358779907, "learning_rate": 4.019851851851852e-05, "loss": 1.8392, "step": 10375 }, { "epoch": 4.16, "grad_norm": 1.6004717350006104, "learning_rate": 4.0161481481481486e-05, "loss": 1.8364, "step": 10400 }, { "epoch": 4.17, "grad_norm": 1.3751362562179565, "learning_rate": 4.012444444444445e-05, "loss": 1.8457, "step": 10425 }, { "epoch": 4.18, "grad_norm": 1.4664337635040283, "learning_rate": 4.008740740740741e-05, "loss": 1.9372, "step": 10450 }, { "epoch": 4.19, "grad_norm": 1.3487550020217896, "learning_rate": 4.005037037037037e-05, "loss": 1.9049, "step": 10475 }, { "epoch": 4.2, "grad_norm": 1.2509174346923828, "learning_rate": 4.001333333333334e-05, "loss": 1.8987, "step": 10500 }, { "epoch": 4.21, "grad_norm": 1.5854827165603638, "learning_rate": 3.99762962962963e-05, "loss": 1.8423, "step": 10525 }, { "epoch": 4.22, "grad_norm": 1.4388275146484375, "learning_rate": 3.993925925925926e-05, "loss": 1.8349, "step": 10550 }, { "epoch": 4.23, "grad_norm": 1.4520167112350464, "learning_rate": 3.9902222222222223e-05, "loss": 1.8283, "step": 10575 }, { "epoch": 4.24, "grad_norm": 1.44745671749115, "learning_rate": 3.986518518518519e-05, "loss": 1.9163, "step": 10600 }, { "epoch": 4.25, "grad_norm": 1.4306520223617554, "learning_rate": 3.9828148148148147e-05, "loss": 1.8803, "step": 10625 }, { "epoch": 4.26, "grad_norm": 1.3885631561279297, "learning_rate": 3.9791111111111115e-05, "loss": 1.8763, "step": 10650 }, { "epoch": 4.27, "grad_norm": 1.3905619382858276, "learning_rate": 3.9754074074074076e-05, "loss": 1.8828, "step": 10675 }, { "epoch": 4.28, "grad_norm": 1.3806442022323608, "learning_rate": 3.9717037037037045e-05, "loss": 1.908, "step": 10700 }, { "epoch": 4.29, "grad_norm": 1.4566986560821533, "learning_rate": 3.968e-05, "loss": 1.8479, "step": 10725 }, { "epoch": 4.3, "grad_norm": 1.4372694492340088, "learning_rate": 3.964296296296297e-05, "loss": 1.9098, "step": 10750 }, { "epoch": 4.31, "grad_norm": 1.5970664024353027, "learning_rate": 3.960592592592593e-05, "loss": 1.9132, "step": 10775 }, { "epoch": 4.32, "grad_norm": 1.3332023620605469, "learning_rate": 3.956888888888889e-05, "loss": 1.8479, "step": 10800 }, { "epoch": 4.33, "grad_norm": 1.4356377124786377, "learning_rate": 3.953185185185185e-05, "loss": 1.8216, "step": 10825 }, { "epoch": 4.34, "grad_norm": 1.3846675157546997, "learning_rate": 3.949481481481482e-05, "loss": 1.9176, "step": 10850 }, { "epoch": 4.35, "grad_norm": 1.4137978553771973, "learning_rate": 3.945777777777778e-05, "loss": 1.8499, "step": 10875 }, { "epoch": 4.36, "grad_norm": 1.456305980682373, "learning_rate": 3.9420740740740744e-05, "loss": 1.8801, "step": 10900 }, { "epoch": 4.37, "grad_norm": 1.4478751420974731, "learning_rate": 3.9383703703703706e-05, "loss": 1.8548, "step": 10925 }, { "epoch": 4.38, "grad_norm": 1.2786507606506348, "learning_rate": 3.9346666666666674e-05, "loss": 1.8149, "step": 10950 }, { "epoch": 4.39, "grad_norm": 1.5575779676437378, "learning_rate": 3.930962962962963e-05, "loss": 1.8687, "step": 10975 }, { "epoch": 4.4, "grad_norm": 1.6050180196762085, "learning_rate": 3.92725925925926e-05, "loss": 1.8918, "step": 11000 }, { "epoch": 4.41, "grad_norm": 1.4834775924682617, "learning_rate": 3.923555555555556e-05, "loss": 1.9042, "step": 11025 }, { "epoch": 4.42, "grad_norm": 1.3932939767837524, "learning_rate": 3.919851851851852e-05, "loss": 1.8931, "step": 11050 }, { "epoch": 4.43, "grad_norm": 1.4138563871383667, "learning_rate": 3.916148148148148e-05, "loss": 1.8471, "step": 11075 }, { "epoch": 4.44, "grad_norm": 1.5365270376205444, "learning_rate": 3.912444444444445e-05, "loss": 1.8746, "step": 11100 }, { "epoch": 4.45, "grad_norm": 1.2787660360336304, "learning_rate": 3.908740740740741e-05, "loss": 1.857, "step": 11125 }, { "epoch": 4.46, "grad_norm": 1.3813387155532837, "learning_rate": 3.905037037037037e-05, "loss": 1.8436, "step": 11150 }, { "epoch": 4.47, "grad_norm": 1.3845306634902954, "learning_rate": 3.9013333333333335e-05, "loss": 1.8494, "step": 11175 }, { "epoch": 4.48, "grad_norm": 1.4810049533843994, "learning_rate": 3.8976296296296296e-05, "loss": 1.9129, "step": 11200 }, { "epoch": 4.49, "grad_norm": 1.5942555665969849, "learning_rate": 3.893925925925926e-05, "loss": 1.8596, "step": 11225 }, { "epoch": 4.5, "grad_norm": 1.3388746976852417, "learning_rate": 3.8902222222222226e-05, "loss": 1.8152, "step": 11250 }, { "epoch": 4.51, "grad_norm": 1.2811378240585327, "learning_rate": 3.886518518518519e-05, "loss": 1.8035, "step": 11275 }, { "epoch": 4.52, "grad_norm": 1.4130557775497437, "learning_rate": 3.882814814814815e-05, "loss": 1.8349, "step": 11300 }, { "epoch": 4.53, "grad_norm": 1.3981435298919678, "learning_rate": 3.879111111111111e-05, "loss": 1.9117, "step": 11325 }, { "epoch": 4.54, "grad_norm": 1.4052165746688843, "learning_rate": 3.875407407407408e-05, "loss": 1.9413, "step": 11350 }, { "epoch": 4.55, "grad_norm": 1.462985634803772, "learning_rate": 3.871703703703704e-05, "loss": 1.8346, "step": 11375 }, { "epoch": 4.5600000000000005, "grad_norm": 1.417001485824585, "learning_rate": 3.868e-05, "loss": 1.8613, "step": 11400 }, { "epoch": 4.57, "grad_norm": 1.3185068368911743, "learning_rate": 3.8642962962962964e-05, "loss": 1.8832, "step": 11425 }, { "epoch": 4.58, "grad_norm": 1.4845627546310425, "learning_rate": 3.8605925925925925e-05, "loss": 1.8526, "step": 11450 }, { "epoch": 4.59, "grad_norm": 1.5781909227371216, "learning_rate": 3.8568888888888894e-05, "loss": 1.9009, "step": 11475 }, { "epoch": 4.6, "grad_norm": 1.4118759632110596, "learning_rate": 3.8531851851851855e-05, "loss": 1.8705, "step": 11500 }, { "epoch": 4.61, "grad_norm": 1.604973316192627, "learning_rate": 3.849481481481482e-05, "loss": 1.8933, "step": 11525 }, { "epoch": 4.62, "grad_norm": 1.2688302993774414, "learning_rate": 3.845777777777778e-05, "loss": 1.8558, "step": 11550 }, { "epoch": 4.63, "grad_norm": 1.3213862180709839, "learning_rate": 3.842074074074074e-05, "loss": 1.8655, "step": 11575 }, { "epoch": 4.64, "grad_norm": 1.5168383121490479, "learning_rate": 3.838370370370371e-05, "loss": 1.8862, "step": 11600 }, { "epoch": 4.65, "grad_norm": 1.30950927734375, "learning_rate": 3.834666666666667e-05, "loss": 1.8862, "step": 11625 }, { "epoch": 4.66, "grad_norm": 1.4738622903823853, "learning_rate": 3.830962962962963e-05, "loss": 1.8185, "step": 11650 }, { "epoch": 4.67, "grad_norm": 1.4453907012939453, "learning_rate": 3.827259259259259e-05, "loss": 1.8367, "step": 11675 }, { "epoch": 4.68, "grad_norm": 1.5001963376998901, "learning_rate": 3.8235555555555555e-05, "loss": 1.9168, "step": 11700 }, { "epoch": 4.6899999999999995, "grad_norm": 1.4097387790679932, "learning_rate": 3.819851851851852e-05, "loss": 1.8582, "step": 11725 }, { "epoch": 4.7, "grad_norm": 1.5029624700546265, "learning_rate": 3.8161481481481485e-05, "loss": 1.87, "step": 11750 }, { "epoch": 4.71, "grad_norm": 1.4353328943252563, "learning_rate": 3.8124444444444446e-05, "loss": 1.8595, "step": 11775 }, { "epoch": 4.72, "grad_norm": 1.6048336029052734, "learning_rate": 3.808740740740741e-05, "loss": 1.9087, "step": 11800 }, { "epoch": 4.73, "grad_norm": 1.3120425939559937, "learning_rate": 3.805037037037037e-05, "loss": 1.8873, "step": 11825 }, { "epoch": 4.74, "grad_norm": 1.6301329135894775, "learning_rate": 3.801333333333333e-05, "loss": 1.8895, "step": 11850 }, { "epoch": 4.75, "grad_norm": 1.480413556098938, "learning_rate": 3.79762962962963e-05, "loss": 1.8713, "step": 11875 }, { "epoch": 4.76, "grad_norm": 1.2750059366226196, "learning_rate": 3.793925925925926e-05, "loss": 1.8706, "step": 11900 }, { "epoch": 4.77, "grad_norm": 1.5305309295654297, "learning_rate": 3.790222222222222e-05, "loss": 1.9087, "step": 11925 }, { "epoch": 4.78, "grad_norm": 1.3535616397857666, "learning_rate": 3.7865185185185184e-05, "loss": 1.8658, "step": 11950 }, { "epoch": 4.79, "grad_norm": 1.5504143238067627, "learning_rate": 3.782814814814815e-05, "loss": 1.8742, "step": 11975 }, { "epoch": 4.8, "grad_norm": 1.7946292161941528, "learning_rate": 3.7791111111111114e-05, "loss": 1.8456, "step": 12000 }, { "epoch": 4.8100000000000005, "grad_norm": 1.395272135734558, "learning_rate": 3.7754074074074075e-05, "loss": 1.931, "step": 12025 }, { "epoch": 4.82, "grad_norm": 1.4061551094055176, "learning_rate": 3.771703703703704e-05, "loss": 1.8836, "step": 12050 }, { "epoch": 4.83, "grad_norm": 1.3979151248931885, "learning_rate": 3.7680000000000005e-05, "loss": 1.8869, "step": 12075 }, { "epoch": 4.84, "grad_norm": 1.4107104539871216, "learning_rate": 3.764296296296296e-05, "loss": 1.8736, "step": 12100 }, { "epoch": 4.85, "grad_norm": 1.321219801902771, "learning_rate": 3.760592592592593e-05, "loss": 1.8761, "step": 12125 }, { "epoch": 4.86, "grad_norm": 1.5014673471450806, "learning_rate": 3.756888888888889e-05, "loss": 1.8445, "step": 12150 }, { "epoch": 4.87, "grad_norm": 1.4317951202392578, "learning_rate": 3.753185185185185e-05, "loss": 1.848, "step": 12175 }, { "epoch": 4.88, "grad_norm": 1.4096531867980957, "learning_rate": 3.749481481481481e-05, "loss": 1.8933, "step": 12200 }, { "epoch": 4.89, "grad_norm": 1.2559773921966553, "learning_rate": 3.745777777777778e-05, "loss": 1.8033, "step": 12225 }, { "epoch": 4.9, "grad_norm": 1.3311185836791992, "learning_rate": 3.742074074074074e-05, "loss": 1.8796, "step": 12250 }, { "epoch": 4.91, "grad_norm": 1.4260166883468628, "learning_rate": 3.7383703703703704e-05, "loss": 1.8633, "step": 12275 }, { "epoch": 4.92, "grad_norm": 1.3146923780441284, "learning_rate": 3.7346666666666666e-05, "loss": 1.888, "step": 12300 }, { "epoch": 4.93, "grad_norm": 1.3045905828475952, "learning_rate": 3.7309629629629634e-05, "loss": 1.8274, "step": 12325 }, { "epoch": 4.9399999999999995, "grad_norm": 1.3906166553497314, "learning_rate": 3.727259259259259e-05, "loss": 1.9098, "step": 12350 }, { "epoch": 4.95, "grad_norm": 1.518526554107666, "learning_rate": 3.723555555555556e-05, "loss": 1.9287, "step": 12375 }, { "epoch": 4.96, "grad_norm": 1.3113497495651245, "learning_rate": 3.719851851851852e-05, "loss": 1.8677, "step": 12400 }, { "epoch": 4.97, "grad_norm": 1.6457515954971313, "learning_rate": 3.716148148148149e-05, "loss": 1.9183, "step": 12425 }, { "epoch": 4.98, "grad_norm": 1.5027391910552979, "learning_rate": 3.712444444444444e-05, "loss": 1.8613, "step": 12450 }, { "epoch": 4.99, "grad_norm": 1.561333179473877, "learning_rate": 3.708740740740741e-05, "loss": 1.9073, "step": 12475 }, { "epoch": 5.0, "grad_norm": 1.6902166604995728, "learning_rate": 3.705037037037037e-05, "loss": 1.8617, "step": 12500 }, { "epoch": 5.0, "eval_gen_len": 13.1348, "eval_loss": 1.9302258491516113, "eval_rouge1": 51.4633, "eval_rouge2": 25.7335, "eval_rougeL": 47.3103, "eval_rougeLsum": 47.305, "eval_runtime": 194.3038, "eval_samples_per_second": 102.932, "eval_steps_per_second": 1.611, "step": 12500 }, { "epoch": 5.01, "grad_norm": 1.3787955045700073, "learning_rate": 3.7013333333333334e-05, "loss": 1.763, "step": 12525 }, { "epoch": 5.02, "grad_norm": 1.3770140409469604, "learning_rate": 3.6976296296296295e-05, "loss": 1.8189, "step": 12550 }, { "epoch": 5.03, "grad_norm": 1.486107587814331, "learning_rate": 3.6939259259259263e-05, "loss": 1.8043, "step": 12575 }, { "epoch": 5.04, "grad_norm": 1.3387417793273926, "learning_rate": 3.690222222222222e-05, "loss": 1.8575, "step": 12600 }, { "epoch": 5.05, "grad_norm": 1.340170979499817, "learning_rate": 3.6865185185185187e-05, "loss": 1.8634, "step": 12625 }, { "epoch": 5.06, "grad_norm": 1.2771445512771606, "learning_rate": 3.682814814814815e-05, "loss": 1.8065, "step": 12650 }, { "epoch": 5.07, "grad_norm": 1.3724429607391357, "learning_rate": 3.6791111111111116e-05, "loss": 1.8152, "step": 12675 }, { "epoch": 5.08, "grad_norm": 1.2865967750549316, "learning_rate": 3.675407407407407e-05, "loss": 1.8625, "step": 12700 }, { "epoch": 5.09, "grad_norm": 1.4663174152374268, "learning_rate": 3.671703703703704e-05, "loss": 1.7772, "step": 12725 }, { "epoch": 5.1, "grad_norm": 1.6799187660217285, "learning_rate": 3.668e-05, "loss": 1.8082, "step": 12750 }, { "epoch": 5.11, "grad_norm": 1.3404431343078613, "learning_rate": 3.664296296296296e-05, "loss": 1.8315, "step": 12775 }, { "epoch": 5.12, "grad_norm": 1.4227215051651, "learning_rate": 3.6605925925925924e-05, "loss": 1.7863, "step": 12800 }, { "epoch": 5.13, "grad_norm": 1.4327188730239868, "learning_rate": 3.656888888888889e-05, "loss": 1.8599, "step": 12825 }, { "epoch": 5.14, "grad_norm": 1.4590907096862793, "learning_rate": 3.6531851851851854e-05, "loss": 1.8372, "step": 12850 }, { "epoch": 5.15, "grad_norm": 1.4968684911727905, "learning_rate": 3.6494814814814816e-05, "loss": 1.8266, "step": 12875 }, { "epoch": 5.16, "grad_norm": 1.3817492723464966, "learning_rate": 3.645925925925926e-05, "loss": 1.8532, "step": 12900 }, { "epoch": 5.17, "grad_norm": 1.5503273010253906, "learning_rate": 3.642222222222222e-05, "loss": 1.8062, "step": 12925 }, { "epoch": 5.18, "grad_norm": 1.5043991804122925, "learning_rate": 3.638518518518519e-05, "loss": 1.8428, "step": 12950 }, { "epoch": 5.19, "grad_norm": 1.3269332647323608, "learning_rate": 3.6348148148148146e-05, "loss": 1.7959, "step": 12975 }, { "epoch": 5.2, "grad_norm": 1.3784312009811401, "learning_rate": 3.6311111111111114e-05, "loss": 1.8489, "step": 13000 }, { "epoch": 5.21, "grad_norm": 1.557144284248352, "learning_rate": 3.6274074074074076e-05, "loss": 1.8175, "step": 13025 }, { "epoch": 5.22, "grad_norm": 1.7027719020843506, "learning_rate": 3.623703703703704e-05, "loss": 1.8363, "step": 13050 }, { "epoch": 5.23, "grad_norm": 1.415287733078003, "learning_rate": 3.62e-05, "loss": 1.8242, "step": 13075 }, { "epoch": 5.24, "grad_norm": 1.4795790910720825, "learning_rate": 3.616296296296297e-05, "loss": 1.7947, "step": 13100 }, { "epoch": 5.25, "grad_norm": 1.703626036643982, "learning_rate": 3.612592592592593e-05, "loss": 1.8156, "step": 13125 }, { "epoch": 5.26, "grad_norm": 1.5055326223373413, "learning_rate": 3.608888888888889e-05, "loss": 1.8225, "step": 13150 }, { "epoch": 5.27, "grad_norm": 1.5992326736450195, "learning_rate": 3.605185185185185e-05, "loss": 1.8556, "step": 13175 }, { "epoch": 5.28, "grad_norm": 1.630657434463501, "learning_rate": 3.601481481481482e-05, "loss": 1.8139, "step": 13200 }, { "epoch": 5.29, "grad_norm": 1.4624261856079102, "learning_rate": 3.5977777777777775e-05, "loss": 1.8808, "step": 13225 }, { "epoch": 5.3, "grad_norm": 1.6484681367874146, "learning_rate": 3.5940740740740743e-05, "loss": 1.7941, "step": 13250 }, { "epoch": 5.31, "grad_norm": 1.5216662883758545, "learning_rate": 3.5903703703703705e-05, "loss": 1.7918, "step": 13275 }, { "epoch": 5.32, "grad_norm": 1.578412413597107, "learning_rate": 3.586666666666667e-05, "loss": 1.8681, "step": 13300 }, { "epoch": 5.33, "grad_norm": 1.547946572303772, "learning_rate": 3.582962962962963e-05, "loss": 1.8343, "step": 13325 }, { "epoch": 5.34, "grad_norm": 1.491708517074585, "learning_rate": 3.5792592592592596e-05, "loss": 1.7636, "step": 13350 }, { "epoch": 5.35, "grad_norm": 1.4895257949829102, "learning_rate": 3.575555555555556e-05, "loss": 1.8088, "step": 13375 }, { "epoch": 5.36, "grad_norm": 1.5237321853637695, "learning_rate": 3.571851851851852e-05, "loss": 1.8399, "step": 13400 }, { "epoch": 5.37, "grad_norm": 1.5580129623413086, "learning_rate": 3.568148148148148e-05, "loss": 1.8515, "step": 13425 }, { "epoch": 5.38, "grad_norm": 1.3289860486984253, "learning_rate": 3.564444444444445e-05, "loss": 1.7867, "step": 13450 }, { "epoch": 5.39, "grad_norm": 1.3806110620498657, "learning_rate": 3.560740740740741e-05, "loss": 1.8141, "step": 13475 }, { "epoch": 5.4, "grad_norm": 1.393282175064087, "learning_rate": 3.557037037037037e-05, "loss": 1.7835, "step": 13500 }, { "epoch": 5.41, "grad_norm": 1.3642027378082275, "learning_rate": 3.5533333333333334e-05, "loss": 1.797, "step": 13525 }, { "epoch": 5.42, "grad_norm": 1.378496766090393, "learning_rate": 3.54962962962963e-05, "loss": 1.8129, "step": 13550 }, { "epoch": 5.43, "grad_norm": 1.7297312021255493, "learning_rate": 3.545925925925926e-05, "loss": 1.8175, "step": 13575 }, { "epoch": 5.44, "grad_norm": 1.3795442581176758, "learning_rate": 3.5422222222222226e-05, "loss": 1.8089, "step": 13600 }, { "epoch": 5.45, "grad_norm": 1.577501654624939, "learning_rate": 3.538518518518519e-05, "loss": 1.8813, "step": 13625 }, { "epoch": 5.46, "grad_norm": 1.478531002998352, "learning_rate": 3.5348148148148156e-05, "loss": 1.7873, "step": 13650 }, { "epoch": 5.47, "grad_norm": 1.4849592447280884, "learning_rate": 3.531111111111111e-05, "loss": 1.8269, "step": 13675 }, { "epoch": 5.48, "grad_norm": 1.413120985031128, "learning_rate": 3.527407407407408e-05, "loss": 1.8449, "step": 13700 }, { "epoch": 5.49, "grad_norm": 1.6044095754623413, "learning_rate": 3.523703703703704e-05, "loss": 1.7789, "step": 13725 }, { "epoch": 5.5, "grad_norm": 1.4970237016677856, "learning_rate": 3.52e-05, "loss": 1.8088, "step": 13750 }, { "epoch": 5.51, "grad_norm": 1.735145926475525, "learning_rate": 3.516296296296296e-05, "loss": 1.8242, "step": 13775 }, { "epoch": 5.52, "grad_norm": 1.592650055885315, "learning_rate": 3.512592592592593e-05, "loss": 1.8115, "step": 13800 }, { "epoch": 5.53, "grad_norm": 1.3803372383117676, "learning_rate": 3.5088888888888886e-05, "loss": 1.7575, "step": 13825 }, { "epoch": 5.54, "grad_norm": 1.5937862396240234, "learning_rate": 3.5051851851851855e-05, "loss": 1.7927, "step": 13850 }, { "epoch": 5.55, "grad_norm": 1.6011449098587036, "learning_rate": 3.5014814814814816e-05, "loss": 1.8037, "step": 13875 }, { "epoch": 5.5600000000000005, "grad_norm": 1.4640209674835205, "learning_rate": 3.4977777777777785e-05, "loss": 1.7909, "step": 13900 }, { "epoch": 5.57, "grad_norm": 1.577725887298584, "learning_rate": 3.494074074074074e-05, "loss": 1.8111, "step": 13925 }, { "epoch": 5.58, "grad_norm": 1.4513524770736694, "learning_rate": 3.490370370370371e-05, "loss": 1.7605, "step": 13950 }, { "epoch": 5.59, "grad_norm": 1.6748623847961426, "learning_rate": 3.486666666666667e-05, "loss": 1.8199, "step": 13975 }, { "epoch": 5.6, "grad_norm": 1.5236202478408813, "learning_rate": 3.482962962962963e-05, "loss": 1.8162, "step": 14000 }, { "epoch": 5.61, "grad_norm": 1.5020064115524292, "learning_rate": 3.479259259259259e-05, "loss": 1.8282, "step": 14025 }, { "epoch": 5.62, "grad_norm": 1.5203158855438232, "learning_rate": 3.475555555555556e-05, "loss": 1.8871, "step": 14050 }, { "epoch": 5.63, "grad_norm": 1.4594610929489136, "learning_rate": 3.471851851851852e-05, "loss": 1.818, "step": 14075 }, { "epoch": 5.64, "grad_norm": 1.5922948122024536, "learning_rate": 3.4681481481481484e-05, "loss": 1.8806, "step": 14100 }, { "epoch": 5.65, "grad_norm": 1.5797369480133057, "learning_rate": 3.4644444444444446e-05, "loss": 1.8307, "step": 14125 }, { "epoch": 5.66, "grad_norm": 1.5268616676330566, "learning_rate": 3.4607407407407414e-05, "loss": 1.8021, "step": 14150 }, { "epoch": 5.67, "grad_norm": 1.6109580993652344, "learning_rate": 3.457037037037037e-05, "loss": 1.8541, "step": 14175 }, { "epoch": 5.68, "grad_norm": 1.4869205951690674, "learning_rate": 3.453333333333334e-05, "loss": 1.8282, "step": 14200 }, { "epoch": 5.6899999999999995, "grad_norm": 1.5621397495269775, "learning_rate": 3.44962962962963e-05, "loss": 1.8665, "step": 14225 }, { "epoch": 5.7, "grad_norm": 1.6197333335876465, "learning_rate": 3.445925925925926e-05, "loss": 1.9229, "step": 14250 }, { "epoch": 5.71, "grad_norm": 1.4683600664138794, "learning_rate": 3.442222222222222e-05, "loss": 1.8309, "step": 14275 }, { "epoch": 5.72, "grad_norm": 1.4849534034729004, "learning_rate": 3.438518518518519e-05, "loss": 1.8315, "step": 14300 }, { "epoch": 5.73, "grad_norm": 1.2944457530975342, "learning_rate": 3.434814814814815e-05, "loss": 1.7922, "step": 14325 }, { "epoch": 5.74, "grad_norm": 1.4732681512832642, "learning_rate": 3.431111111111111e-05, "loss": 1.8181, "step": 14350 }, { "epoch": 5.75, "grad_norm": 1.362735629081726, "learning_rate": 3.4274074074074075e-05, "loss": 1.8194, "step": 14375 }, { "epoch": 5.76, "grad_norm": 1.5724767446517944, "learning_rate": 3.423703703703704e-05, "loss": 1.9212, "step": 14400 }, { "epoch": 5.77, "grad_norm": 1.430247187614441, "learning_rate": 3.4200000000000005e-05, "loss": 1.8141, "step": 14425 }, { "epoch": 5.78, "grad_norm": 1.4439537525177002, "learning_rate": 3.4162962962962966e-05, "loss": 1.8393, "step": 14450 }, { "epoch": 5.79, "grad_norm": 1.6095616817474365, "learning_rate": 3.412592592592593e-05, "loss": 1.8514, "step": 14475 }, { "epoch": 5.8, "grad_norm": 1.3736265897750854, "learning_rate": 3.408888888888889e-05, "loss": 1.8419, "step": 14500 }, { "epoch": 5.8100000000000005, "grad_norm": 1.5470609664916992, "learning_rate": 3.405185185185185e-05, "loss": 1.8313, "step": 14525 }, { "epoch": 5.82, "grad_norm": 1.4805546998977661, "learning_rate": 3.401481481481482e-05, "loss": 1.8075, "step": 14550 }, { "epoch": 5.83, "grad_norm": 1.3933120965957642, "learning_rate": 3.397777777777778e-05, "loss": 1.7556, "step": 14575 }, { "epoch": 5.84, "grad_norm": 1.3401881456375122, "learning_rate": 3.394074074074074e-05, "loss": 1.7676, "step": 14600 }, { "epoch": 5.85, "grad_norm": 1.713220238685608, "learning_rate": 3.3903703703703704e-05, "loss": 1.8106, "step": 14625 }, { "epoch": 5.86, "grad_norm": 1.4203695058822632, "learning_rate": 3.3866666666666665e-05, "loss": 1.8415, "step": 14650 }, { "epoch": 5.87, "grad_norm": 1.406247854232788, "learning_rate": 3.3829629629629634e-05, "loss": 1.7898, "step": 14675 }, { "epoch": 5.88, "grad_norm": 1.422446846961975, "learning_rate": 3.3792592592592595e-05, "loss": 1.7958, "step": 14700 }, { "epoch": 5.89, "grad_norm": 1.5542668104171753, "learning_rate": 3.375555555555556e-05, "loss": 1.8299, "step": 14725 }, { "epoch": 5.9, "grad_norm": 1.426229476928711, "learning_rate": 3.371851851851852e-05, "loss": 1.8498, "step": 14750 }, { "epoch": 5.91, "grad_norm": 1.4984149932861328, "learning_rate": 3.368148148148148e-05, "loss": 1.8642, "step": 14775 }, { "epoch": 5.92, "grad_norm": 1.347084879875183, "learning_rate": 3.364444444444445e-05, "loss": 1.8321, "step": 14800 }, { "epoch": 5.93, "grad_norm": 1.4737646579742432, "learning_rate": 3.360740740740741e-05, "loss": 1.9252, "step": 14825 }, { "epoch": 5.9399999999999995, "grad_norm": 1.3631387948989868, "learning_rate": 3.357037037037037e-05, "loss": 1.8539, "step": 14850 }, { "epoch": 5.95, "grad_norm": 1.3909265995025635, "learning_rate": 3.353333333333333e-05, "loss": 1.8415, "step": 14875 }, { "epoch": 5.96, "grad_norm": 1.398703932762146, "learning_rate": 3.3496296296296295e-05, "loss": 1.823, "step": 14900 }, { "epoch": 5.97, "grad_norm": 1.5495917797088623, "learning_rate": 3.345925925925926e-05, "loss": 1.7733, "step": 14925 }, { "epoch": 5.98, "grad_norm": 1.7243595123291016, "learning_rate": 3.3422222222222224e-05, "loss": 1.8468, "step": 14950 }, { "epoch": 5.99, "grad_norm": 1.5244386196136475, "learning_rate": 3.3385185185185186e-05, "loss": 1.8229, "step": 14975 }, { "epoch": 6.0, "grad_norm": 1.4378917217254639, "learning_rate": 3.334814814814815e-05, "loss": 1.7693, "step": 15000 }, { "epoch": 6.0, "eval_gen_len": 13.2205, "eval_loss": 1.9290603399276733, "eval_rouge1": 51.6382, "eval_rouge2": 25.9065, "eval_rougeL": 47.4714, "eval_rougeLsum": 47.4661, "eval_runtime": 195.9112, "eval_samples_per_second": 102.087, "eval_steps_per_second": 1.598, "step": 15000 }, { "epoch": 6.01, "grad_norm": 1.421264886856079, "learning_rate": 3.3311111111111116e-05, "loss": 1.7225, "step": 15025 }, { "epoch": 6.02, "grad_norm": 1.5342766046524048, "learning_rate": 3.327407407407408e-05, "loss": 1.8321, "step": 15050 }, { "epoch": 6.03, "grad_norm": 1.4197121858596802, "learning_rate": 3.323703703703704e-05, "loss": 1.7555, "step": 15075 }, { "epoch": 6.04, "grad_norm": 1.6157759428024292, "learning_rate": 3.32e-05, "loss": 1.7768, "step": 15100 }, { "epoch": 6.05, "grad_norm": 1.5571355819702148, "learning_rate": 3.316296296296296e-05, "loss": 1.7692, "step": 15125 }, { "epoch": 6.06, "grad_norm": 1.5974923372268677, "learning_rate": 3.3125925925925924e-05, "loss": 1.7249, "step": 15150 }, { "epoch": 6.07, "grad_norm": 1.3685996532440186, "learning_rate": 3.308888888888889e-05, "loss": 1.7672, "step": 15175 }, { "epoch": 6.08, "grad_norm": 1.4658602476119995, "learning_rate": 3.3051851851851854e-05, "loss": 1.7717, "step": 15200 }, { "epoch": 6.09, "grad_norm": 1.399776577949524, "learning_rate": 3.3014814814814815e-05, "loss": 1.7609, "step": 15225 }, { "epoch": 6.1, "grad_norm": 1.6905499696731567, "learning_rate": 3.297777777777778e-05, "loss": 1.7796, "step": 15250 }, { "epoch": 6.11, "grad_norm": 1.7223976850509644, "learning_rate": 3.2940740740740745e-05, "loss": 1.7806, "step": 15275 }, { "epoch": 6.12, "grad_norm": 1.4597574472427368, "learning_rate": 3.29037037037037e-05, "loss": 1.7689, "step": 15300 }, { "epoch": 6.13, "grad_norm": 1.7513933181762695, "learning_rate": 3.286666666666667e-05, "loss": 1.8005, "step": 15325 }, { "epoch": 6.14, "grad_norm": 1.7440515756607056, "learning_rate": 3.282962962962963e-05, "loss": 1.7965, "step": 15350 }, { "epoch": 6.15, "grad_norm": 1.6327168941497803, "learning_rate": 3.279259259259259e-05, "loss": 1.7875, "step": 15375 }, { "epoch": 6.16, "grad_norm": 1.5745080709457397, "learning_rate": 3.275555555555555e-05, "loss": 1.7659, "step": 15400 }, { "epoch": 6.17, "grad_norm": 1.5755482912063599, "learning_rate": 3.271851851851852e-05, "loss": 1.7724, "step": 15425 }, { "epoch": 6.18, "grad_norm": 1.6102346181869507, "learning_rate": 3.268148148148148e-05, "loss": 1.7862, "step": 15450 }, { "epoch": 6.19, "grad_norm": 1.5103446245193481, "learning_rate": 3.2644444444444444e-05, "loss": 1.7739, "step": 15475 }, { "epoch": 6.2, "grad_norm": 1.4357705116271973, "learning_rate": 3.2607407407407406e-05, "loss": 1.7596, "step": 15500 }, { "epoch": 6.21, "grad_norm": 1.739065170288086, "learning_rate": 3.2570370370370374e-05, "loss": 1.8472, "step": 15525 }, { "epoch": 6.22, "grad_norm": 1.59120512008667, "learning_rate": 3.253333333333333e-05, "loss": 1.8353, "step": 15550 }, { "epoch": 6.23, "grad_norm": 1.7250970602035522, "learning_rate": 3.24962962962963e-05, "loss": 1.7692, "step": 15575 }, { "epoch": 6.24, "grad_norm": 1.6642422676086426, "learning_rate": 3.245925925925926e-05, "loss": 1.7367, "step": 15600 }, { "epoch": 6.25, "grad_norm": 1.4755923748016357, "learning_rate": 3.242222222222223e-05, "loss": 1.7742, "step": 15625 }, { "epoch": 6.26, "grad_norm": 1.5769152641296387, "learning_rate": 3.238518518518518e-05, "loss": 1.7956, "step": 15650 }, { "epoch": 6.27, "grad_norm": 1.437385082244873, "learning_rate": 3.234814814814815e-05, "loss": 1.7039, "step": 15675 }, { "epoch": 6.28, "grad_norm": 1.4990898370742798, "learning_rate": 3.231111111111111e-05, "loss": 1.7824, "step": 15700 }, { "epoch": 6.29, "grad_norm": 1.3943312168121338, "learning_rate": 3.2274074074074074e-05, "loss": 1.7593, "step": 15725 }, { "epoch": 6.3, "grad_norm": 1.6160963773727417, "learning_rate": 3.2237037037037035e-05, "loss": 1.7873, "step": 15750 }, { "epoch": 6.31, "grad_norm": 1.569349765777588, "learning_rate": 3.2200000000000003e-05, "loss": 1.8264, "step": 15775 }, { "epoch": 6.32, "grad_norm": 1.4803683757781982, "learning_rate": 3.216444444444445e-05, "loss": 1.7451, "step": 15800 }, { "epoch": 6.33, "grad_norm": 1.760981798171997, "learning_rate": 3.212740740740741e-05, "loss": 1.791, "step": 15825 }, { "epoch": 6.34, "grad_norm": 1.3866006135940552, "learning_rate": 3.209037037037037e-05, "loss": 1.7865, "step": 15850 }, { "epoch": 6.35, "grad_norm": 1.6246627569198608, "learning_rate": 3.2053333333333334e-05, "loss": 1.8029, "step": 15875 }, { "epoch": 6.36, "grad_norm": 1.4544792175292969, "learning_rate": 3.20162962962963e-05, "loss": 1.7549, "step": 15900 }, { "epoch": 6.37, "grad_norm": 1.4172948598861694, "learning_rate": 3.197925925925926e-05, "loss": 1.7956, "step": 15925 }, { "epoch": 6.38, "grad_norm": 1.4404555559158325, "learning_rate": 3.1942222222222225e-05, "loss": 1.7294, "step": 15950 }, { "epoch": 6.39, "grad_norm": 1.4792706966400146, "learning_rate": 3.190518518518519e-05, "loss": 1.7118, "step": 15975 }, { "epoch": 6.4, "grad_norm": 1.5171838998794556, "learning_rate": 3.186814814814815e-05, "loss": 1.7392, "step": 16000 }, { "epoch": 6.41, "grad_norm": 1.5134046077728271, "learning_rate": 3.183111111111111e-05, "loss": 1.7854, "step": 16025 }, { "epoch": 6.42, "grad_norm": 1.504233479499817, "learning_rate": 3.179407407407408e-05, "loss": 1.7571, "step": 16050 }, { "epoch": 6.43, "grad_norm": 1.272483468055725, "learning_rate": 3.175703703703704e-05, "loss": 1.7626, "step": 16075 }, { "epoch": 6.44, "grad_norm": 1.471863031387329, "learning_rate": 3.172e-05, "loss": 1.7756, "step": 16100 }, { "epoch": 6.45, "grad_norm": 1.3983781337738037, "learning_rate": 3.168296296296296e-05, "loss": 1.8251, "step": 16125 }, { "epoch": 6.46, "grad_norm": 1.3490697145462036, "learning_rate": 3.164592592592593e-05, "loss": 1.7751, "step": 16150 }, { "epoch": 6.47, "grad_norm": 1.5252445936203003, "learning_rate": 3.1608888888888886e-05, "loss": 1.7565, "step": 16175 }, { "epoch": 6.48, "grad_norm": 1.688368558883667, "learning_rate": 3.1571851851851854e-05, "loss": 1.8372, "step": 16200 }, { "epoch": 6.49, "grad_norm": 1.3352307081222534, "learning_rate": 3.1534814814814816e-05, "loss": 1.7646, "step": 16225 }, { "epoch": 6.5, "grad_norm": 1.5939990282058716, "learning_rate": 3.1497777777777784e-05, "loss": 1.7409, "step": 16250 }, { "epoch": 6.51, "grad_norm": 1.3315916061401367, "learning_rate": 3.146074074074074e-05, "loss": 1.767, "step": 16275 }, { "epoch": 6.52, "grad_norm": 1.367167592048645, "learning_rate": 3.142370370370371e-05, "loss": 1.7998, "step": 16300 }, { "epoch": 6.53, "grad_norm": 1.7223109006881714, "learning_rate": 3.138666666666667e-05, "loss": 1.8021, "step": 16325 }, { "epoch": 6.54, "grad_norm": 1.4448398351669312, "learning_rate": 3.134962962962963e-05, "loss": 1.8232, "step": 16350 }, { "epoch": 6.55, "grad_norm": 1.257012128829956, "learning_rate": 3.131259259259259e-05, "loss": 1.7962, "step": 16375 }, { "epoch": 6.5600000000000005, "grad_norm": 1.2631241083145142, "learning_rate": 3.127555555555556e-05, "loss": 1.7465, "step": 16400 }, { "epoch": 6.57, "grad_norm": 1.5594916343688965, "learning_rate": 3.123851851851852e-05, "loss": 1.7723, "step": 16425 }, { "epoch": 6.58, "grad_norm": 1.6267319917678833, "learning_rate": 3.1201481481481483e-05, "loss": 1.8153, "step": 16450 }, { "epoch": 6.59, "grad_norm": 1.3760591745376587, "learning_rate": 3.1164444444444445e-05, "loss": 1.7863, "step": 16475 }, { "epoch": 6.6, "grad_norm": 1.565403699874878, "learning_rate": 3.112740740740741e-05, "loss": 1.7395, "step": 16500 }, { "epoch": 6.61, "grad_norm": 1.562738060951233, "learning_rate": 3.109037037037037e-05, "loss": 1.7866, "step": 16525 }, { "epoch": 6.62, "grad_norm": 1.7263091802597046, "learning_rate": 3.1053333333333336e-05, "loss": 1.7532, "step": 16550 }, { "epoch": 6.63, "grad_norm": 1.4510164260864258, "learning_rate": 3.10162962962963e-05, "loss": 1.7455, "step": 16575 }, { "epoch": 6.64, "grad_norm": 1.474307894706726, "learning_rate": 3.0979259259259266e-05, "loss": 1.7317, "step": 16600 }, { "epoch": 6.65, "grad_norm": 1.6146974563598633, "learning_rate": 3.094222222222222e-05, "loss": 1.8102, "step": 16625 }, { "epoch": 6.66, "grad_norm": 1.4847644567489624, "learning_rate": 3.090518518518519e-05, "loss": 1.774, "step": 16650 }, { "epoch": 6.67, "grad_norm": 1.7083697319030762, "learning_rate": 3.086814814814815e-05, "loss": 1.8375, "step": 16675 }, { "epoch": 6.68, "grad_norm": 1.3661868572235107, "learning_rate": 3.083111111111111e-05, "loss": 1.8694, "step": 16700 }, { "epoch": 6.6899999999999995, "grad_norm": 1.7693723440170288, "learning_rate": 3.0794074074074074e-05, "loss": 1.8175, "step": 16725 }, { "epoch": 6.7, "grad_norm": 1.5230166912078857, "learning_rate": 3.075703703703704e-05, "loss": 1.7785, "step": 16750 }, { "epoch": 6.71, "grad_norm": 1.4941860437393188, "learning_rate": 3.072e-05, "loss": 1.837, "step": 16775 }, { "epoch": 6.72, "grad_norm": 1.4451031684875488, "learning_rate": 3.0682962962962966e-05, "loss": 1.7919, "step": 16800 }, { "epoch": 6.73, "grad_norm": 1.6617428064346313, "learning_rate": 3.064592592592593e-05, "loss": 1.8227, "step": 16825 }, { "epoch": 6.74, "grad_norm": 1.5577620267868042, "learning_rate": 3.0608888888888895e-05, "loss": 1.7545, "step": 16850 }, { "epoch": 6.75, "grad_norm": 1.6712673902511597, "learning_rate": 3.057185185185185e-05, "loss": 1.799, "step": 16875 }, { "epoch": 6.76, "grad_norm": 1.879231572151184, "learning_rate": 3.053481481481482e-05, "loss": 1.7737, "step": 16900 }, { "epoch": 6.77, "grad_norm": 1.4019575119018555, "learning_rate": 3.049777777777778e-05, "loss": 1.7626, "step": 16925 }, { "epoch": 6.78, "grad_norm": 1.3805930614471436, "learning_rate": 3.046074074074074e-05, "loss": 1.7682, "step": 16950 }, { "epoch": 6.79, "grad_norm": 1.5736984014511108, "learning_rate": 3.0423703703703703e-05, "loss": 1.8448, "step": 16975 }, { "epoch": 6.8, "grad_norm": 1.5094330310821533, "learning_rate": 3.0386666666666668e-05, "loss": 1.8033, "step": 17000 }, { "epoch": 6.8100000000000005, "grad_norm": 1.7193862199783325, "learning_rate": 3.0349629629629633e-05, "loss": 1.7892, "step": 17025 }, { "epoch": 6.82, "grad_norm": 1.7383610010147095, "learning_rate": 3.031259259259259e-05, "loss": 1.7517, "step": 17050 }, { "epoch": 6.83, "grad_norm": 1.5743800401687622, "learning_rate": 3.0275555555555556e-05, "loss": 1.8145, "step": 17075 }, { "epoch": 6.84, "grad_norm": 1.6040315628051758, "learning_rate": 3.023851851851852e-05, "loss": 1.7887, "step": 17100 }, { "epoch": 6.85, "grad_norm": 1.7475184202194214, "learning_rate": 3.020148148148148e-05, "loss": 1.7959, "step": 17125 }, { "epoch": 6.86, "grad_norm": 1.4680596590042114, "learning_rate": 3.0164444444444444e-05, "loss": 1.7865, "step": 17150 }, { "epoch": 6.87, "grad_norm": 1.5900845527648926, "learning_rate": 3.012740740740741e-05, "loss": 1.8171, "step": 17175 }, { "epoch": 6.88, "grad_norm": 1.5023775100708008, "learning_rate": 3.0090370370370374e-05, "loss": 1.7596, "step": 17200 }, { "epoch": 6.89, "grad_norm": 1.5605974197387695, "learning_rate": 3.0053333333333332e-05, "loss": 1.7812, "step": 17225 }, { "epoch": 6.9, "grad_norm": 1.612432837486267, "learning_rate": 3.0016296296296297e-05, "loss": 1.8062, "step": 17250 }, { "epoch": 6.91, "grad_norm": 1.664870262145996, "learning_rate": 2.9979259259259262e-05, "loss": 1.7445, "step": 17275 }, { "epoch": 6.92, "grad_norm": 1.4798489809036255, "learning_rate": 2.9943703703703708e-05, "loss": 1.7788, "step": 17300 }, { "epoch": 6.93, "grad_norm": 1.5232810974121094, "learning_rate": 2.9906666666666666e-05, "loss": 1.8383, "step": 17325 }, { "epoch": 6.9399999999999995, "grad_norm": 1.4982616901397705, "learning_rate": 2.986962962962963e-05, "loss": 1.7302, "step": 17350 }, { "epoch": 6.95, "grad_norm": 1.4312171936035156, "learning_rate": 2.9832592592592596e-05, "loss": 1.7573, "step": 17375 }, { "epoch": 6.96, "grad_norm": 1.3876653909683228, "learning_rate": 2.9795555555555554e-05, "loss": 1.7684, "step": 17400 }, { "epoch": 6.97, "grad_norm": 1.6542433500289917, "learning_rate": 2.975851851851852e-05, "loss": 1.7529, "step": 17425 }, { "epoch": 6.98, "grad_norm": 1.4732959270477295, "learning_rate": 2.9721481481481484e-05, "loss": 1.8128, "step": 17450 }, { "epoch": 6.99, "grad_norm": 1.4124960899353027, "learning_rate": 2.968444444444445e-05, "loss": 1.8079, "step": 17475 }, { "epoch": 7.0, "grad_norm": 1.466747760772705, "learning_rate": 2.9647407407407407e-05, "loss": 1.8214, "step": 17500 }, { "epoch": 7.0, "eval_gen_len": 13.125, "eval_loss": 1.9262617826461792, "eval_rouge1": 51.5538, "eval_rouge2": 25.7728, "eval_rougeL": 47.3812, "eval_rougeLsum": 47.3727, "eval_runtime": 196.7996, "eval_samples_per_second": 101.626, "eval_steps_per_second": 1.59, "step": 17500 } ], "logging_steps": 25, "max_steps": 37500, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5719030035841024e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }