{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3895, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012836970474967908, "grad_norm": 76.69837951660156, "learning_rate": 1.4000000000000001e-06, "loss": 14.1357, "step": 10 }, { "epoch": 0.025673940949935817, "grad_norm": 78.64288330078125, "learning_rate": 3.4000000000000005e-06, "loss": 13.9361, "step": 20 }, { "epoch": 0.038510911424903725, "grad_norm": null, "learning_rate": 5.2e-06, "loss": 13.2456, "step": 30 }, { "epoch": 0.051347881899871634, "grad_norm": 84.45552825927734, "learning_rate": 7.2e-06, "loss": 12.3189, "step": 40 }, { "epoch": 0.06418485237483953, "grad_norm": 90.85704040527344, "learning_rate": 9e-06, "loss": 10.2159, "step": 50 }, { "epoch": 0.07702182284980745, "grad_norm": 90.75808715820312, "learning_rate": 1.1000000000000001e-05, "loss": 8.5353, "step": 60 }, { "epoch": 0.08985879332477535, "grad_norm": 77.908447265625, "learning_rate": 1.3000000000000001e-05, "loss": 5.7586, "step": 70 }, { "epoch": 0.10269576379974327, "grad_norm": 51.35749435424805, "learning_rate": 1.5e-05, "loss": 3.5377, "step": 80 }, { "epoch": 0.11553273427471117, "grad_norm": 22.19648551940918, "learning_rate": 1.7000000000000003e-05, "loss": 2.0392, "step": 90 }, { "epoch": 0.12836970474967907, "grad_norm": 4.810085773468018, "learning_rate": 1.9e-05, "loss": 1.3851, "step": 100 }, { "epoch": 0.14120667522464697, "grad_norm": 3.834592342376709, "learning_rate": 2.1e-05, "loss": 0.9423, "step": 110 }, { "epoch": 0.1540436456996149, "grad_norm": 2.0345306396484375, "learning_rate": 2.3000000000000003e-05, "loss": 0.5697, "step": 120 }, { "epoch": 0.1668806161745828, "grad_norm": 0.8915618658065796, "learning_rate": 2.5e-05, "loss": 0.3697, "step": 130 }, { "epoch": 0.1797175866495507, "grad_norm": 0.4180503189563751, "learning_rate": 2.7000000000000002e-05, "loss": 0.2708, "step": 140 }, 
{ "epoch": 0.1925545571245186, "grad_norm": 0.3554935157299042, "learning_rate": 2.9e-05, "loss": 0.2456, "step": 150 }, { "epoch": 0.20539152759948653, "grad_norm": 0.30976223945617676, "learning_rate": 3.1e-05, "loss": 0.2336, "step": 160 }, { "epoch": 0.21822849807445444, "grad_norm": 0.40389877557754517, "learning_rate": 3.3e-05, "loss": 0.2096, "step": 170 }, { "epoch": 0.23106546854942234, "grad_norm": 0.2854978144168854, "learning_rate": 3.5e-05, "loss": 0.198, "step": 180 }, { "epoch": 0.24390243902439024, "grad_norm": 0.22607630491256714, "learning_rate": 3.7e-05, "loss": 0.1828, "step": 190 }, { "epoch": 0.25673940949935814, "grad_norm": 0.3487206697463989, "learning_rate": 3.9000000000000006e-05, "loss": 0.1848, "step": 200 }, { "epoch": 0.26957637997432604, "grad_norm": 0.27090469002723694, "learning_rate": 4.1e-05, "loss": 0.171, "step": 210 }, { "epoch": 0.28241335044929394, "grad_norm": 0.2992759644985199, "learning_rate": 4.3e-05, "loss": 0.1575, "step": 220 }, { "epoch": 0.2952503209242619, "grad_norm": 0.25752368569374084, "learning_rate": 4.5e-05, "loss": 0.1585, "step": 230 }, { "epoch": 0.3080872913992298, "grad_norm": 0.1718735545873642, "learning_rate": 4.7e-05, "loss": 0.1648, "step": 240 }, { "epoch": 0.3209242618741977, "grad_norm": 0.23267413675785065, "learning_rate": 4.9e-05, "loss": 0.1536, "step": 250 }, { "epoch": 0.3337612323491656, "grad_norm": 0.2425757646560669, "learning_rate": 5.1000000000000006e-05, "loss": 0.1499, "step": 260 }, { "epoch": 0.3465982028241335, "grad_norm": 0.30258190631866455, "learning_rate": 5.300000000000001e-05, "loss": 0.1425, "step": 270 }, { "epoch": 0.3594351732991014, "grad_norm": 0.21171259880065918, "learning_rate": 5.500000000000001e-05, "loss": 0.1561, "step": 280 }, { "epoch": 0.3722721437740693, "grad_norm": 0.15556274354457855, "learning_rate": 5.6999999999999996e-05, "loss": 0.1482, "step": 290 }, { "epoch": 0.3851091142490372, "grad_norm": 0.13324399292469025, "learning_rate": 5.9e-05, 
"loss": 0.1384, "step": 300 }, { "epoch": 0.3979460847240051, "grad_norm": 0.2825668156147003, "learning_rate": 6.1e-05, "loss": 0.1484, "step": 310 }, { "epoch": 0.41078305519897307, "grad_norm": 0.20607219636440277, "learning_rate": 6.3e-05, "loss": 0.1528, "step": 320 }, { "epoch": 0.42362002567394097, "grad_norm": 0.17047886550426483, "learning_rate": 6.500000000000001e-05, "loss": 0.1354, "step": 330 }, { "epoch": 0.43645699614890887, "grad_norm": 0.15508385002613068, "learning_rate": 6.7e-05, "loss": 0.1432, "step": 340 }, { "epoch": 0.4492939666238768, "grad_norm": 0.14523576200008392, "learning_rate": 6.9e-05, "loss": 0.1343, "step": 350 }, { "epoch": 0.4621309370988447, "grad_norm": 0.14676764607429504, "learning_rate": 7.1e-05, "loss": 0.1414, "step": 360 }, { "epoch": 0.4749679075738126, "grad_norm": 0.166632741689682, "learning_rate": 7.3e-05, "loss": 0.1315, "step": 370 }, { "epoch": 0.4878048780487805, "grad_norm": 0.13161858916282654, "learning_rate": 7.500000000000001e-05, "loss": 0.1305, "step": 380 }, { "epoch": 0.5006418485237484, "grad_norm": 0.19676858186721802, "learning_rate": 7.7e-05, "loss": 0.1356, "step": 390 }, { "epoch": 0.5134788189987163, "grad_norm": 0.12109082192182541, "learning_rate": 7.900000000000001e-05, "loss": 0.133, "step": 400 }, { "epoch": 0.5263157894736842, "grad_norm": 0.16240255534648895, "learning_rate": 8.1e-05, "loss": 0.141, "step": 410 }, { "epoch": 0.5391527599486521, "grad_norm": 0.16620230674743652, "learning_rate": 8.3e-05, "loss": 0.1308, "step": 420 }, { "epoch": 0.55198973042362, "grad_norm": 0.14271290600299835, "learning_rate": 8.5e-05, "loss": 0.1258, "step": 430 }, { "epoch": 0.5648267008985879, "grad_norm": 0.1443270742893219, "learning_rate": 8.7e-05, "loss": 0.1312, "step": 440 }, { "epoch": 0.5776636713735558, "grad_norm": 0.15797071158885956, "learning_rate": 8.900000000000001e-05, "loss": 0.1286, "step": 450 }, { "epoch": 0.5905006418485238, "grad_norm": 0.19477838277816772, "learning_rate": 
9.1e-05, "loss": 0.1419, "step": 460 }, { "epoch": 0.6033376123234917, "grad_norm": 0.1290614753961563, "learning_rate": 9.300000000000001e-05, "loss": 0.129, "step": 470 }, { "epoch": 0.6161745827984596, "grad_norm": 0.12268809229135513, "learning_rate": 9.5e-05, "loss": 0.1232, "step": 480 }, { "epoch": 0.6290115532734275, "grad_norm": 0.1401013731956482, "learning_rate": 9.7e-05, "loss": 0.1229, "step": 490 }, { "epoch": 0.6418485237483954, "grad_norm": 0.11726848036050797, "learning_rate": 9.900000000000001e-05, "loss": 0.1178, "step": 500 }, { "epoch": 0.6418485237483954, "eval_gen_len": 38.946, "eval_loss": 0.10930793732404709, "eval_rouge1": 0.5574, "eval_rouge2": 0.2575, "eval_rougeL": 0.5149, "eval_runtime": 45.7535, "eval_samples_per_second": 10.928, "eval_steps_per_second": 1.377, "step": 500 }, { "epoch": 0.6546854942233633, "grad_norm": 0.13438373804092407, "learning_rate": 9.985272459499264e-05, "loss": 0.1331, "step": 510 }, { "epoch": 0.6675224646983312, "grad_norm": 0.12564225494861603, "learning_rate": 9.955817378497792e-05, "loss": 0.1257, "step": 520 }, { "epoch": 0.6803594351732991, "grad_norm": 0.11165695637464523, "learning_rate": 9.926362297496319e-05, "loss": 0.1255, "step": 530 }, { "epoch": 0.693196405648267, "grad_norm": 0.16142414510250092, "learning_rate": 9.896907216494846e-05, "loss": 0.1384, "step": 540 }, { "epoch": 0.7060333761232349, "grad_norm": 0.15439870953559875, "learning_rate": 9.867452135493374e-05, "loss": 0.1188, "step": 550 }, { "epoch": 0.7188703465982028, "grad_norm": 0.1158837303519249, "learning_rate": 9.8379970544919e-05, "loss": 0.1256, "step": 560 }, { "epoch": 0.7317073170731707, "grad_norm": 0.1260393112897873, "learning_rate": 9.808541973490427e-05, "loss": 0.1327, "step": 570 }, { "epoch": 0.7445442875481386, "grad_norm": 0.12582792341709137, "learning_rate": 9.779086892488954e-05, "loss": 0.1244, "step": 580 }, { "epoch": 0.7573812580231065, "grad_norm": 0.967321515083313, "learning_rate": 
9.749631811487482e-05, "loss": 0.1283, "step": 590 }, { "epoch": 0.7702182284980744, "grad_norm": 0.14996884763240814, "learning_rate": 9.720176730486009e-05, "loss": 0.1378, "step": 600 }, { "epoch": 0.7830551989730423, "grad_norm": 0.13016603887081146, "learning_rate": 9.690721649484537e-05, "loss": 0.1249, "step": 610 }, { "epoch": 0.7958921694480102, "grad_norm": 0.11623676121234894, "learning_rate": 9.661266568483064e-05, "loss": 0.135, "step": 620 }, { "epoch": 0.8087291399229781, "grad_norm": 0.14658813178539276, "learning_rate": 9.631811487481591e-05, "loss": 0.1176, "step": 630 }, { "epoch": 0.8215661103979461, "grad_norm": 0.13376665115356445, "learning_rate": 9.602356406480119e-05, "loss": 0.1208, "step": 640 }, { "epoch": 0.834403080872914, "grad_norm": 0.13538646697998047, "learning_rate": 9.572901325478646e-05, "loss": 0.1192, "step": 650 }, { "epoch": 0.8472400513478819, "grad_norm": 0.13593578338623047, "learning_rate": 9.543446244477173e-05, "loss": 0.1227, "step": 660 }, { "epoch": 0.8600770218228498, "grad_norm": 0.13085196912288666, "learning_rate": 9.5139911634757e-05, "loss": 0.1275, "step": 670 }, { "epoch": 0.8729139922978177, "grad_norm": 0.1571151316165924, "learning_rate": 9.484536082474227e-05, "loss": 0.1274, "step": 680 }, { "epoch": 0.8857509627727856, "grad_norm": 0.14053873717784882, "learning_rate": 9.455081001472754e-05, "loss": 0.1205, "step": 690 }, { "epoch": 0.8985879332477535, "grad_norm": 0.11424735933542252, "learning_rate": 9.425625920471282e-05, "loss": 0.1182, "step": 700 }, { "epoch": 0.9114249037227214, "grad_norm": 0.1393001526594162, "learning_rate": 9.396170839469809e-05, "loss": 0.1229, "step": 710 }, { "epoch": 0.9242618741976893, "grad_norm": 0.1388111263513565, "learning_rate": 9.366715758468336e-05, "loss": 0.1274, "step": 720 }, { "epoch": 0.9370988446726572, "grad_norm": 0.11912015080451965, "learning_rate": 9.337260677466864e-05, "loss": 0.118, "step": 730 }, { "epoch": 0.9499358151476252, "grad_norm": 
0.15506206452846527, "learning_rate": 9.307805596465391e-05, "loss": 0.1258, "step": 740 }, { "epoch": 0.962772785622593, "grad_norm": 0.158128559589386, "learning_rate": 9.278350515463918e-05, "loss": 0.1229, "step": 750 }, { "epoch": 0.975609756097561, "grad_norm": 0.16017945110797882, "learning_rate": 9.248895434462446e-05, "loss": 0.1239, "step": 760 }, { "epoch": 0.9884467265725289, "grad_norm": 0.1304791122674942, "learning_rate": 9.219440353460973e-05, "loss": 0.1122, "step": 770 }, { "epoch": 1.0012836970474968, "grad_norm": 0.12392584979534149, "learning_rate": 9.189985272459499e-05, "loss": 0.127, "step": 780 }, { "epoch": 1.0141206675224648, "grad_norm": 0.10105091333389282, "learning_rate": 9.160530191458027e-05, "loss": 0.1155, "step": 790 }, { "epoch": 1.0269576379974326, "grad_norm": 0.10798871517181396, "learning_rate": 9.131075110456554e-05, "loss": 0.1263, "step": 800 }, { "epoch": 1.0397946084724006, "grad_norm": 0.1646156907081604, "learning_rate": 9.101620029455081e-05, "loss": 0.1135, "step": 810 }, { "epoch": 1.0526315789473684, "grad_norm": 0.13659092783927917, "learning_rate": 9.072164948453609e-05, "loss": 0.118, "step": 820 }, { "epoch": 1.0654685494223364, "grad_norm": 0.1449888050556183, "learning_rate": 9.042709867452136e-05, "loss": 0.1252, "step": 830 }, { "epoch": 1.0783055198973042, "grad_norm": 0.14084464311599731, "learning_rate": 9.013254786450663e-05, "loss": 0.1254, "step": 840 }, { "epoch": 1.0911424903722722, "grad_norm": 0.14235644042491913, "learning_rate": 8.983799705449191e-05, "loss": 0.1169, "step": 850 }, { "epoch": 1.10397946084724, "grad_norm": 0.13668040931224823, "learning_rate": 8.954344624447718e-05, "loss": 0.1199, "step": 860 }, { "epoch": 1.116816431322208, "grad_norm": 0.20514467358589172, "learning_rate": 8.924889543446246e-05, "loss": 0.118, "step": 870 }, { "epoch": 1.1296534017971758, "grad_norm": 0.13541142642498016, "learning_rate": 8.895434462444772e-05, "loss": 0.129, "step": 880 }, { "epoch": 
1.1424903722721438, "grad_norm": 0.11607542634010315, "learning_rate": 8.865979381443299e-05, "loss": 0.1226, "step": 890 }, { "epoch": 1.1553273427471118, "grad_norm": 0.134485125541687, "learning_rate": 8.836524300441826e-05, "loss": 0.1166, "step": 900 }, { "epoch": 1.1681643132220796, "grad_norm": 0.12031760811805725, "learning_rate": 8.807069219440354e-05, "loss": 0.1118, "step": 910 }, { "epoch": 1.1810012836970474, "grad_norm": 0.14312677085399628, "learning_rate": 8.777614138438881e-05, "loss": 0.1271, "step": 920 }, { "epoch": 1.1938382541720154, "grad_norm": 0.1250048577785492, "learning_rate": 8.748159057437408e-05, "loss": 0.1116, "step": 930 }, { "epoch": 1.2066752246469834, "grad_norm": 0.12448818981647491, "learning_rate": 8.718703976435936e-05, "loss": 0.12, "step": 940 }, { "epoch": 1.2195121951219512, "grad_norm": 0.1207614317536354, "learning_rate": 8.689248895434463e-05, "loss": 0.1142, "step": 950 }, { "epoch": 1.2323491655969192, "grad_norm": 0.12647990882396698, "learning_rate": 8.65979381443299e-05, "loss": 0.1151, "step": 960 }, { "epoch": 1.245186136071887, "grad_norm": 0.12347666174173355, "learning_rate": 8.630338733431518e-05, "loss": 0.1217, "step": 970 }, { "epoch": 1.258023106546855, "grad_norm": 0.16950395703315735, "learning_rate": 8.600883652430045e-05, "loss": 0.1218, "step": 980 }, { "epoch": 1.2708600770218228, "grad_norm": 0.12122233211994171, "learning_rate": 8.571428571428571e-05, "loss": 0.1139, "step": 990 }, { "epoch": 1.2836970474967908, "grad_norm": 0.1224595233798027, "learning_rate": 8.541973490427099e-05, "loss": 0.1162, "step": 1000 }, { "epoch": 1.2836970474967908, "eval_gen_len": 38.946, "eval_loss": 0.10218328982591629, "eval_rouge1": 0.5746, "eval_rouge2": 0.2819, "eval_rougeL": 0.5376, "eval_runtime": 45.0734, "eval_samples_per_second": 11.093, "eval_steps_per_second": 1.398, "step": 1000 }, { "epoch": 1.2965340179717586, "grad_norm": 0.0963892713189125, "learning_rate": 8.512518409425626e-05, "loss": 0.1222, 
"step": 1010 }, { "epoch": 1.3093709884467266, "grad_norm": 0.1362753063440323, "learning_rate": 8.483063328424153e-05, "loss": 0.1145, "step": 1020 }, { "epoch": 1.3222079589216944, "grad_norm": 0.09700595587491989, "learning_rate": 8.453608247422681e-05, "loss": 0.1185, "step": 1030 }, { "epoch": 1.3350449293966624, "grad_norm": 0.13865801692008972, "learning_rate": 8.424153166421208e-05, "loss": 0.1228, "step": 1040 }, { "epoch": 1.3478818998716302, "grad_norm": 0.11458808183670044, "learning_rate": 8.394698085419735e-05, "loss": 0.1092, "step": 1050 }, { "epoch": 1.3607188703465982, "grad_norm": 0.12909230589866638, "learning_rate": 8.365243004418263e-05, "loss": 0.1153, "step": 1060 }, { "epoch": 1.3735558408215662, "grad_norm": 0.13270749151706696, "learning_rate": 8.33578792341679e-05, "loss": 0.1207, "step": 1070 }, { "epoch": 1.386392811296534, "grad_norm": 0.1388499140739441, "learning_rate": 8.306332842415318e-05, "loss": 0.1271, "step": 1080 }, { "epoch": 1.3992297817715018, "grad_norm": 0.11819977313280106, "learning_rate": 8.276877761413844e-05, "loss": 0.1221, "step": 1090 }, { "epoch": 1.4120667522464698, "grad_norm": 0.13087821006774902, "learning_rate": 8.247422680412371e-05, "loss": 0.1092, "step": 1100 }, { "epoch": 1.4249037227214378, "grad_norm": 0.14269869029521942, "learning_rate": 8.217967599410898e-05, "loss": 0.1323, "step": 1110 }, { "epoch": 1.4377406931964056, "grad_norm": 0.11728661507368088, "learning_rate": 8.188512518409426e-05, "loss": 0.116, "step": 1120 }, { "epoch": 1.4505776636713734, "grad_norm": 0.12322080880403519, "learning_rate": 8.159057437407953e-05, "loss": 0.1162, "step": 1130 }, { "epoch": 1.4634146341463414, "grad_norm": 0.14137819409370422, "learning_rate": 8.12960235640648e-05, "loss": 0.1166, "step": 1140 }, { "epoch": 1.4762516046213094, "grad_norm": 0.11483679711818695, "learning_rate": 8.100147275405008e-05, "loss": 0.113, "step": 1150 }, { "epoch": 1.4890885750962772, "grad_norm": 0.107373908162117, 
"learning_rate": 8.070692194403535e-05, "loss": 0.1086, "step": 1160 }, { "epoch": 1.501925545571245, "grad_norm": 0.1413784623146057, "learning_rate": 8.041237113402063e-05, "loss": 0.1127, "step": 1170 }, { "epoch": 1.514762516046213, "grad_norm": 0.15129907429218292, "learning_rate": 8.01178203240059e-05, "loss": 0.1094, "step": 1180 }, { "epoch": 1.527599486521181, "grad_norm": 0.11779375374317169, "learning_rate": 7.982326951399117e-05, "loss": 0.1113, "step": 1190 }, { "epoch": 1.540436456996149, "grad_norm": 0.11644338816404343, "learning_rate": 7.952871870397643e-05, "loss": 0.1242, "step": 1200 }, { "epoch": 1.5532734274711169, "grad_norm": 0.14850880205631256, "learning_rate": 7.923416789396171e-05, "loss": 0.1192, "step": 1210 }, { "epoch": 1.5661103979460846, "grad_norm": 0.11069446802139282, "learning_rate": 7.893961708394698e-05, "loss": 0.1043, "step": 1220 }, { "epoch": 1.5789473684210527, "grad_norm": 0.0926310271024704, "learning_rate": 7.864506627393225e-05, "loss": 0.1218, "step": 1230 }, { "epoch": 1.5917843388960207, "grad_norm": 0.1573435515165329, "learning_rate": 7.835051546391753e-05, "loss": 0.1181, "step": 1240 }, { "epoch": 1.6046213093709885, "grad_norm": 0.11641980707645416, "learning_rate": 7.80559646539028e-05, "loss": 0.1206, "step": 1250 }, { "epoch": 1.6174582798459562, "grad_norm": 0.119265116751194, "learning_rate": 7.776141384388808e-05, "loss": 0.1148, "step": 1260 }, { "epoch": 1.6302952503209243, "grad_norm": 0.12308722734451294, "learning_rate": 7.746686303387335e-05, "loss": 0.1132, "step": 1270 }, { "epoch": 1.6431322207958923, "grad_norm": 0.12015580385923386, "learning_rate": 7.717231222385862e-05, "loss": 0.113, "step": 1280 }, { "epoch": 1.65596919127086, "grad_norm": 0.09488837420940399, "learning_rate": 7.68777614138439e-05, "loss": 0.112, "step": 1290 }, { "epoch": 1.6688061617458279, "grad_norm": 0.11132404953241348, "learning_rate": 7.658321060382917e-05, "loss": 0.1085, "step": 1300 }, { "epoch": 
1.6816431322207959, "grad_norm": 0.15267649292945862, "learning_rate": 7.628865979381443e-05, "loss": 0.1136, "step": 1310 }, { "epoch": 1.6944801026957639, "grad_norm": 0.13199123740196228, "learning_rate": 7.59941089837997e-05, "loss": 0.1145, "step": 1320 }, { "epoch": 1.7073170731707317, "grad_norm": 0.11657251417636871, "learning_rate": 7.569955817378498e-05, "loss": 0.111, "step": 1330 }, { "epoch": 1.7201540436456995, "grad_norm": 0.15839266777038574, "learning_rate": 7.540500736377025e-05, "loss": 0.1166, "step": 1340 }, { "epoch": 1.7329910141206675, "grad_norm": 0.10701854526996613, "learning_rate": 7.511045655375553e-05, "loss": 0.1127, "step": 1350 }, { "epoch": 1.7458279845956355, "grad_norm": 0.16936787962913513, "learning_rate": 7.48159057437408e-05, "loss": 0.113, "step": 1360 }, { "epoch": 1.7586649550706035, "grad_norm": 0.10048405826091766, "learning_rate": 7.452135493372607e-05, "loss": 0.1139, "step": 1370 }, { "epoch": 1.7715019255455713, "grad_norm": 0.11125636100769043, "learning_rate": 7.422680412371135e-05, "loss": 0.1156, "step": 1380 }, { "epoch": 1.784338896020539, "grad_norm": 0.10491804033517838, "learning_rate": 7.393225331369662e-05, "loss": 0.1227, "step": 1390 }, { "epoch": 1.797175866495507, "grad_norm": 0.1252523511648178, "learning_rate": 7.36377025036819e-05, "loss": 0.1258, "step": 1400 }, { "epoch": 1.810012836970475, "grad_norm": 0.10493435710668564, "learning_rate": 7.334315169366715e-05, "loss": 0.1153, "step": 1410 }, { "epoch": 1.822849807445443, "grad_norm": 0.11997751146554947, "learning_rate": 7.304860088365243e-05, "loss": 0.1171, "step": 1420 }, { "epoch": 1.8356867779204107, "grad_norm": 0.10238035768270493, "learning_rate": 7.27540500736377e-05, "loss": 0.1144, "step": 1430 }, { "epoch": 1.8485237483953787, "grad_norm": 0.12434195727109909, "learning_rate": 7.245949926362298e-05, "loss": 0.1212, "step": 1440 }, { "epoch": 1.8613607188703467, "grad_norm": 0.09716463088989258, "learning_rate": 
7.216494845360825e-05, "loss": 0.1161, "step": 1450 }, { "epoch": 1.8741976893453145, "grad_norm": 0.18025720119476318, "learning_rate": 7.187039764359352e-05, "loss": 0.1191, "step": 1460 }, { "epoch": 1.8870346598202823, "grad_norm": 0.10300459712743759, "learning_rate": 7.15758468335788e-05, "loss": 0.1161, "step": 1470 }, { "epoch": 1.8998716302952503, "grad_norm": 0.11518652737140656, "learning_rate": 7.128129602356407e-05, "loss": 0.1062, "step": 1480 }, { "epoch": 1.9127086007702183, "grad_norm": 0.10772902518510818, "learning_rate": 7.098674521354934e-05, "loss": 0.114, "step": 1490 }, { "epoch": 1.925545571245186, "grad_norm": 0.1605733186006546, "learning_rate": 7.069219440353462e-05, "loss": 0.1106, "step": 1500 }, { "epoch": 1.925545571245186, "eval_gen_len": 38.946, "eval_loss": 0.09927912056446075, "eval_rouge1": 0.5823, "eval_rouge2": 0.2956, "eval_rougeL": 0.5489, "eval_runtime": 45.4027, "eval_samples_per_second": 11.013, "eval_steps_per_second": 1.388, "step": 1500 }, { "epoch": 1.938382541720154, "grad_norm": 0.12198202311992645, "learning_rate": 7.039764359351989e-05, "loss": 0.1183, "step": 1510 }, { "epoch": 1.951219512195122, "grad_norm": 0.1133396103978157, "learning_rate": 7.010309278350515e-05, "loss": 0.1188, "step": 1520 }, { "epoch": 1.96405648267009, "grad_norm": 0.10706740617752075, "learning_rate": 6.980854197349043e-05, "loss": 0.1161, "step": 1530 }, { "epoch": 1.976893453145058, "grad_norm": 0.10656236857175827, "learning_rate": 6.95139911634757e-05, "loss": 0.1113, "step": 1540 }, { "epoch": 1.9897304236200257, "grad_norm": 0.1278591752052307, "learning_rate": 6.921944035346097e-05, "loss": 0.1262, "step": 1550 }, { "epoch": 2.0025673940949935, "grad_norm": 0.11729343235492706, "learning_rate": 6.892488954344625e-05, "loss": 0.1267, "step": 1560 }, { "epoch": 2.0154043645699615, "grad_norm": 0.10659424960613251, "learning_rate": 6.863033873343152e-05, "loss": 0.0972, "step": 1570 }, { "epoch": 2.0282413350449295, "grad_norm": 
0.13483816385269165, "learning_rate": 6.83357879234168e-05, "loss": 0.1075, "step": 1580 }, { "epoch": 2.041078305519897, "grad_norm": 0.11035773903131485, "learning_rate": 6.804123711340207e-05, "loss": 0.1041, "step": 1590 }, { "epoch": 2.053915275994865, "grad_norm": 0.11331629753112793, "learning_rate": 6.774668630338734e-05, "loss": 0.1107, "step": 1600 }, { "epoch": 2.066752246469833, "grad_norm": 0.10604571551084518, "learning_rate": 6.745213549337262e-05, "loss": 0.113, "step": 1610 }, { "epoch": 2.079589216944801, "grad_norm": 0.1317732185125351, "learning_rate": 6.715758468335788e-05, "loss": 0.1205, "step": 1620 }, { "epoch": 2.092426187419769, "grad_norm": 0.1633516103029251, "learning_rate": 6.686303387334315e-05, "loss": 0.1213, "step": 1630 }, { "epoch": 2.1052631578947367, "grad_norm": 0.1562272608280182, "learning_rate": 6.656848306332842e-05, "loss": 0.1082, "step": 1640 }, { "epoch": 2.1181001283697047, "grad_norm": 0.17369812726974487, "learning_rate": 6.62739322533137e-05, "loss": 0.1042, "step": 1650 }, { "epoch": 2.1309370988446728, "grad_norm": 0.12046299129724503, "learning_rate": 6.597938144329897e-05, "loss": 0.1097, "step": 1660 }, { "epoch": 2.1437740693196408, "grad_norm": 0.11788310110569, "learning_rate": 6.568483063328424e-05, "loss": 0.1161, "step": 1670 }, { "epoch": 2.1566110397946083, "grad_norm": 0.11287978291511536, "learning_rate": 6.539027982326952e-05, "loss": 0.1196, "step": 1680 }, { "epoch": 2.1694480102695763, "grad_norm": 0.1154395118355751, "learning_rate": 6.509572901325479e-05, "loss": 0.1145, "step": 1690 }, { "epoch": 2.1822849807445444, "grad_norm": 0.10837022215127945, "learning_rate": 6.480117820324007e-05, "loss": 0.1167, "step": 1700 }, { "epoch": 2.1951219512195124, "grad_norm": 0.11331544071435928, "learning_rate": 6.450662739322534e-05, "loss": 0.1115, "step": 1710 }, { "epoch": 2.20795892169448, "grad_norm": 0.11895423382520676, "learning_rate": 6.421207658321061e-05, "loss": 0.1102, "step": 1720 }, { 
"epoch": 2.220795892169448, "grad_norm": 0.14609269797801971, "learning_rate": 6.391752577319587e-05, "loss": 0.1133, "step": 1730 }, { "epoch": 2.233632862644416, "grad_norm": 0.13491936028003693, "learning_rate": 6.362297496318115e-05, "loss": 0.1072, "step": 1740 }, { "epoch": 2.246469833119384, "grad_norm": 0.13540756702423096, "learning_rate": 6.332842415316642e-05, "loss": 0.114, "step": 1750 }, { "epoch": 2.2593068035943515, "grad_norm": 0.15043611824512482, "learning_rate": 6.30338733431517e-05, "loss": 0.1132, "step": 1760 }, { "epoch": 2.2721437740693196, "grad_norm": 0.14203940331935883, "learning_rate": 6.273932253313697e-05, "loss": 0.1142, "step": 1770 }, { "epoch": 2.2849807445442876, "grad_norm": 0.11540842801332474, "learning_rate": 6.244477172312224e-05, "loss": 0.1055, "step": 1780 }, { "epoch": 2.2978177150192556, "grad_norm": 0.1832491159439087, "learning_rate": 6.215022091310752e-05, "loss": 0.1177, "step": 1790 }, { "epoch": 2.3106546854942236, "grad_norm": 0.11431492120027542, "learning_rate": 6.185567010309279e-05, "loss": 0.1058, "step": 1800 }, { "epoch": 2.323491655969191, "grad_norm": 0.12939855456352234, "learning_rate": 6.156111929307806e-05, "loss": 0.1122, "step": 1810 }, { "epoch": 2.336328626444159, "grad_norm": 0.12890468537807465, "learning_rate": 6.126656848306334e-05, "loss": 0.1127, "step": 1820 }, { "epoch": 2.349165596919127, "grad_norm": 0.11087578535079956, "learning_rate": 6.097201767304861e-05, "loss": 0.1106, "step": 1830 }, { "epoch": 2.3620025673940948, "grad_norm": 0.12481900304555893, "learning_rate": 6.067746686303387e-05, "loss": 0.1155, "step": 1840 }, { "epoch": 2.3748395378690628, "grad_norm": 0.11346503347158432, "learning_rate": 6.0382916053019144e-05, "loss": 0.1095, "step": 1850 }, { "epoch": 2.387676508344031, "grad_norm": 0.13271845877170563, "learning_rate": 6.008836524300442e-05, "loss": 0.1035, "step": 1860 }, { "epoch": 2.400513478818999, "grad_norm": 0.12727977335453033, "learning_rate": 
5.979381443298969e-05, "loss": 0.1163, "step": 1870 }, { "epoch": 2.413350449293967, "grad_norm": 0.11618130654096603, "learning_rate": 5.9499263622974965e-05, "loss": 0.1127, "step": 1880 }, { "epoch": 2.4261874197689344, "grad_norm": 0.12864802777767181, "learning_rate": 5.920471281296024e-05, "loss": 0.1137, "step": 1890 }, { "epoch": 2.4390243902439024, "grad_norm": 0.10020538419485092, "learning_rate": 5.891016200294551e-05, "loss": 0.1054, "step": 1900 }, { "epoch": 2.4518613607188704, "grad_norm": 0.25161072611808777, "learning_rate": 5.861561119293079e-05, "loss": 0.107, "step": 1910 }, { "epoch": 2.4646983311938384, "grad_norm": 0.11326297372579575, "learning_rate": 5.832106038291606e-05, "loss": 0.1132, "step": 1920 }, { "epoch": 2.477535301668806, "grad_norm": 0.28271356225013733, "learning_rate": 5.8026509572901334e-05, "loss": 0.1121, "step": 1930 }, { "epoch": 2.490372272143774, "grad_norm": 0.10053003579378128, "learning_rate": 5.7731958762886594e-05, "loss": 0.1092, "step": 1940 }, { "epoch": 2.503209242618742, "grad_norm": 0.1034500002861023, "learning_rate": 5.743740795287187e-05, "loss": 0.1111, "step": 1950 }, { "epoch": 2.51604621309371, "grad_norm": 0.12638093531131744, "learning_rate": 5.714285714285714e-05, "loss": 0.1122, "step": 1960 }, { "epoch": 2.528883183568678, "grad_norm": 0.12756876647472382, "learning_rate": 5.6848306332842415e-05, "loss": 0.1053, "step": 1970 }, { "epoch": 2.5417201540436456, "grad_norm": 0.14028741419315338, "learning_rate": 5.655375552282769e-05, "loss": 0.1143, "step": 1980 }, { "epoch": 2.5545571245186136, "grad_norm": 0.1252150535583496, "learning_rate": 5.625920471281296e-05, "loss": 0.1172, "step": 1990 }, { "epoch": 2.5673940949935816, "grad_norm": 0.09696774929761887, "learning_rate": 5.596465390279824e-05, "loss": 0.1047, "step": 2000 }, { "epoch": 2.5673940949935816, "eval_gen_len": 38.946, "eval_loss": 0.09703746438026428, "eval_rouge1": 0.5894, "eval_rouge2": 0.3049, "eval_rougeL": 0.5547, 
"eval_runtime": 45.1347, "eval_samples_per_second": 11.078, "eval_steps_per_second": 1.396, "step": 2000 }, { "epoch": 2.580231065468549, "grad_norm": 0.13524310290813446, "learning_rate": 5.567010309278351e-05, "loss": 0.1112, "step": 2010 }, { "epoch": 2.593068035943517, "grad_norm": 0.10453896969556808, "learning_rate": 5.5375552282768784e-05, "loss": 0.1073, "step": 2020 }, { "epoch": 2.605905006418485, "grad_norm": 0.1299864947795868, "learning_rate": 5.508100147275406e-05, "loss": 0.1067, "step": 2030 }, { "epoch": 2.6187419768934532, "grad_norm": 0.11628852039575577, "learning_rate": 5.478645066273933e-05, "loss": 0.1072, "step": 2040 }, { "epoch": 2.6315789473684212, "grad_norm": 0.12102477252483368, "learning_rate": 5.449189985272459e-05, "loss": 0.1133, "step": 2050 }, { "epoch": 2.644415917843389, "grad_norm": 0.08944839239120483, "learning_rate": 5.4197349042709865e-05, "loss": 0.1058, "step": 2060 }, { "epoch": 2.657252888318357, "grad_norm": 0.11976416409015656, "learning_rate": 5.390279823269514e-05, "loss": 0.115, "step": 2070 }, { "epoch": 2.670089858793325, "grad_norm": 0.11056056618690491, "learning_rate": 5.360824742268041e-05, "loss": 0.1107, "step": 2080 }, { "epoch": 2.682926829268293, "grad_norm": 0.12344369292259216, "learning_rate": 5.3313696612665687e-05, "loss": 0.1255, "step": 2090 }, { "epoch": 2.6957637997432604, "grad_norm": 0.10227925330400467, "learning_rate": 5.301914580265096e-05, "loss": 0.1179, "step": 2100 }, { "epoch": 2.7086007702182284, "grad_norm": 0.09863247722387314, "learning_rate": 5.2724594992636234e-05, "loss": 0.1034, "step": 2110 }, { "epoch": 2.7214377406931964, "grad_norm": 0.12329726666212082, "learning_rate": 5.243004418262151e-05, "loss": 0.109, "step": 2120 }, { "epoch": 2.7342747111681645, "grad_norm": 0.128681480884552, "learning_rate": 5.213549337260678e-05, "loss": 0.111, "step": 2130 }, { "epoch": 2.7471116816431325, "grad_norm": 0.1252344846725464, "learning_rate": 5.1840942562592055e-05, "loss": 
0.1052, "step": 2140 }, { "epoch": 2.7599486521181, "grad_norm": 0.11999202519655228, "learning_rate": 5.1546391752577315e-05, "loss": 0.1056, "step": 2150 }, { "epoch": 2.772785622593068, "grad_norm": 0.12448093295097351, "learning_rate": 5.125184094256259e-05, "loss": 0.1279, "step": 2160 }, { "epoch": 2.785622593068036, "grad_norm": 0.32093188166618347, "learning_rate": 5.095729013254786e-05, "loss": 0.1018, "step": 2170 }, { "epoch": 2.7984595635430036, "grad_norm": 0.10641879588365555, "learning_rate": 5.0662739322533137e-05, "loss": 0.11, "step": 2180 }, { "epoch": 2.8112965340179716, "grad_norm": 0.12216117233037949, "learning_rate": 5.036818851251841e-05, "loss": 0.1099, "step": 2190 }, { "epoch": 2.8241335044929397, "grad_norm": 0.1177031397819519, "learning_rate": 5.0073637702503684e-05, "loss": 0.1156, "step": 2200 }, { "epoch": 2.8369704749679077, "grad_norm": 0.21809132397174835, "learning_rate": 4.977908689248896e-05, "loss": 0.1033, "step": 2210 }, { "epoch": 2.8498074454428757, "grad_norm": 0.16358229517936707, "learning_rate": 4.948453608247423e-05, "loss": 0.1046, "step": 2220 }, { "epoch": 2.8626444159178432, "grad_norm": 0.20664657652378082, "learning_rate": 4.91899852724595e-05, "loss": 0.1111, "step": 2230 }, { "epoch": 2.8754813863928113, "grad_norm": 0.1042567566037178, "learning_rate": 4.889543446244477e-05, "loss": 0.1122, "step": 2240 }, { "epoch": 2.8883183568677793, "grad_norm": 0.12193156033754349, "learning_rate": 4.8600883652430046e-05, "loss": 0.1024, "step": 2250 }, { "epoch": 2.901155327342747, "grad_norm": 0.12314718961715698, "learning_rate": 4.830633284241532e-05, "loss": 0.1114, "step": 2260 }, { "epoch": 2.913992297817715, "grad_norm": 0.12970231473445892, "learning_rate": 4.801178203240059e-05, "loss": 0.1062, "step": 2270 }, { "epoch": 2.926829268292683, "grad_norm": 0.11207956075668335, "learning_rate": 4.771723122238587e-05, "loss": 0.1135, "step": 2280 }, { "epoch": 2.939666238767651, "grad_norm": 0.10265418887138367, 
"learning_rate": 4.7422680412371134e-05, "loss": 0.1095, "step": 2290 }, { "epoch": 2.952503209242619, "grad_norm": 0.17097508907318115, "learning_rate": 4.712812960235641e-05, "loss": 0.1063, "step": 2300 }, { "epoch": 2.965340179717587, "grad_norm": 0.1322011947631836, "learning_rate": 4.683357879234168e-05, "loss": 0.111, "step": 2310 }, { "epoch": 2.9781771501925545, "grad_norm": 0.10939087718725204, "learning_rate": 4.6539027982326955e-05, "loss": 0.1177, "step": 2320 }, { "epoch": 2.9910141206675225, "grad_norm": 0.18089720606803894, "learning_rate": 4.624447717231223e-05, "loss": 0.1042, "step": 2330 }, { "epoch": 3.0038510911424905, "grad_norm": 0.1323990374803543, "learning_rate": 4.5949926362297496e-05, "loss": 0.1098, "step": 2340 }, { "epoch": 3.016688061617458, "grad_norm": 0.15342199802398682, "learning_rate": 4.565537555228277e-05, "loss": 0.0985, "step": 2350 }, { "epoch": 3.029525032092426, "grad_norm": 0.11682497709989548, "learning_rate": 4.536082474226804e-05, "loss": 0.1109, "step": 2360 }, { "epoch": 3.042362002567394, "grad_norm": 0.17593421041965485, "learning_rate": 4.506627393225332e-05, "loss": 0.1053, "step": 2370 }, { "epoch": 3.055198973042362, "grad_norm": 0.13586147129535675, "learning_rate": 4.477172312223859e-05, "loss": 0.1029, "step": 2380 }, { "epoch": 3.06803594351733, "grad_norm": 0.14202643930912018, "learning_rate": 4.447717231222386e-05, "loss": 0.1087, "step": 2390 }, { "epoch": 3.0808729139922977, "grad_norm": 0.10535432398319244, "learning_rate": 4.418262150220913e-05, "loss": 0.1199, "step": 2400 }, { "epoch": 3.0937098844672657, "grad_norm": 0.13306142389774323, "learning_rate": 4.3888070692194405e-05, "loss": 0.1031, "step": 2410 }, { "epoch": 3.1065468549422337, "grad_norm": 0.14567367732524872, "learning_rate": 4.359351988217968e-05, "loss": 0.1091, "step": 2420 }, { "epoch": 3.1193838254172017, "grad_norm": 0.12033534795045853, "learning_rate": 4.329896907216495e-05, "loss": 0.1014, "step": 2430 }, { "epoch": 
3.1322207958921693, "grad_norm": 0.11646905541419983, "learning_rate": 4.3004418262150226e-05, "loss": 0.1059, "step": 2440 }, { "epoch": 3.1450577663671373, "grad_norm": 0.11194875836372375, "learning_rate": 4.270986745213549e-05, "loss": 0.1051, "step": 2450 }, { "epoch": 3.1578947368421053, "grad_norm": 0.09517650306224823, "learning_rate": 4.241531664212077e-05, "loss": 0.106, "step": 2460 }, { "epoch": 3.1707317073170733, "grad_norm": 0.10933737456798553, "learning_rate": 4.212076583210604e-05, "loss": 0.1038, "step": 2470 }, { "epoch": 3.183568677792041, "grad_norm": 0.11497558653354645, "learning_rate": 4.1826215022091314e-05, "loss": 0.121, "step": 2480 }, { "epoch": 3.196405648267009, "grad_norm": 0.10219521820545197, "learning_rate": 4.153166421207659e-05, "loss": 0.1137, "step": 2490 }, { "epoch": 3.209242618741977, "grad_norm": 0.13737627863883972, "learning_rate": 4.1237113402061855e-05, "loss": 0.1052, "step": 2500 }, { "epoch": 3.209242618741977, "eval_gen_len": 38.946, "eval_loss": 0.09647729247808456, "eval_rouge1": 0.5912, "eval_rouge2": 0.3082, "eval_rougeL": 0.5595, "eval_runtime": 47.0681, "eval_samples_per_second": 10.623, "eval_steps_per_second": 1.338, "step": 2500 }, { "epoch": 3.222079589216945, "grad_norm": 0.11823912709951401, "learning_rate": 4.094256259204713e-05, "loss": 0.1061, "step": 2510 }, { "epoch": 3.2349165596919125, "grad_norm": 0.1412975937128067, "learning_rate": 4.06480117820324e-05, "loss": 0.1071, "step": 2520 }, { "epoch": 3.2477535301668805, "grad_norm": 0.09195394068956375, "learning_rate": 4.0353460972017676e-05, "loss": 0.11, "step": 2530 }, { "epoch": 3.2605905006418485, "grad_norm": 0.11850762367248535, "learning_rate": 4.005891016200295e-05, "loss": 0.1033, "step": 2540 }, { "epoch": 3.2734274711168165, "grad_norm": 0.09788330644369125, "learning_rate": 3.976435935198822e-05, "loss": 0.1102, "step": 2550 }, { "epoch": 3.2862644415917845, "grad_norm": 0.3657642602920532, "learning_rate": 3.946980854197349e-05, 
"loss": 0.1062, "step": 2560 }, { "epoch": 3.299101412066752, "grad_norm": 0.1347455084323883, "learning_rate": 3.9175257731958764e-05, "loss": 0.1036, "step": 2570 }, { "epoch": 3.31193838254172, "grad_norm": 0.11970392614603043, "learning_rate": 3.888070692194404e-05, "loss": 0.1059, "step": 2580 }, { "epoch": 3.324775353016688, "grad_norm": 0.13223974406719208, "learning_rate": 3.858615611192931e-05, "loss": 0.1104, "step": 2590 }, { "epoch": 3.337612323491656, "grad_norm": 0.13846062123775482, "learning_rate": 3.8291605301914585e-05, "loss": 0.1051, "step": 2600 }, { "epoch": 3.3504492939666237, "grad_norm": 0.12654447555541992, "learning_rate": 3.799705449189985e-05, "loss": 0.1047, "step": 2610 }, { "epoch": 3.3632862644415917, "grad_norm": 0.18010291457176208, "learning_rate": 3.7702503681885126e-05, "loss": 0.1049, "step": 2620 }, { "epoch": 3.3761232349165597, "grad_norm": 0.09747201949357986, "learning_rate": 3.74079528718704e-05, "loss": 0.1117, "step": 2630 }, { "epoch": 3.3889602053915278, "grad_norm": 0.105229951441288, "learning_rate": 3.7113402061855674e-05, "loss": 0.1036, "step": 2640 }, { "epoch": 3.4017971758664953, "grad_norm": 0.11923747509717941, "learning_rate": 3.681885125184095e-05, "loss": 0.1161, "step": 2650 }, { "epoch": 3.4146341463414633, "grad_norm": 0.11879535019397736, "learning_rate": 3.6524300441826214e-05, "loss": 0.1048, "step": 2660 }, { "epoch": 3.4274711168164314, "grad_norm": 0.12212618440389633, "learning_rate": 3.622974963181149e-05, "loss": 0.1156, "step": 2670 }, { "epoch": 3.4403080872913994, "grad_norm": 0.11665867269039154, "learning_rate": 3.593519882179676e-05, "loss": 0.1062, "step": 2680 }, { "epoch": 3.453145057766367, "grad_norm": 0.162534698843956, "learning_rate": 3.5640648011782035e-05, "loss": 0.1104, "step": 2690 }, { "epoch": 3.465982028241335, "grad_norm": 0.1339045912027359, "learning_rate": 3.534609720176731e-05, "loss": 0.1218, "step": 2700 }, { "epoch": 3.478818998716303, "grad_norm": 
0.13339383900165558, "learning_rate": 3.5051546391752576e-05, "loss": 0.1099, "step": 2710 }, { "epoch": 3.491655969191271, "grad_norm": 0.22776636481285095, "learning_rate": 3.475699558173785e-05, "loss": 0.1053, "step": 2720 }, { "epoch": 3.504492939666239, "grad_norm": 0.10782419145107269, "learning_rate": 3.4462444771723124e-05, "loss": 0.1109, "step": 2730 }, { "epoch": 3.5173299101412066, "grad_norm": 0.17186006903648376, "learning_rate": 3.41678939617084e-05, "loss": 0.1116, "step": 2740 }, { "epoch": 3.5301668806161746, "grad_norm": 0.11577029526233673, "learning_rate": 3.387334315169367e-05, "loss": 0.1112, "step": 2750 }, { "epoch": 3.5430038510911426, "grad_norm": 0.12498428672552109, "learning_rate": 3.357879234167894e-05, "loss": 0.1043, "step": 2760 }, { "epoch": 3.55584082156611, "grad_norm": 0.11685289442539215, "learning_rate": 3.328424153166421e-05, "loss": 0.1029, "step": 2770 }, { "epoch": 3.568677792041078, "grad_norm": 0.10624662786722183, "learning_rate": 3.2989690721649485e-05, "loss": 0.1104, "step": 2780 }, { "epoch": 3.581514762516046, "grad_norm": 0.3390846252441406, "learning_rate": 3.269513991163476e-05, "loss": 0.1019, "step": 2790 }, { "epoch": 3.594351732991014, "grad_norm": 0.10731125622987747, "learning_rate": 3.240058910162003e-05, "loss": 0.103, "step": 2800 }, { "epoch": 3.607188703465982, "grad_norm": 0.1548955887556076, "learning_rate": 3.2106038291605307e-05, "loss": 0.1069, "step": 2810 }, { "epoch": 3.62002567394095, "grad_norm": 0.11366964876651764, "learning_rate": 3.1811487481590573e-05, "loss": 0.1119, "step": 2820 }, { "epoch": 3.6328626444159178, "grad_norm": 0.13698551058769226, "learning_rate": 3.151693667157585e-05, "loss": 0.1035, "step": 2830 }, { "epoch": 3.645699614890886, "grad_norm": 0.20564177632331848, "learning_rate": 3.122238586156112e-05, "loss": 0.1073, "step": 2840 }, { "epoch": 3.658536585365854, "grad_norm": 0.13262344896793365, "learning_rate": 3.0927835051546395e-05, "loss": 0.1037, "step": 2850 
}, { "epoch": 3.6713735558408214, "grad_norm": 0.10157434642314911, "learning_rate": 3.063328424153167e-05, "loss": 0.0991, "step": 2860 }, { "epoch": 3.6842105263157894, "grad_norm": 0.14752894639968872, "learning_rate": 3.0338733431516935e-05, "loss": 0.1154, "step": 2870 }, { "epoch": 3.6970474967907574, "grad_norm": 0.1402595192193985, "learning_rate": 3.004418262150221e-05, "loss": 0.1068, "step": 2880 }, { "epoch": 3.7098844672657254, "grad_norm": 0.1572103500366211, "learning_rate": 2.9749631811487483e-05, "loss": 0.1017, "step": 2890 }, { "epoch": 3.7227214377406934, "grad_norm": 0.10825837403535843, "learning_rate": 2.9455081001472756e-05, "loss": 0.108, "step": 2900 }, { "epoch": 3.735558408215661, "grad_norm": 0.09703366458415985, "learning_rate": 2.916053019145803e-05, "loss": 0.103, "step": 2910 }, { "epoch": 3.748395378690629, "grad_norm": 0.11436077952384949, "learning_rate": 2.8865979381443297e-05, "loss": 0.1163, "step": 2920 }, { "epoch": 3.761232349165597, "grad_norm": 0.13263103365898132, "learning_rate": 2.857142857142857e-05, "loss": 0.1078, "step": 2930 }, { "epoch": 3.7740693196405646, "grad_norm": 0.12120107561349869, "learning_rate": 2.8276877761413845e-05, "loss": 0.1061, "step": 2940 }, { "epoch": 3.7869062901155326, "grad_norm": 0.10355032235383987, "learning_rate": 2.798232695139912e-05, "loss": 0.1084, "step": 2950 }, { "epoch": 3.7997432605905006, "grad_norm": 0.12106519192457199, "learning_rate": 2.7687776141384392e-05, "loss": 0.099, "step": 2960 }, { "epoch": 3.8125802310654686, "grad_norm": 0.11807779967784882, "learning_rate": 2.7393225331369666e-05, "loss": 0.1076, "step": 2970 }, { "epoch": 3.8254172015404366, "grad_norm": 0.22053521871566772, "learning_rate": 2.7098674521354933e-05, "loss": 0.0998, "step": 2980 }, { "epoch": 3.8382541720154046, "grad_norm": 0.12982416152954102, "learning_rate": 2.6804123711340206e-05, "loss": 0.1112, "step": 2990 }, { "epoch": 3.851091142490372, "grad_norm": 0.10256698727607727, 
"learning_rate": 2.650957290132548e-05, "loss": 0.1081, "step": 3000 }, { "epoch": 3.851091142490372, "eval_gen_len": 38.946, "eval_loss": 0.09579482674598694, "eval_rouge1": 0.592, "eval_rouge2": 0.3111, "eval_rougeL": 0.5586, "eval_runtime": 46.2091, "eval_samples_per_second": 10.82, "eval_steps_per_second": 1.363, "step": 3000 }, { "epoch": 3.8639281129653402, "grad_norm": 0.09973964840173721, "learning_rate": 2.6215022091310754e-05, "loss": 0.1113, "step": 3010 }, { "epoch": 3.8767650834403082, "grad_norm": 0.1375608742237091, "learning_rate": 2.5920471281296028e-05, "loss": 0.0979, "step": 3020 }, { "epoch": 3.889602053915276, "grad_norm": 0.129994735121727, "learning_rate": 2.5625920471281295e-05, "loss": 0.1094, "step": 3030 }, { "epoch": 3.902439024390244, "grad_norm": 0.11172019690275192, "learning_rate": 2.5331369661266568e-05, "loss": 0.1061, "step": 3040 }, { "epoch": 3.915275994865212, "grad_norm": 0.13806381821632385, "learning_rate": 2.5036818851251842e-05, "loss": 0.1001, "step": 3050 }, { "epoch": 3.92811296534018, "grad_norm": 0.10557816922664642, "learning_rate": 2.4742268041237116e-05, "loss": 0.1035, "step": 3060 }, { "epoch": 3.940949935815148, "grad_norm": 0.13717150688171387, "learning_rate": 2.4447717231222386e-05, "loss": 0.1048, "step": 3070 }, { "epoch": 3.9537869062901154, "grad_norm": 0.31350722908973694, "learning_rate": 2.415316642120766e-05, "loss": 0.1171, "step": 3080 }, { "epoch": 3.9666238767650834, "grad_norm": 0.12666529417037964, "learning_rate": 2.3858615611192933e-05, "loss": 0.1111, "step": 3090 }, { "epoch": 3.9794608472400514, "grad_norm": 0.1276416778564453, "learning_rate": 2.3564064801178204e-05, "loss": 0.1151, "step": 3100 }, { "epoch": 3.992297817715019, "grad_norm": 0.11248279362916946, "learning_rate": 2.3269513991163478e-05, "loss": 0.1013, "step": 3110 }, { "epoch": 4.005134788189987, "grad_norm": 0.14889618754386902, "learning_rate": 2.2974963181148748e-05, "loss": 0.1072, "step": 3120 }, { "epoch": 
4.017971758664955, "grad_norm": 0.10598665475845337, "learning_rate": 2.268041237113402e-05, "loss": 0.1017, "step": 3130 }, { "epoch": 4.030808729139923, "grad_norm": 0.11106213927268982, "learning_rate": 2.2385861561119295e-05, "loss": 0.1028, "step": 3140 }, { "epoch": 4.043645699614891, "grad_norm": 0.14373856782913208, "learning_rate": 2.2091310751104566e-05, "loss": 0.1086, "step": 3150 }, { "epoch": 4.056482670089859, "grad_norm": 0.13207481801509857, "learning_rate": 2.179675994108984e-05, "loss": 0.0997, "step": 3160 }, { "epoch": 4.069319640564827, "grad_norm": 0.11155064404010773, "learning_rate": 2.1502209131075113e-05, "loss": 0.1083, "step": 3170 }, { "epoch": 4.082156611039794, "grad_norm": 0.10416271537542343, "learning_rate": 2.1207658321060383e-05, "loss": 0.1164, "step": 3180 }, { "epoch": 4.094993581514762, "grad_norm": 0.11257210373878479, "learning_rate": 2.0913107511045657e-05, "loss": 0.1121, "step": 3190 }, { "epoch": 4.10783055198973, "grad_norm": 0.2550837993621826, "learning_rate": 2.0618556701030927e-05, "loss": 0.1112, "step": 3200 }, { "epoch": 4.120667522464698, "grad_norm": 0.1105370745062828, "learning_rate": 2.03240058910162e-05, "loss": 0.1064, "step": 3210 }, { "epoch": 4.133504492939666, "grad_norm": 0.11678767949342728, "learning_rate": 2.0029455081001475e-05, "loss": 0.1051, "step": 3220 }, { "epoch": 4.146341463414634, "grad_norm": 0.09064505994319916, "learning_rate": 1.9734904270986745e-05, "loss": 0.1062, "step": 3230 }, { "epoch": 4.159178433889602, "grad_norm": 0.11429174244403839, "learning_rate": 1.944035346097202e-05, "loss": 0.1082, "step": 3240 }, { "epoch": 4.17201540436457, "grad_norm": 0.11602449417114258, "learning_rate": 1.9145802650957293e-05, "loss": 0.1008, "step": 3250 }, { "epoch": 4.184852374839538, "grad_norm": 0.09767124801874161, "learning_rate": 1.8851251840942563e-05, "loss": 0.1052, "step": 3260 }, { "epoch": 4.197689345314505, "grad_norm": 0.08755617588758469, "learning_rate": 
1.8556701030927837e-05, "loss": 0.1021, "step": 3270 }, { "epoch": 4.2105263157894735, "grad_norm": 0.08597078919410706, "learning_rate": 1.8262150220913107e-05, "loss": 0.1034, "step": 3280 }, { "epoch": 4.2233632862644415, "grad_norm": 0.12039302289485931, "learning_rate": 1.796759941089838e-05, "loss": 0.1171, "step": 3290 }, { "epoch": 4.2362002567394095, "grad_norm": 0.11615891754627228, "learning_rate": 1.7673048600883655e-05, "loss": 0.1044, "step": 3300 }, { "epoch": 4.2490372272143775, "grad_norm": 0.1104285717010498, "learning_rate": 1.7378497790868925e-05, "loss": 0.1025, "step": 3310 }, { "epoch": 4.2618741976893455, "grad_norm": 0.10667522996664047, "learning_rate": 1.70839469808542e-05, "loss": 0.099, "step": 3320 }, { "epoch": 4.2747111681643135, "grad_norm": 0.11607446521520615, "learning_rate": 1.678939617083947e-05, "loss": 0.1085, "step": 3330 }, { "epoch": 4.2875481386392815, "grad_norm": 0.09650270640850067, "learning_rate": 1.6494845360824743e-05, "loss": 0.0944, "step": 3340 }, { "epoch": 4.300385109114249, "grad_norm": 0.11899662762880325, "learning_rate": 1.6200294550810016e-05, "loss": 0.1002, "step": 3350 }, { "epoch": 4.313222079589217, "grad_norm": 0.12044407427310944, "learning_rate": 1.5905743740795287e-05, "loss": 0.1083, "step": 3360 }, { "epoch": 4.326059050064185, "grad_norm": 0.09684235602617264, "learning_rate": 1.561119293078056e-05, "loss": 0.1012, "step": 3370 }, { "epoch": 4.338896020539153, "grad_norm": 0.12105019390583038, "learning_rate": 1.5316642120765834e-05, "loss": 0.1034, "step": 3380 }, { "epoch": 4.351732991014121, "grad_norm": 0.14494775235652924, "learning_rate": 1.5022091310751105e-05, "loss": 0.0926, "step": 3390 }, { "epoch": 4.364569961489089, "grad_norm": 0.1320798546075821, "learning_rate": 1.4727540500736378e-05, "loss": 0.1137, "step": 3400 }, { "epoch": 4.377406931964057, "grad_norm": 0.1228160411119461, "learning_rate": 1.4432989690721649e-05, "loss": 0.1071, "step": 3410 }, { "epoch": 
4.390243902439025, "grad_norm": 0.12092225253582001, "learning_rate": 1.4138438880706922e-05, "loss": 0.1003, "step": 3420 }, { "epoch": 4.403080872913993, "grad_norm": 0.10531840473413467, "learning_rate": 1.3843888070692196e-05, "loss": 0.1083, "step": 3430 }, { "epoch": 4.41591784338896, "grad_norm": 0.14029191434383392, "learning_rate": 1.3549337260677466e-05, "loss": 0.1074, "step": 3440 }, { "epoch": 4.428754813863928, "grad_norm": 0.10859360545873642, "learning_rate": 1.325478645066274e-05, "loss": 0.1004, "step": 3450 }, { "epoch": 4.441591784338896, "grad_norm": 0.10318469256162643, "learning_rate": 1.2960235640648014e-05, "loss": 0.104, "step": 3460 }, { "epoch": 4.454428754813864, "grad_norm": 0.12979723513126373, "learning_rate": 1.2665684830633284e-05, "loss": 0.0968, "step": 3470 }, { "epoch": 4.467265725288832, "grad_norm": 0.14862096309661865, "learning_rate": 1.2371134020618558e-05, "loss": 0.1124, "step": 3480 }, { "epoch": 4.4801026957638, "grad_norm": 0.1266416311264038, "learning_rate": 1.207658321060383e-05, "loss": 0.1135, "step": 3490 }, { "epoch": 4.492939666238768, "grad_norm": 0.12597477436065674, "learning_rate": 1.1782032400589102e-05, "loss": 0.1071, "step": 3500 }, { "epoch": 4.492939666238768, "eval_gen_len": 38.946, "eval_loss": 0.09506849944591522, "eval_rouge1": 0.5959, "eval_rouge2": 0.3135, "eval_rougeL": 0.5625, "eval_runtime": 45.8514, "eval_samples_per_second": 10.905, "eval_steps_per_second": 1.374, "step": 3500 }, { "epoch": 4.505776636713735, "grad_norm": 0.11168000102043152, "learning_rate": 1.1487481590574374e-05, "loss": 0.107, "step": 3510 }, { "epoch": 4.518613607188703, "grad_norm": 0.1055913120508194, "learning_rate": 1.1192930780559648e-05, "loss": 0.1001, "step": 3520 }, { "epoch": 4.531450577663671, "grad_norm": 0.10443459451198578, "learning_rate": 1.089837997054492e-05, "loss": 0.1117, "step": 3530 }, { "epoch": 4.544287548138639, "grad_norm": 0.1707678884267807, "learning_rate": 1.0603829160530192e-05, "loss": 
0.095, "step": 3540 }, { "epoch": 4.557124518613607, "grad_norm": 0.10272827744483948, "learning_rate": 1.0309278350515464e-05, "loss": 0.1027, "step": 3550 }, { "epoch": 4.569961489088575, "grad_norm": 0.14477787911891937, "learning_rate": 1.0014727540500737e-05, "loss": 0.114, "step": 3560 }, { "epoch": 4.582798459563543, "grad_norm": 0.12023382633924484, "learning_rate": 9.72017673048601e-06, "loss": 0.111, "step": 3570 }, { "epoch": 4.595635430038511, "grad_norm": 0.11823578923940659, "learning_rate": 9.425625920471282e-06, "loss": 0.1149, "step": 3580 }, { "epoch": 4.608472400513479, "grad_norm": 0.13746647536754608, "learning_rate": 9.131075110456554e-06, "loss": 0.1068, "step": 3590 }, { "epoch": 4.621309370988447, "grad_norm": 0.11988291144371033, "learning_rate": 8.836524300441827e-06, "loss": 0.0955, "step": 3600 }, { "epoch": 4.634146341463414, "grad_norm": 0.09879586100578308, "learning_rate": 8.5419734904271e-06, "loss": 0.1031, "step": 3610 }, { "epoch": 4.646983311938382, "grad_norm": 0.1175203025341034, "learning_rate": 8.247422680412371e-06, "loss": 0.1135, "step": 3620 }, { "epoch": 4.65982028241335, "grad_norm": 0.10659980028867722, "learning_rate": 7.952871870397643e-06, "loss": 0.1005, "step": 3630 }, { "epoch": 4.672657252888318, "grad_norm": 0.12701040506362915, "learning_rate": 7.658321060382917e-06, "loss": 0.1088, "step": 3640 }, { "epoch": 4.685494223363286, "grad_norm": 0.10160648077726364, "learning_rate": 7.363770250368189e-06, "loss": 0.1076, "step": 3650 }, { "epoch": 4.698331193838254, "grad_norm": 0.11782427132129669, "learning_rate": 7.069219440353461e-06, "loss": 0.1054, "step": 3660 }, { "epoch": 4.711168164313222, "grad_norm": 0.11307085305452347, "learning_rate": 6.774668630338733e-06, "loss": 0.1093, "step": 3670 }, { "epoch": 4.7240051347881895, "grad_norm": 0.13703173398971558, "learning_rate": 6.480117820324007e-06, "loss": 0.1043, "step": 3680 }, { "epoch": 4.7368421052631575, "grad_norm": 0.11539970338344574, 
"learning_rate": 6.185567010309279e-06, "loss": 0.1016, "step": 3690 }, { "epoch": 4.7496790757381255, "grad_norm": 0.10878204554319382, "learning_rate": 5.891016200294551e-06, "loss": 0.1065, "step": 3700 }, { "epoch": 4.7625160462130935, "grad_norm": 0.11433741450309753, "learning_rate": 5.596465390279824e-06, "loss": 0.1097, "step": 3710 }, { "epoch": 4.775353016688062, "grad_norm": 0.12714996933937073, "learning_rate": 5.301914580265096e-06, "loss": 0.1102, "step": 3720 }, { "epoch": 4.78818998716303, "grad_norm": 0.1310754418373108, "learning_rate": 5.007363770250369e-06, "loss": 0.1029, "step": 3730 }, { "epoch": 4.801026957637998, "grad_norm": 0.10799030214548111, "learning_rate": 4.712812960235641e-06, "loss": 0.1072, "step": 3740 }, { "epoch": 4.813863928112966, "grad_norm": 0.12069649994373322, "learning_rate": 4.418262150220914e-06, "loss": 0.1092, "step": 3750 }, { "epoch": 4.826700898587934, "grad_norm": 0.09965723752975464, "learning_rate": 4.123711340206186e-06, "loss": 0.0987, "step": 3760 }, { "epoch": 4.839537869062902, "grad_norm": 0.1061498150229454, "learning_rate": 3.8291605301914585e-06, "loss": 0.1026, "step": 3770 }, { "epoch": 4.852374839537869, "grad_norm": 0.10951300710439682, "learning_rate": 3.5346097201767306e-06, "loss": 0.101, "step": 3780 }, { "epoch": 4.865211810012837, "grad_norm": 0.11674579232931137, "learning_rate": 3.2400589101620034e-06, "loss": 0.1026, "step": 3790 }, { "epoch": 4.878048780487805, "grad_norm": 0.12058935314416885, "learning_rate": 2.9455081001472755e-06, "loss": 0.1021, "step": 3800 }, { "epoch": 4.890885750962773, "grad_norm": 0.11995814740657806, "learning_rate": 2.650957290132548e-06, "loss": 0.1066, "step": 3810 }, { "epoch": 4.903722721437741, "grad_norm": 0.14141815900802612, "learning_rate": 2.3564064801178204e-06, "loss": 0.1064, "step": 3820 }, { "epoch": 4.916559691912709, "grad_norm": 0.10517096519470215, "learning_rate": 2.061855670103093e-06, "loss": 0.1008, "step": 3830 }, { "epoch": 
4.929396662387677, "grad_norm": 0.10266517102718353, "learning_rate": 1.7673048600883653e-06, "loss": 0.1037, "step": 3840 }, { "epoch": 4.942233632862644, "grad_norm": 0.114813894033432, "learning_rate": 1.4727540500736377e-06, "loss": 0.1026, "step": 3850 }, { "epoch": 4.955070603337612, "grad_norm": 0.09497864544391632, "learning_rate": 1.1782032400589102e-06, "loss": 0.112, "step": 3860 }, { "epoch": 4.96790757381258, "grad_norm": 0.11201301217079163, "learning_rate": 8.836524300441826e-07, "loss": 0.1083, "step": 3870 }, { "epoch": 4.980744544287548, "grad_norm": 0.12003788352012634, "learning_rate": 5.891016200294551e-07, "loss": 0.1117, "step": 3880 }, { "epoch": 4.993581514762516, "grad_norm": 0.1579122096300125, "learning_rate": 2.9455081001472755e-07, "loss": 0.1106, "step": 3890 } ], "logging_steps": 10, "max_steps": 3895, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8431794231705600.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }