diff --git "a/checkpoint-46942/trainer_state.json" "b/checkpoint-46942/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-46942/trainer_state.json" @@ -0,0 +1,13363 @@ +{ + "best_metric": 0.06252285093069077, + "best_model_checkpoint": "autotrain-yz7wm-laa5q/checkpoint-46942", + "epoch": 14.0, + "eval_steps": 500, + "global_step": 46942, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007456009543692216, + "grad_norm": 45.01611328125, + "learning_rate": 1.252534892043421e-07, + "loss": 4.1595, + "step": 25 + }, + { + "epoch": 0.014912019087384432, + "grad_norm": 71.49336242675781, + "learning_rate": 2.74364785876178e-07, + "loss": 2.8452, + "step": 50 + }, + { + "epoch": 0.022368028631076647, + "grad_norm": 73.92076110839844, + "learning_rate": 4.234760825480138e-07, + "loss": 3.1876, + "step": 75 + }, + { + "epoch": 0.029824038174768863, + "grad_norm": 35.54814529418945, + "learning_rate": 5.725873792198497e-07, + "loss": 3.1838, + "step": 100 + }, + { + "epoch": 0.03728004771846108, + "grad_norm": 45.263160705566406, + "learning_rate": 7.157342240248122e-07, + "loss": 3.8779, + "step": 125 + }, + { + "epoch": 0.044736057262153295, + "grad_norm": 57.34968566894531, + "learning_rate": 8.64845520696648e-07, + "loss": 3.3267, + "step": 150 + }, + { + "epoch": 0.05219206680584551, + "grad_norm": 54.16720199584961, + "learning_rate": 1.013956817368484e-06, + "loss": 2.8099, + "step": 175 + }, + { + "epoch": 0.059648076349537726, + "grad_norm": 66.92790222167969, + "learning_rate": 1.1571036621734464e-06, + "loss": 3.0295, + "step": 200 + }, + { + "epoch": 0.06710408589322994, + "grad_norm": 49.223506927490234, + "learning_rate": 1.3062149588452823e-06, + "loss": 3.5417, + "step": 225 + }, + { + "epoch": 0.07456009543692216, + "grad_norm": 28.979656219482422, + "learning_rate": 1.455326255517118e-06, + "loss": 2.8779, + "step": 250 + }, + { + "epoch": 0.08201610498061437, + "grad_norm": 58.497398376464844, + "learning_rate": 1.6044375521889539e-06, + "loss": 3.8402, + "step": 275 + }, + { + "epoch": 0.08947211452430659, + "grad_norm": 39.97136306762695, + "learning_rate": 1.7535488488607896e-06, + "loss": 2.8882, + "step": 300 + }, + { + "epoch": 0.0969281240679988, + "grad_norm": 59.521690368652344, + "learning_rate": 1.8966956936657524e-06, + "loss": 3.3457, + "step": 325 + }, + { + "epoch": 0.10438413361169102, + "grad_norm": 46.65399932861328, + "learning_rate": 2.045806990337588e-06, + "loss": 3.2512, + "step": 350 + }, + { + "epoch": 0.11184014315538324, + "grad_norm": 72.7174072265625, + "learning_rate": 2.194918287009424e-06, + "loss": 3.4201, + "step": 375 + }, + { + "epoch": 0.11929615269907545, + "grad_norm": 42.24202346801758, + "learning_rate": 2.34402958368126e-06, + "loss": 2.9563, + "step": 400 + }, + { + "epoch": 0.12675216224276767, + "grad_norm": 41.53296661376953, + "learning_rate": 2.493140880353096e-06, + "loss": 3.2227, + "step": 425 + }, + { + "epoch": 0.13420817178645988, + "grad_norm": 39.92971420288086, + "learning_rate": 2.6422521770249313e-06, + "loss": 2.9322, + "step": 450 + }, + { + "epoch": 0.1416641813301521, + "grad_norm": 56.5375862121582, + "learning_rate": 2.7913634736967676e-06, + "loss": 3.1309, + "step": 475 + }, + { + "epoch": 0.14912019087384432, + "grad_norm": 50.100341796875, + "learning_rate": 2.940474770368603e-06, + "loss": 3.28, + "step": 500 + }, + { + "epoch": 0.15657620041753653, + "grad_norm": 45.758445739746094, + "learning_rate": 
3.089586067040439e-06, + "loss": 2.9504, + "step": 525 + }, + { + "epoch": 0.16403220996122875, + "grad_norm": 57.31959915161133, + "learning_rate": 3.238697363712275e-06, + "loss": 3.2988, + "step": 550 + }, + { + "epoch": 0.17148821950492096, + "grad_norm": 79.53318786621094, + "learning_rate": 3.3878086603841104e-06, + "loss": 2.6523, + "step": 575 + }, + { + "epoch": 0.17894422904861318, + "grad_norm": 60.8985481262207, + "learning_rate": 3.5369199570559468e-06, + "loss": 2.7978, + "step": 600 + }, + { + "epoch": 0.1864002385923054, + "grad_norm": 40.064605712890625, + "learning_rate": 3.6860312537277827e-06, + "loss": 2.6045, + "step": 625 + }, + { + "epoch": 0.1938562481359976, + "grad_norm": 40.542083740234375, + "learning_rate": 3.835142550399618e-06, + "loss": 2.7585, + "step": 650 + }, + { + "epoch": 0.20131225767968983, + "grad_norm": 18.070505142211914, + "learning_rate": 3.984253847071454e-06, + "loss": 2.274, + "step": 675 + }, + { + "epoch": 0.20876826722338204, + "grad_norm": 41.26124572753906, + "learning_rate": 4.1333651437432904e-06, + "loss": 2.6105, + "step": 700 + }, + { + "epoch": 0.21622427676707426, + "grad_norm": 49.613983154296875, + "learning_rate": 4.282476440415126e-06, + "loss": 2.2197, + "step": 725 + }, + { + "epoch": 0.22368028631076647, + "grad_norm": 31.069345474243164, + "learning_rate": 4.431587737086961e-06, + "loss": 2.4944, + "step": 750 + }, + { + "epoch": 0.2311362958544587, + "grad_norm": 47.66035461425781, + "learning_rate": 4.580699033758798e-06, + "loss": 2.5574, + "step": 775 + }, + { + "epoch": 0.2385923053981509, + "grad_norm": 46.78519058227539, + "learning_rate": 4.729810330430633e-06, + "loss": 2.6472, + "step": 800 + }, + { + "epoch": 0.24604831494184312, + "grad_norm": 45.38592529296875, + "learning_rate": 4.87892162710247e-06, + "loss": 2.8814, + "step": 825 + }, + { + "epoch": 0.25350432448553534, + "grad_norm": 40.204872131347656, + "learning_rate": 5.028032923774306e-06, + "loss": 2.4151, + "step": 850 + }, + { + "epoch": 0.2609603340292276, + "grad_norm": 18.755455017089844, + "learning_rate": 5.1771442204461406e-06, + "loss": 2.8743, + "step": 875 + }, + { + "epoch": 0.26841634357291977, + "grad_norm": 45.9006233215332, + "learning_rate": 5.326255517117977e-06, + "loss": 2.646, + "step": 900 + }, + { + "epoch": 0.275872353116612, + "grad_norm": 41.10917282104492, + "learning_rate": 5.475366813789813e-06, + "loss": 2.3558, + "step": 925 + }, + { + "epoch": 0.2833283626603042, + "grad_norm": 17.992292404174805, + "learning_rate": 5.624478110461649e-06, + "loss": 2.4183, + "step": 950 + }, + { + "epoch": 0.29078437220399644, + "grad_norm": 40.263668060302734, + "learning_rate": 5.773589407133484e-06, + "loss": 2.4558, + "step": 975 + }, + { + "epoch": 0.29824038174768863, + "grad_norm": 45.932186126708984, + "learning_rate": 5.9227007038053206e-06, + "loss": 2.1838, + "step": 1000 + }, + { + "epoch": 0.3056963912913809, + "grad_norm": 80.81851196289062, + "learning_rate": 6.071812000477156e-06, + "loss": 2.2045, + "step": 1025 + }, + { + "epoch": 0.31315240083507306, + "grad_norm": 5.3733954429626465, + "learning_rate": 6.220923297148992e-06, + "loss": 2.0902, + "step": 1050 + }, + { + "epoch": 0.3206084103787653, + "grad_norm": 44.87431716918945, + "learning_rate": 6.370034593820829e-06, + "loss": 2.373, + "step": 1075 + }, + { + "epoch": 0.3280644199224575, + "grad_norm": 42.271202087402344, + "learning_rate": 6.519145890492664e-06, + "loss": 2.5756, + "step": 1100 + }, + { + "epoch": 0.33552042946614974, + "grad_norm": 
67.8478775024414, + "learning_rate": 6.6682571871645006e-06, + "loss": 2.4065, + "step": 1125 + }, + { + "epoch": 0.3429764390098419, + "grad_norm": 49.30979919433594, + "learning_rate": 6.817368483836335e-06, + "loss": 2.4142, + "step": 1150 + }, + { + "epoch": 0.35043244855353417, + "grad_norm": 43.35612869262695, + "learning_rate": 6.9664797805081715e-06, + "loss": 2.4022, + "step": 1175 + }, + { + "epoch": 0.35788845809722636, + "grad_norm": 61.253299713134766, + "learning_rate": 7.115591077180007e-06, + "loss": 2.6125, + "step": 1200 + }, + { + "epoch": 0.3653444676409186, + "grad_norm": 23.769813537597656, + "learning_rate": 7.264702373851843e-06, + "loss": 2.5133, + "step": 1225 + }, + { + "epoch": 0.3728004771846108, + "grad_norm": 27.436845779418945, + "learning_rate": 7.41381367052368e-06, + "loss": 1.711, + "step": 1250 + }, + { + "epoch": 0.38025648672830303, + "grad_norm": 12.745654106140137, + "learning_rate": 7.562924967195516e-06, + "loss": 2.2353, + "step": 1275 + }, + { + "epoch": 0.3877124962719952, + "grad_norm": 53.59686279296875, + "learning_rate": 7.71203626386735e-06, + "loss": 2.3294, + "step": 1300 + }, + { + "epoch": 0.39516850581568747, + "grad_norm": 29.58496856689453, + "learning_rate": 7.861147560539186e-06, + "loss": 2.5862, + "step": 1325 + }, + { + "epoch": 0.40262451535937965, + "grad_norm": 50.43803405761719, + "learning_rate": 8.010258857211023e-06, + "loss": 2.1057, + "step": 1350 + }, + { + "epoch": 0.4100805249030719, + "grad_norm": 41.37960433959961, + "learning_rate": 8.159370153882859e-06, + "loss": 2.0668, + "step": 1375 + }, + { + "epoch": 0.4175365344467641, + "grad_norm": 40.834102630615234, + "learning_rate": 8.308481450554695e-06, + "loss": 2.4631, + "step": 1400 + }, + { + "epoch": 0.42499254399045633, + "grad_norm": 21.868328094482422, + "learning_rate": 8.457592747226532e-06, + "loss": 2.3086, + "step": 1425 + }, + { + "epoch": 0.4324485535341485, + "grad_norm": 27.166046142578125, + "learning_rate": 8.606704043898366e-06, + "loss": 2.2048, + "step": 1450 + }, + { + "epoch": 0.43990456307784076, + "grad_norm": 35.7780876159668, + "learning_rate": 8.755815340570203e-06, + "loss": 2.2996, + "step": 1475 + }, + { + "epoch": 0.44736057262153295, + "grad_norm": 23.72397804260254, + "learning_rate": 8.904926637242037e-06, + "loss": 2.2948, + "step": 1500 + }, + { + "epoch": 0.4548165821652252, + "grad_norm": 31.295326232910156, + "learning_rate": 9.054037933913873e-06, + "loss": 1.6803, + "step": 1525 + }, + { + "epoch": 0.4622725917089174, + "grad_norm": 32.05657196044922, + "learning_rate": 9.20314923058571e-06, + "loss": 2.1551, + "step": 1550 + }, + { + "epoch": 0.4697286012526096, + "grad_norm": 12.753507614135742, + "learning_rate": 9.352260527257546e-06, + "loss": 2.177, + "step": 1575 + }, + { + "epoch": 0.4771846107963018, + "grad_norm": 18.606224060058594, + "learning_rate": 9.50137182392938e-06, + "loss": 1.781, + "step": 1600 + }, + { + "epoch": 0.48464062033999405, + "grad_norm": 14.352407455444336, + "learning_rate": 9.650483120601217e-06, + "loss": 1.8743, + "step": 1625 + }, + { + "epoch": 0.49209662988368624, + "grad_norm": 41.37252426147461, + "learning_rate": 9.799594417273053e-06, + "loss": 1.8291, + "step": 1650 + }, + { + "epoch": 0.4995526394273785, + "grad_norm": 34.19483184814453, + "learning_rate": 9.94870571394489e-06, + "loss": 1.7963, + "step": 1675 + }, + { + "epoch": 0.5070086489710707, + "grad_norm": 32.18720626831055, + "learning_rate": 1.0097817010616724e-05, + "loss": 1.4178, + "step": 1700 + }, + { + "epoch": 
0.5144646585147629, + "grad_norm": 17.12651824951172, + "learning_rate": 1.024692830728856e-05, + "loss": 1.6896, + "step": 1725 + }, + { + "epoch": 0.5219206680584552, + "grad_norm": 32.80767059326172, + "learning_rate": 1.0396039603960395e-05, + "loss": 1.6759, + "step": 1750 + }, + { + "epoch": 0.5293766776021473, + "grad_norm": 62.32932662963867, + "learning_rate": 1.0545150900632232e-05, + "loss": 1.7309, + "step": 1775 + }, + { + "epoch": 0.5368326871458395, + "grad_norm": 55.90302276611328, + "learning_rate": 1.0694262197304068e-05, + "loss": 1.8298, + "step": 1800 + }, + { + "epoch": 0.5442886966895317, + "grad_norm": 22.71988868713379, + "learning_rate": 1.0843373493975904e-05, + "loss": 1.6003, + "step": 1825 + }, + { + "epoch": 0.551744706233224, + "grad_norm": 18.055763244628906, + "learning_rate": 1.099248479064774e-05, + "loss": 1.5038, + "step": 1850 + }, + { + "epoch": 0.5592007157769162, + "grad_norm": 36.303192138671875, + "learning_rate": 1.1141596087319577e-05, + "loss": 1.5591, + "step": 1875 + }, + { + "epoch": 0.5666567253206084, + "grad_norm": 4.943870544433594, + "learning_rate": 1.1290707383991412e-05, + "loss": 1.7191, + "step": 1900 + }, + { + "epoch": 0.5741127348643006, + "grad_norm": 34.8875846862793, + "learning_rate": 1.1439818680663248e-05, + "loss": 1.4872, + "step": 1925 + }, + { + "epoch": 0.5815687444079929, + "grad_norm": 104.60694885253906, + "learning_rate": 1.1588929977335083e-05, + "loss": 2.0672, + "step": 1950 + }, + { + "epoch": 0.5890247539516851, + "grad_norm": 8.631030082702637, + "learning_rate": 1.1738041274006919e-05, + "loss": 1.5613, + "step": 1975 + }, + { + "epoch": 0.5964807634953773, + "grad_norm": 25.058610916137695, + "learning_rate": 1.1887152570678755e-05, + "loss": 1.2022, + "step": 2000 + }, + { + "epoch": 0.6039367730390695, + "grad_norm": 22.961284637451172, + "learning_rate": 1.2036263867350592e-05, + "loss": 1.6524, + "step": 2025 + }, + { + "epoch": 0.6113927825827618, + "grad_norm": 21.290424346923828, + "learning_rate": 1.2185375164022426e-05, + "loss": 1.6892, + "step": 2050 + }, + { + "epoch": 0.6188487921264539, + "grad_norm": 4.9671406745910645, + "learning_rate": 1.2334486460694263e-05, + "loss": 1.2334, + "step": 2075 + }, + { + "epoch": 0.6263048016701461, + "grad_norm": 13.194502830505371, + "learning_rate": 1.2483597757366099e-05, + "loss": 1.3793, + "step": 2100 + }, + { + "epoch": 0.6337608112138383, + "grad_norm": 51.21866989135742, + "learning_rate": 1.2632709054037934e-05, + "loss": 1.6602, + "step": 2125 + }, + { + "epoch": 0.6412168207575306, + "grad_norm": 38.84183883666992, + "learning_rate": 1.278182035070977e-05, + "loss": 1.5312, + "step": 2150 + }, + { + "epoch": 0.6486728303012228, + "grad_norm": 9.87134838104248, + "learning_rate": 1.2930931647381605e-05, + "loss": 1.6768, + "step": 2175 + }, + { + "epoch": 0.656128839844915, + "grad_norm": 30.30939292907715, + "learning_rate": 1.3080042944053441e-05, + "loss": 1.2343, + "step": 2200 + }, + { + "epoch": 0.6635848493886072, + "grad_norm": 64.63691711425781, + "learning_rate": 1.3229154240725277e-05, + "loss": 1.4027, + "step": 2225 + }, + { + "epoch": 0.6710408589322995, + "grad_norm": 29.965591430664062, + "learning_rate": 1.3378265537397114e-05, + "loss": 1.1293, + "step": 2250 + }, + { + "epoch": 0.6784968684759917, + "grad_norm": 44.84248733520508, + "learning_rate": 1.352737683406895e-05, + "loss": 1.6333, + "step": 2275 + }, + { + "epoch": 0.6859528780196839, + "grad_norm": 13.158123016357422, + "learning_rate": 1.3676488130740786e-05, + 
"loss": 1.5718, + "step": 2300 + }, + { + "epoch": 0.693408887563376, + "grad_norm": 50.25960922241211, + "learning_rate": 1.3825599427412623e-05, + "loss": 1.5637, + "step": 2325 + }, + { + "epoch": 0.7008648971070683, + "grad_norm": 33.488555908203125, + "learning_rate": 1.3974710724084459e-05, + "loss": 1.3898, + "step": 2350 + }, + { + "epoch": 0.7083209066507605, + "grad_norm": 43.9734001159668, + "learning_rate": 1.4123822020756294e-05, + "loss": 1.4057, + "step": 2375 + }, + { + "epoch": 0.7157769161944527, + "grad_norm": 50.97043228149414, + "learning_rate": 1.4272933317428128e-05, + "loss": 1.6272, + "step": 2400 + }, + { + "epoch": 0.7232329257381449, + "grad_norm": 25.401338577270508, + "learning_rate": 1.4422044614099963e-05, + "loss": 0.9952, + "step": 2425 + }, + { + "epoch": 0.7306889352818372, + "grad_norm": 6.340377330780029, + "learning_rate": 1.45711559107718e-05, + "loss": 1.0306, + "step": 2450 + }, + { + "epoch": 0.7381449448255294, + "grad_norm": 23.072500228881836, + "learning_rate": 1.4720267207443636e-05, + "loss": 0.7644, + "step": 2475 + }, + { + "epoch": 0.7456009543692216, + "grad_norm": 58.61784362792969, + "learning_rate": 1.4869378504115472e-05, + "loss": 1.4553, + "step": 2500 + }, + { + "epoch": 0.7530569639129138, + "grad_norm": 29.832626342773438, + "learning_rate": 1.5018489800787308e-05, + "loss": 1.1454, + "step": 2525 + }, + { + "epoch": 0.7605129734566061, + "grad_norm": 43.455284118652344, + "learning_rate": 1.5167601097459145e-05, + "loss": 1.6609, + "step": 2550 + }, + { + "epoch": 0.7679689830002983, + "grad_norm": 38.37151336669922, + "learning_rate": 1.531671239413098e-05, + "loss": 1.2618, + "step": 2575 + }, + { + "epoch": 0.7754249925439904, + "grad_norm": 13.673453330993652, + "learning_rate": 1.5465823690802816e-05, + "loss": 1.2044, + "step": 2600 + }, + { + "epoch": 0.7828810020876826, + "grad_norm": 38.352325439453125, + "learning_rate": 1.5614934987474654e-05, + "loss": 1.2281, + "step": 2625 + }, + { + "epoch": 0.7903370116313749, + "grad_norm": 53.28618621826172, + "learning_rate": 1.576404628414649e-05, + "loss": 1.6322, + "step": 2650 + }, + { + "epoch": 0.7977930211750671, + "grad_norm": 7.921282768249512, + "learning_rate": 1.5913157580818326e-05, + "loss": 1.06, + "step": 2675 + }, + { + "epoch": 0.8052490307187593, + "grad_norm": 8.301175117492676, + "learning_rate": 1.6062268877490158e-05, + "loss": 1.0309, + "step": 2700 + }, + { + "epoch": 0.8127050402624515, + "grad_norm": 62.71298599243164, + "learning_rate": 1.6211380174161996e-05, + "loss": 0.9688, + "step": 2725 + }, + { + "epoch": 0.8201610498061438, + "grad_norm": 27.857236862182617, + "learning_rate": 1.636049147083383e-05, + "loss": 1.2604, + "step": 2750 + }, + { + "epoch": 0.827617059349836, + "grad_norm": 38.996822357177734, + "learning_rate": 1.6503638315638794e-05, + "loss": 1.4185, + "step": 2775 + }, + { + "epoch": 0.8350730688935282, + "grad_norm": 29.560543060302734, + "learning_rate": 1.6652749612310632e-05, + "loss": 1.6107, + "step": 2800 + }, + { + "epoch": 0.8425290784372204, + "grad_norm": 17.703155517578125, + "learning_rate": 1.6801860908982463e-05, + "loss": 1.1194, + "step": 2825 + }, + { + "epoch": 0.8499850879809127, + "grad_norm": 23.545133590698242, + "learning_rate": 1.69509722056543e-05, + "loss": 1.1722, + "step": 2850 + }, + { + "epoch": 0.8574410975246048, + "grad_norm": 37.812320709228516, + "learning_rate": 1.7100083502326136e-05, + "loss": 1.6829, + "step": 2875 + }, + { + "epoch": 0.864897107068297, + "grad_norm": 
3.4065306186676025, + "learning_rate": 1.724919479899797e-05, + "loss": 1.3637, + "step": 2900 + }, + { + "epoch": 0.8723531166119892, + "grad_norm": 52.31818771362305, + "learning_rate": 1.739830609566981e-05, + "loss": 1.4565, + "step": 2925 + }, + { + "epoch": 0.8798091261556815, + "grad_norm": 3.736452341079712, + "learning_rate": 1.7547417392341643e-05, + "loss": 0.9151, + "step": 2950 + }, + { + "epoch": 0.8872651356993737, + "grad_norm": 40.5432014465332, + "learning_rate": 1.769652868901348e-05, + "loss": 1.0446, + "step": 2975 + }, + { + "epoch": 0.8947211452430659, + "grad_norm": 29.45325469970703, + "learning_rate": 1.7845639985685316e-05, + "loss": 1.0087, + "step": 3000 + }, + { + "epoch": 0.9021771547867581, + "grad_norm": 22.604948043823242, + "learning_rate": 1.7994751282357154e-05, + "loss": 1.0438, + "step": 3025 + }, + { + "epoch": 0.9096331643304504, + "grad_norm": 34.655269622802734, + "learning_rate": 1.814386257902899e-05, + "loss": 1.1603, + "step": 3050 + }, + { + "epoch": 0.9170891738741426, + "grad_norm": 33.17934799194336, + "learning_rate": 1.8292973875700823e-05, + "loss": 0.9943, + "step": 3075 + }, + { + "epoch": 0.9245451834178348, + "grad_norm": 11.942368507385254, + "learning_rate": 1.844208517237266e-05, + "loss": 1.0218, + "step": 3100 + }, + { + "epoch": 0.932001192961527, + "grad_norm": 39.37548065185547, + "learning_rate": 1.8591196469044496e-05, + "loss": 1.4509, + "step": 3125 + }, + { + "epoch": 0.9394572025052192, + "grad_norm": 16.941686630249023, + "learning_rate": 1.874030776571633e-05, + "loss": 0.9837, + "step": 3150 + }, + { + "epoch": 0.9469132120489114, + "grad_norm": 8.383081436157227, + "learning_rate": 1.8889419062388165e-05, + "loss": 0.8842, + "step": 3175 + }, + { + "epoch": 0.9543692215926036, + "grad_norm": 49.28553009033203, + "learning_rate": 1.9038530359060003e-05, + "loss": 1.2376, + "step": 3200 + }, + { + "epoch": 0.9618252311362958, + "grad_norm": 12.15333366394043, + "learning_rate": 1.9187641655731838e-05, + "loss": 0.8409, + "step": 3225 + }, + { + "epoch": 0.9692812406799881, + "grad_norm": 9.068802833557129, + "learning_rate": 1.9336752952403676e-05, + "loss": 1.17, + "step": 3250 + }, + { + "epoch": 0.9767372502236803, + "grad_norm": 15.055663108825684, + "learning_rate": 1.948586424907551e-05, + "loss": 0.9645, + "step": 3275 + }, + { + "epoch": 0.9841932597673725, + "grad_norm": 25.818368911743164, + "learning_rate": 1.963497554574735e-05, + "loss": 0.9591, + "step": 3300 + }, + { + "epoch": 0.9916492693110647, + "grad_norm": 22.713308334350586, + "learning_rate": 1.9784086842419183e-05, + "loss": 1.187, + "step": 3325 + }, + { + "epoch": 0.999105278854757, + "grad_norm": 16.34677505493164, + "learning_rate": 1.9933198139091018e-05, + "loss": 0.8725, + "step": 3350 + }, + { + "epoch": 1.0, + "eval_gen_len": 8.8074, + "eval_loss": 0.8921580910682678, + "eval_rouge1": 70.8609, + "eval_rouge2": 52.0236, + "eval_rougeL": 70.7329, + "eval_rougeLsum": 70.7231, + "eval_runtime": 95.4163, + "eval_samples_per_second": 17.576, + "eval_steps_per_second": 4.402, + "step": 3353 + }, + { + "epoch": 1.0065612883984492, + "grad_norm": 18.03757667541504, + "learning_rate": 2.0082309435762856e-05, + "loss": 1.0941, + "step": 3375 + }, + { + "epoch": 1.0140172979421413, + "grad_norm": 16.70610809326172, + "learning_rate": 2.023142073243469e-05, + "loss": 0.9132, + "step": 3400 + }, + { + "epoch": 1.0214733074858335, + "grad_norm": 6.246175289154053, + "learning_rate": 2.0380532029106525e-05, + "loss": 0.892, + "step": 3425 + }, + { + 
"epoch": 1.0289293170295257, + "grad_norm": 24.790555953979492, + "learning_rate": 2.052964332577836e-05, + "loss": 0.7534, + "step": 3450 + }, + { + "epoch": 1.036385326573218, + "grad_norm": 31.111759185791016, + "learning_rate": 2.0678754622450198e-05, + "loss": 0.6501, + "step": 3475 + }, + { + "epoch": 1.0438413361169103, + "grad_norm": 29.69679069519043, + "learning_rate": 2.0827865919122033e-05, + "loss": 0.6393, + "step": 3500 + }, + { + "epoch": 1.0512973456606025, + "grad_norm": 31.25632095336914, + "learning_rate": 2.097697721579387e-05, + "loss": 1.1405, + "step": 3525 + }, + { + "epoch": 1.0587533552042947, + "grad_norm": 15.574368476867676, + "learning_rate": 2.1126088512465705e-05, + "loss": 0.9955, + "step": 3550 + }, + { + "epoch": 1.0662093647479869, + "grad_norm": 4.4662370681762695, + "learning_rate": 2.127519980913754e-05, + "loss": 1.0895, + "step": 3575 + }, + { + "epoch": 1.073665374291679, + "grad_norm": 36.54032516479492, + "learning_rate": 2.1424311105809378e-05, + "loss": 0.6859, + "step": 3600 + }, + { + "epoch": 1.0811213838353713, + "grad_norm": 17.539880752563477, + "learning_rate": 2.1573422402481213e-05, + "loss": 0.7532, + "step": 3625 + }, + { + "epoch": 1.0885773933790635, + "grad_norm": 9.007013320922852, + "learning_rate": 2.172253369915305e-05, + "loss": 0.6377, + "step": 3650 + }, + { + "epoch": 1.0960334029227556, + "grad_norm": 32.22686004638672, + "learning_rate": 2.1871644995824885e-05, + "loss": 0.8879, + "step": 3675 + }, + { + "epoch": 1.103489412466448, + "grad_norm": 18.83819580078125, + "learning_rate": 2.202075629249672e-05, + "loss": 0.8378, + "step": 3700 + }, + { + "epoch": 1.1109454220101402, + "grad_norm": 27.475950241088867, + "learning_rate": 2.2169867589168555e-05, + "loss": 1.0073, + "step": 3725 + }, + { + "epoch": 1.1184014315538324, + "grad_norm": 36.23466110229492, + "learning_rate": 2.2318978885840393e-05, + "loss": 0.6546, + "step": 3750 + }, + { + "epoch": 1.1258574410975246, + "grad_norm": 23.86202049255371, + "learning_rate": 2.2468090182512227e-05, + "loss": 0.9016, + "step": 3775 + }, + { + "epoch": 1.1333134506412168, + "grad_norm": 1.7889130115509033, + "learning_rate": 2.2617201479184062e-05, + "loss": 0.9408, + "step": 3800 + }, + { + "epoch": 1.140769460184909, + "grad_norm": 11.396666526794434, + "learning_rate": 2.27663127758559e-05, + "loss": 0.843, + "step": 3825 + }, + { + "epoch": 1.1482254697286012, + "grad_norm": 5.5733113288879395, + "learning_rate": 2.2915424072527735e-05, + "loss": 0.8923, + "step": 3850 + }, + { + "epoch": 1.1556814792722934, + "grad_norm": 23.367023468017578, + "learning_rate": 2.3064535369199573e-05, + "loss": 0.5774, + "step": 3875 + }, + { + "epoch": 1.1631374888159858, + "grad_norm": 33.5429573059082, + "learning_rate": 2.3213646665871407e-05, + "loss": 0.9683, + "step": 3900 + }, + { + "epoch": 1.170593498359678, + "grad_norm": 37.91108703613281, + "learning_rate": 2.3362757962543245e-05, + "loss": 0.8493, + "step": 3925 + }, + { + "epoch": 1.1780495079033702, + "grad_norm": 22.969072341918945, + "learning_rate": 2.351186925921508e-05, + "loss": 1.0987, + "step": 3950 + }, + { + "epoch": 1.1855055174470623, + "grad_norm": 29.6101131439209, + "learning_rate": 2.3660980555886915e-05, + "loss": 0.8729, + "step": 3975 + }, + { + "epoch": 1.1929615269907545, + "grad_norm": 9.252951622009277, + "learning_rate": 2.381009185255875e-05, + "loss": 0.6307, + "step": 4000 + }, + { + "epoch": 1.2004175365344467, + "grad_norm": 20.931673049926758, + "learning_rate": 2.3959203149230587e-05, + 
"loss": 0.6223, + "step": 4025 + }, + { + "epoch": 1.207873546078139, + "grad_norm": 14.8717679977417, + "learning_rate": 2.4108314445902422e-05, + "loss": 0.9005, + "step": 4050 + }, + { + "epoch": 1.2153295556218313, + "grad_norm": 45.239173889160156, + "learning_rate": 2.4257425742574257e-05, + "loss": 0.9411, + "step": 4075 + }, + { + "epoch": 1.2227855651655235, + "grad_norm": 2.455791473388672, + "learning_rate": 2.4406537039246095e-05, + "loss": 0.6699, + "step": 4100 + }, + { + "epoch": 1.2302415747092157, + "grad_norm": 9.25529956817627, + "learning_rate": 2.455564833591793e-05, + "loss": 0.9491, + "step": 4125 + }, + { + "epoch": 1.2376975842529079, + "grad_norm": 13.52274227142334, + "learning_rate": 2.4704759632589767e-05, + "loss": 1.1475, + "step": 4150 + }, + { + "epoch": 1.2451535937966, + "grad_norm": 1.3682821989059448, + "learning_rate": 2.4853870929261602e-05, + "loss": 0.6599, + "step": 4175 + }, + { + "epoch": 1.2526096033402923, + "grad_norm": 41.495574951171875, + "learning_rate": 2.5002982225933437e-05, + "loss": 0.8708, + "step": 4200 + }, + { + "epoch": 1.2600656128839844, + "grad_norm": 10.83969497680664, + "learning_rate": 2.5152093522605275e-05, + "loss": 0.6794, + "step": 4225 + }, + { + "epoch": 1.2675216224276766, + "grad_norm": 24.65216827392578, + "learning_rate": 2.530120481927711e-05, + "loss": 0.5716, + "step": 4250 + }, + { + "epoch": 1.2749776319713688, + "grad_norm": 25.4478759765625, + "learning_rate": 2.5450316115948947e-05, + "loss": 0.8103, + "step": 4275 + }, + { + "epoch": 1.2824336415150612, + "grad_norm": 35.4855842590332, + "learning_rate": 2.559942741262078e-05, + "loss": 0.9094, + "step": 4300 + }, + { + "epoch": 1.2898896510587534, + "grad_norm": 24.363826751708984, + "learning_rate": 2.574853870929262e-05, + "loss": 0.9492, + "step": 4325 + }, + { + "epoch": 1.2973456606024456, + "grad_norm": 0.5661748051643372, + "learning_rate": 2.589765000596445e-05, + "loss": 0.895, + "step": 4350 + }, + { + "epoch": 1.3048016701461378, + "grad_norm": 38.63605499267578, + "learning_rate": 2.6046761302636293e-05, + "loss": 0.7079, + "step": 4375 + }, + { + "epoch": 1.31225767968983, + "grad_norm": 25.456178665161133, + "learning_rate": 2.6195872599308124e-05, + "loss": 0.7659, + "step": 4400 + }, + { + "epoch": 1.3197136892335222, + "grad_norm": 16.090669631958008, + "learning_rate": 2.6344983895979962e-05, + "loss": 0.6873, + "step": 4425 + }, + { + "epoch": 1.3271696987772144, + "grad_norm": 3.9270734786987305, + "learning_rate": 2.6494095192651797e-05, + "loss": 0.7105, + "step": 4450 + }, + { + "epoch": 1.3346257083209068, + "grad_norm": 22.171485900878906, + "learning_rate": 2.664320648932363e-05, + "loss": 0.6842, + "step": 4475 + }, + { + "epoch": 1.3420817178645987, + "grad_norm": 9.457509994506836, + "learning_rate": 2.679231778599547e-05, + "loss": 0.8012, + "step": 4500 + }, + { + "epoch": 1.3495377274082911, + "grad_norm": 33.985633850097656, + "learning_rate": 2.69414290826673e-05, + "loss": 0.8246, + "step": 4525 + }, + { + "epoch": 1.3569937369519833, + "grad_norm": 21.28675079345703, + "learning_rate": 2.7090540379339142e-05, + "loss": 0.7623, + "step": 4550 + }, + { + "epoch": 1.3644497464956755, + "grad_norm": 47.42694091796875, + "learning_rate": 2.7239651676010973e-05, + "loss": 1.1292, + "step": 4575 + }, + { + "epoch": 1.3719057560393677, + "grad_norm": 25.696435928344727, + "learning_rate": 2.7388762972682815e-05, + "loss": 0.7367, + "step": 4600 + }, + { + "epoch": 1.37936176558306, + "grad_norm": 43.43827819824219, + 
"learning_rate": 2.7537874269354646e-05, + "loss": 0.8869, + "step": 4625 + }, + { + "epoch": 1.386817775126752, + "grad_norm": 26.207239151000977, + "learning_rate": 2.7686985566026484e-05, + "loss": 1.0223, + "step": 4650 + }, + { + "epoch": 1.3942737846704443, + "grad_norm": 0.6299751996994019, + "learning_rate": 2.783609686269832e-05, + "loss": 0.4644, + "step": 4675 + }, + { + "epoch": 1.4017297942141367, + "grad_norm": 4.853407382965088, + "learning_rate": 2.7985208159370157e-05, + "loss": 0.7567, + "step": 4700 + }, + { + "epoch": 1.4091858037578289, + "grad_norm": 2.0797219276428223, + "learning_rate": 2.813431945604199e-05, + "loss": 0.8935, + "step": 4725 + }, + { + "epoch": 1.416641813301521, + "grad_norm": 4.396637916564941, + "learning_rate": 2.8283430752713826e-05, + "loss": 0.6973, + "step": 4750 + }, + { + "epoch": 1.4240978228452132, + "grad_norm": 27.564001083374023, + "learning_rate": 2.8432542049385664e-05, + "loss": 0.7045, + "step": 4775 + }, + { + "epoch": 1.4315538323889054, + "grad_norm": 11.872142791748047, + "learning_rate": 2.8581653346057495e-05, + "loss": 0.7692, + "step": 4800 + }, + { + "epoch": 1.4390098419325976, + "grad_norm": 14.561450958251953, + "learning_rate": 2.8730764642729337e-05, + "loss": 0.533, + "step": 4825 + }, + { + "epoch": 1.4464658514762898, + "grad_norm": 5.9031147956848145, + "learning_rate": 2.8879875939401168e-05, + "loss": 0.6997, + "step": 4850 + }, + { + "epoch": 1.4539218610199822, + "grad_norm": 17.50626564025879, + "learning_rate": 2.9028987236073006e-05, + "loss": 0.5615, + "step": 4875 + }, + { + "epoch": 1.4613778705636742, + "grad_norm": 1.092649221420288, + "learning_rate": 2.917809853274484e-05, + "loss": 0.834, + "step": 4900 + }, + { + "epoch": 1.4688338801073666, + "grad_norm": 1.7949668169021606, + "learning_rate": 2.932720982941668e-05, + "loss": 0.7914, + "step": 4925 + }, + { + "epoch": 1.4762898896510588, + "grad_norm": 45.338478088378906, + "learning_rate": 2.9476321126088513e-05, + "loss": 0.838, + "step": 4950 + }, + { + "epoch": 1.483745899194751, + "grad_norm": 42.90887451171875, + "learning_rate": 2.962543242276035e-05, + "loss": 0.5953, + "step": 4975 + }, + { + "epoch": 1.4912019087384432, + "grad_norm": 27.481857299804688, + "learning_rate": 2.9774543719432186e-05, + "loss": 0.644, + "step": 5000 + }, + { + "epoch": 1.4986579182821353, + "grad_norm": 12.581745147705078, + "learning_rate": 2.9923655016104024e-05, + "loss": 0.7452, + "step": 5025 + }, + { + "epoch": 1.5061139278258278, + "grad_norm": 6.364358425140381, + "learning_rate": 3.007276631277586e-05, + "loss": 0.5567, + "step": 5050 + }, + { + "epoch": 1.5135699373695197, + "grad_norm": 16.981321334838867, + "learning_rate": 3.022187760944769e-05, + "loss": 0.3983, + "step": 5075 + }, + { + "epoch": 1.5210259469132121, + "grad_norm": 41.80778503417969, + "learning_rate": 3.037098890611953e-05, + "loss": 0.6953, + "step": 5100 + }, + { + "epoch": 1.528481956456904, + "grad_norm": 12.307695388793945, + "learning_rate": 3.052010020279136e-05, + "loss": 0.8288, + "step": 5125 + }, + { + "epoch": 1.5359379660005965, + "grad_norm": 48.170352935791016, + "learning_rate": 3.06692114994632e-05, + "loss": 0.7873, + "step": 5150 + }, + { + "epoch": 1.5433939755442887, + "grad_norm": 11.568841934204102, + "learning_rate": 3.081832279613504e-05, + "loss": 0.8033, + "step": 5175 + }, + { + "epoch": 1.5508499850879809, + "grad_norm": 37.305301666259766, + "learning_rate": 3.0967434092806877e-05, + "loss": 0.7461, + "step": 5200 + }, + { + "epoch": 
1.558305994631673, + "grad_norm": 47.25547790527344, + "learning_rate": 3.111654538947871e-05, + "loss": 0.7598, + "step": 5225 + }, + { + "epoch": 1.5657620041753653, + "grad_norm": 45.243309020996094, + "learning_rate": 3.1265656686150546e-05, + "loss": 0.5867, + "step": 5250 + }, + { + "epoch": 1.5732180137190577, + "grad_norm": 10.84839916229248, + "learning_rate": 3.141476798282238e-05, + "loss": 0.7368, + "step": 5275 + }, + { + "epoch": 1.5806740232627496, + "grad_norm": 31.785808563232422, + "learning_rate": 3.1563879279494215e-05, + "loss": 0.8298, + "step": 5300 + }, + { + "epoch": 1.588130032806442, + "grad_norm": 28.766090393066406, + "learning_rate": 3.171299057616605e-05, + "loss": 0.7021, + "step": 5325 + }, + { + "epoch": 1.5955860423501342, + "grad_norm": 7.013548374176025, + "learning_rate": 3.1862101872837884e-05, + "loss": 0.6686, + "step": 5350 + }, + { + "epoch": 1.6030420518938264, + "grad_norm": 19.23529052734375, + "learning_rate": 3.201121316950972e-05, + "loss": 0.9054, + "step": 5375 + }, + { + "epoch": 1.6104980614375186, + "grad_norm": 19.3592529296875, + "learning_rate": 3.216032446618156e-05, + "loss": 0.7062, + "step": 5400 + }, + { + "epoch": 1.6179540709812108, + "grad_norm": 20.657779693603516, + "learning_rate": 3.23094357628534e-05, + "loss": 0.6802, + "step": 5425 + }, + { + "epoch": 1.6254100805249032, + "grad_norm": 27.61529541015625, + "learning_rate": 3.245854705952523e-05, + "loss": 0.7271, + "step": 5450 + }, + { + "epoch": 1.6328660900685952, + "grad_norm": 15.712939262390137, + "learning_rate": 3.260765835619707e-05, + "loss": 0.5594, + "step": 5475 + }, + { + "epoch": 1.6403220996122876, + "grad_norm": 45.1962776184082, + "learning_rate": 3.27567696528689e-05, + "loss": 0.8737, + "step": 5500 + }, + { + "epoch": 1.6477781091559796, + "grad_norm": 27.638429641723633, + "learning_rate": 3.2905880949540744e-05, + "loss": 0.5651, + "step": 5525 + }, + { + "epoch": 1.655234118699672, + "grad_norm": 9.217710494995117, + "learning_rate": 3.3054992246212575e-05, + "loss": 0.4875, + "step": 5550 + }, + { + "epoch": 1.6626901282433642, + "grad_norm": 34.59236145019531, + "learning_rate": 3.320410354288441e-05, + "loss": 0.5334, + "step": 5575 + }, + { + "epoch": 1.6701461377870563, + "grad_norm": 26.130611419677734, + "learning_rate": 3.3353214839556244e-05, + "loss": 0.5423, + "step": 5600 + }, + { + "epoch": 1.6776021473307487, + "grad_norm": 28.84840965270996, + "learning_rate": 3.350232613622808e-05, + "loss": 0.8482, + "step": 5625 + }, + { + "epoch": 1.6850581568744407, + "grad_norm": 27.17482566833496, + "learning_rate": 3.365143743289992e-05, + "loss": 0.7935, + "step": 5650 + }, + { + "epoch": 1.6925141664181331, + "grad_norm": 8.833431243896484, + "learning_rate": 3.380054872957175e-05, + "loss": 0.7983, + "step": 5675 + }, + { + "epoch": 1.699970175961825, + "grad_norm": 15.625532150268555, + "learning_rate": 3.394966002624359e-05, + "loss": 0.5971, + "step": 5700 + }, + { + "epoch": 1.7074261855055175, + "grad_norm": 24.900188446044922, + "learning_rate": 3.409877132291542e-05, + "loss": 0.6406, + "step": 5725 + }, + { + "epoch": 1.7148821950492097, + "grad_norm": 36.51063919067383, + "learning_rate": 3.4247882619587266e-05, + "loss": 0.5764, + "step": 5750 + }, + { + "epoch": 1.7223382045929019, + "grad_norm": 33.81806564331055, + "learning_rate": 3.43969939162591e-05, + "loss": 0.6808, + "step": 5775 + }, + { + "epoch": 1.729794214136594, + "grad_norm": 21.35527801513672, + "learning_rate": 3.4546105212930935e-05, + "loss": 0.6032, + 
"step": 5800 + }, + { + "epoch": 1.7372502236802863, + "grad_norm": 0.8462117314338684, + "learning_rate": 3.4695216509602766e-05, + "loss": 0.7005, + "step": 5825 + }, + { + "epoch": 1.7447062332239787, + "grad_norm": 13.562004089355469, + "learning_rate": 3.4844327806274604e-05, + "loss": 0.4941, + "step": 5850 + }, + { + "epoch": 1.7521622427676706, + "grad_norm": 34.314453125, + "learning_rate": 3.499343910294644e-05, + "loss": 0.5255, + "step": 5875 + }, + { + "epoch": 1.759618252311363, + "grad_norm": 53.929893493652344, + "learning_rate": 3.514255039961828e-05, + "loss": 0.5339, + "step": 5900 + }, + { + "epoch": 1.7670742618550552, + "grad_norm": 0.5989261269569397, + "learning_rate": 3.529166169629011e-05, + "loss": 0.7101, + "step": 5925 + }, + { + "epoch": 1.7745302713987474, + "grad_norm": 12.922759056091309, + "learning_rate": 3.544077299296194e-05, + "loss": 0.6036, + "step": 5950 + }, + { + "epoch": 1.7819862809424396, + "grad_norm": 38.75687026977539, + "learning_rate": 3.558988428963379e-05, + "loss": 0.6559, + "step": 5975 + }, + { + "epoch": 1.7894422904861318, + "grad_norm": 48.64384078979492, + "learning_rate": 3.573899558630562e-05, + "loss": 0.5848, + "step": 6000 + }, + { + "epoch": 1.7968983000298242, + "grad_norm": 24.728282928466797, + "learning_rate": 3.588810688297746e-05, + "loss": 0.4153, + "step": 6025 + }, + { + "epoch": 1.8043543095735162, + "grad_norm": 34.48488998413086, + "learning_rate": 3.603721817964929e-05, + "loss": 0.478, + "step": 6050 + }, + { + "epoch": 1.8118103191172086, + "grad_norm": 25.55767250061035, + "learning_rate": 3.6186329476321126e-05, + "loss": 0.6322, + "step": 6075 + }, + { + "epoch": 1.8192663286609005, + "grad_norm": 30.601152420043945, + "learning_rate": 3.6335440772992964e-05, + "loss": 0.6907, + "step": 6100 + }, + { + "epoch": 1.826722338204593, + "grad_norm": 9.281543731689453, + "learning_rate": 3.64845520696648e-05, + "loss": 0.7105, + "step": 6125 + }, + { + "epoch": 1.8341783477482851, + "grad_norm": 0.9574500918388367, + "learning_rate": 3.6633663366336634e-05, + "loss": 0.7844, + "step": 6150 + }, + { + "epoch": 1.8416343572919773, + "grad_norm": 6.887031078338623, + "learning_rate": 3.678277466300847e-05, + "loss": 0.7604, + "step": 6175 + }, + { + "epoch": 1.8490903668356695, + "grad_norm": 15.275524139404297, + "learning_rate": 3.693188595968031e-05, + "loss": 0.453, + "step": 6200 + }, + { + "epoch": 1.8565463763793617, + "grad_norm": 33.609596252441406, + "learning_rate": 3.708099725635215e-05, + "loss": 0.9226, + "step": 6225 + }, + { + "epoch": 1.8640023859230541, + "grad_norm": 12.9977388381958, + "learning_rate": 3.723010855302398e-05, + "loss": 0.6253, + "step": 6250 + }, + { + "epoch": 1.871458395466746, + "grad_norm": 1.9839307069778442, + "learning_rate": 3.737921984969581e-05, + "loss": 0.5391, + "step": 6275 + }, + { + "epoch": 1.8789144050104385, + "grad_norm": 48.89108657836914, + "learning_rate": 3.752833114636765e-05, + "loss": 0.5714, + "step": 6300 + }, + { + "epoch": 1.8863704145541307, + "grad_norm": 45.01472091674805, + "learning_rate": 3.7677442443039486e-05, + "loss": 0.7613, + "step": 6325 + }, + { + "epoch": 1.8938264240978229, + "grad_norm": 16.330175399780273, + "learning_rate": 3.7826553739711324e-05, + "loss": 0.4172, + "step": 6350 + }, + { + "epoch": 1.901282433641515, + "grad_norm": 26.01683235168457, + "learning_rate": 3.7975665036383156e-05, + "loss": 0.4404, + "step": 6375 + }, + { + "epoch": 1.9087384431852072, + "grad_norm": 2.129399061203003, + "learning_rate": 
3.8124776333054994e-05, + "loss": 0.4257, + "step": 6400 + }, + { + "epoch": 1.9161944527288997, + "grad_norm": 15.925755500793457, + "learning_rate": 3.827388762972683e-05, + "loss": 0.4837, + "step": 6425 + }, + { + "epoch": 1.9236504622725916, + "grad_norm": 57.3784065246582, + "learning_rate": 3.842299892639867e-05, + "loss": 0.5822, + "step": 6450 + }, + { + "epoch": 1.931106471816284, + "grad_norm": 20.885623931884766, + "learning_rate": 3.85721102230705e-05, + "loss": 0.5244, + "step": 6475 + }, + { + "epoch": 1.938562481359976, + "grad_norm": 14.874778747558594, + "learning_rate": 3.872122151974234e-05, + "loss": 0.7411, + "step": 6500 + }, + { + "epoch": 1.9460184909036684, + "grad_norm": 35.995628356933594, + "learning_rate": 3.887033281641417e-05, + "loss": 0.6805, + "step": 6525 + }, + { + "epoch": 1.9534745004473606, + "grad_norm": 18.882844924926758, + "learning_rate": 3.901944411308601e-05, + "loss": 0.2871, + "step": 6550 + }, + { + "epoch": 1.9609305099910528, + "grad_norm": 18.62187385559082, + "learning_rate": 3.9168555409757846e-05, + "loss": 0.4653, + "step": 6575 + }, + { + "epoch": 1.968386519534745, + "grad_norm": 6.905665397644043, + "learning_rate": 3.931766670642968e-05, + "loss": 0.6991, + "step": 6600 + }, + { + "epoch": 1.9758425290784372, + "grad_norm": 14.318357467651367, + "learning_rate": 3.9466778003101516e-05, + "loss": 0.6759, + "step": 6625 + }, + { + "epoch": 1.9832985386221296, + "grad_norm": 16.711145401000977, + "learning_rate": 3.9615889299773354e-05, + "loss": 0.6427, + "step": 6650 + }, + { + "epoch": 1.9907545481658215, + "grad_norm": 16.86481475830078, + "learning_rate": 3.976500059644519e-05, + "loss": 0.3138, + "step": 6675 + }, + { + "epoch": 1.998210557709514, + "grad_norm": 37.8138313293457, + "learning_rate": 3.991411189311702e-05, + "loss": 0.6059, + "step": 6700 + }, + { + "epoch": 2.0, + "eval_gen_len": 8.746, + "eval_loss": 0.5014411211013794, + "eval_rouge1": 77.2572, + "eval_rouge2": 60.0488, + "eval_rougeL": 77.0876, + "eval_rougeLsum": 77.0478, + "eval_runtime": 96.4359, + "eval_samples_per_second": 17.39, + "eval_steps_per_second": 4.355, + "step": 6706 + }, + { + "epoch": 2.005666567253206, + "grad_norm": 7.6668219566345215, + "learning_rate": 4.006322318978886e-05, + "loss": 0.3362, + "step": 6725 + }, + { + "epoch": 2.0131225767968983, + "grad_norm": 13.261967658996582, + "learning_rate": 4.021233448646069e-05, + "loss": 0.4647, + "step": 6750 + }, + { + "epoch": 2.0205785863405907, + "grad_norm": 36.11294937133789, + "learning_rate": 4.036144578313254e-05, + "loss": 0.3214, + "step": 6775 + }, + { + "epoch": 2.0280345958842827, + "grad_norm": 19.81526756286621, + "learning_rate": 4.051055707980437e-05, + "loss": 0.311, + "step": 6800 + }, + { + "epoch": 2.035490605427975, + "grad_norm": 9.729710578918457, + "learning_rate": 4.0659668376476206e-05, + "loss": 0.4271, + "step": 6825 + }, + { + "epoch": 2.042946614971667, + "grad_norm": 20.228961944580078, + "learning_rate": 4.080877967314804e-05, + "loss": 0.419, + "step": 6850 + }, + { + "epoch": 2.0504026245153595, + "grad_norm": 1.6113264560699463, + "learning_rate": 4.0957890969819876e-05, + "loss": 0.4115, + "step": 6875 + }, + { + "epoch": 2.0578586340590515, + "grad_norm": 11.554544448852539, + "learning_rate": 4.1107002266491714e-05, + "loss": 0.492, + "step": 6900 + }, + { + "epoch": 2.065314643602744, + "grad_norm": 46.82448959350586, + "learning_rate": 4.1256113563163545e-05, + "loss": 0.3516, + "step": 6925 + }, + { + "epoch": 2.072770653146436, + "grad_norm": 
9.364810943603516, + "learning_rate": 4.140522485983538e-05, + "loss": 0.3212, + "step": 6950 + }, + { + "epoch": 2.0802266626901282, + "grad_norm": 21.646303176879883, + "learning_rate": 4.1554336156507214e-05, + "loss": 0.5081, + "step": 6975 + }, + { + "epoch": 2.0876826722338206, + "grad_norm": 9.770012855529785, + "learning_rate": 4.170344745317906e-05, + "loss": 0.2004, + "step": 7000 + }, + { + "epoch": 2.0951386817775126, + "grad_norm": 0.1513725370168686, + "learning_rate": 4.185255874985089e-05, + "loss": 0.4032, + "step": 7025 + }, + { + "epoch": 2.102594691321205, + "grad_norm": 26.403615951538086, + "learning_rate": 4.200167004652273e-05, + "loss": 0.2091, + "step": 7050 + }, + { + "epoch": 2.110050700864897, + "grad_norm": 11.628287315368652, + "learning_rate": 4.215078134319456e-05, + "loss": 0.406, + "step": 7075 + }, + { + "epoch": 2.1175067104085894, + "grad_norm": 24.525611877441406, + "learning_rate": 4.22998926398664e-05, + "loss": 0.4365, + "step": 7100 + }, + { + "epoch": 2.1249627199522814, + "grad_norm": 31.789121627807617, + "learning_rate": 4.2449003936538236e-05, + "loss": 0.3611, + "step": 7125 + }, + { + "epoch": 2.1324187294959738, + "grad_norm": 21.67193603515625, + "learning_rate": 4.259811523321007e-05, + "loss": 0.3525, + "step": 7150 + }, + { + "epoch": 2.1398747390396657, + "grad_norm": 2.3932220935821533, + "learning_rate": 4.2747226529881905e-05, + "loss": 0.3076, + "step": 7175 + }, + { + "epoch": 2.147330748583358, + "grad_norm": 24.538375854492188, + "learning_rate": 4.289633782655374e-05, + "loss": 0.4386, + "step": 7200 + }, + { + "epoch": 2.1547867581270506, + "grad_norm": 4.2655229568481445, + "learning_rate": 4.304544912322558e-05, + "loss": 0.2893, + "step": 7225 + }, + { + "epoch": 2.1622427676707425, + "grad_norm": 19.003803253173828, + "learning_rate": 4.319456041989741e-05, + "loss": 0.4218, + "step": 7250 + }, + { + "epoch": 2.169698777214435, + "grad_norm": 0.4998781681060791, + "learning_rate": 4.334367171656925e-05, + "loss": 0.3635, + "step": 7275 + }, + { + "epoch": 2.177154786758127, + "grad_norm": 10.384448051452637, + "learning_rate": 4.349278301324108e-05, + "loss": 0.4281, + "step": 7300 + }, + { + "epoch": 2.1846107963018193, + "grad_norm": 14.134237289428711, + "learning_rate": 4.3641894309912926e-05, + "loss": 0.4452, + "step": 7325 + }, + { + "epoch": 2.1920668058455113, + "grad_norm": 11.797569274902344, + "learning_rate": 4.379100560658476e-05, + "loss": 0.4506, + "step": 7350 + }, + { + "epoch": 2.1995228153892037, + "grad_norm": 27.334152221679688, + "learning_rate": 4.3940116903256596e-05, + "loss": 0.3131, + "step": 7375 + }, + { + "epoch": 2.206978824932896, + "grad_norm": 0.558601438999176, + "learning_rate": 4.408922819992843e-05, + "loss": 0.2482, + "step": 7400 + }, + { + "epoch": 2.214434834476588, + "grad_norm": 22.792238235473633, + "learning_rate": 4.4238339496600265e-05, + "loss": 0.4553, + "step": 7425 + }, + { + "epoch": 2.2218908440202805, + "grad_norm": 11.284687042236328, + "learning_rate": 4.43874507932721e-05, + "loss": 0.4019, + "step": 7450 + }, + { + "epoch": 2.2293468535639724, + "grad_norm": 31.599943161010742, + "learning_rate": 4.4536562089943934e-05, + "loss": 0.4052, + "step": 7475 + }, + { + "epoch": 2.236802863107665, + "grad_norm": 22.682697296142578, + "learning_rate": 4.468567338661577e-05, + "loss": 0.3055, + "step": 7500 + }, + { + "epoch": 2.244258872651357, + "grad_norm": 9.347318649291992, + "learning_rate": 4.4834784683287603e-05, + "loss": 0.3473, + "step": 7525 + }, + { + 
"epoch": 2.2517148821950492, + "grad_norm": 7.00664758682251, + "learning_rate": 4.498389597995945e-05, + "loss": 0.481, + "step": 7550 + }, + { + "epoch": 2.2591708917387416, + "grad_norm": 12.196782112121582, + "learning_rate": 4.513300727663128e-05, + "loss": 0.2791, + "step": 7575 + }, + { + "epoch": 2.2666269012824336, + "grad_norm": 1.9481580257415771, + "learning_rate": 4.528211857330312e-05, + "loss": 0.5509, + "step": 7600 + }, + { + "epoch": 2.274082910826126, + "grad_norm": 5.687070846557617, + "learning_rate": 4.543122986997495e-05, + "loss": 0.438, + "step": 7625 + }, + { + "epoch": 2.281538920369818, + "grad_norm": 33.834632873535156, + "learning_rate": 4.558034116664679e-05, + "loss": 0.3629, + "step": 7650 + }, + { + "epoch": 2.2889949299135104, + "grad_norm": 13.697815895080566, + "learning_rate": 4.5729452463318625e-05, + "loss": 0.3274, + "step": 7675 + }, + { + "epoch": 2.2964509394572024, + "grad_norm": 13.013066291809082, + "learning_rate": 4.587856375999046e-05, + "loss": 0.3808, + "step": 7700 + }, + { + "epoch": 2.3039069490008948, + "grad_norm": 18.45660972595215, + "learning_rate": 4.6027675056662294e-05, + "loss": 0.4417, + "step": 7725 + }, + { + "epoch": 2.3113629585445867, + "grad_norm": 16.66852569580078, + "learning_rate": 4.6176786353334125e-05, + "loss": 0.4158, + "step": 7750 + }, + { + "epoch": 2.318818968088279, + "grad_norm": 19.178466796875, + "learning_rate": 4.632589765000597e-05, + "loss": 0.5195, + "step": 7775 + }, + { + "epoch": 2.3262749776319716, + "grad_norm": 66.8802490234375, + "learning_rate": 4.64750089466778e-05, + "loss": 0.4877, + "step": 7800 + }, + { + "epoch": 2.3337309871756635, + "grad_norm": 2.8636233806610107, + "learning_rate": 4.662412024334964e-05, + "loss": 0.3721, + "step": 7825 + }, + { + "epoch": 2.341186996719356, + "grad_norm": 11.779047012329102, + "learning_rate": 4.677323154002147e-05, + "loss": 0.406, + "step": 7850 + }, + { + "epoch": 2.348643006263048, + "grad_norm": 8.003138542175293, + "learning_rate": 4.692234283669331e-05, + "loss": 0.404, + "step": 7875 + }, + { + "epoch": 2.3560990158067403, + "grad_norm": 15.239642143249512, + "learning_rate": 4.707145413336515e-05, + "loss": 0.3539, + "step": 7900 + }, + { + "epoch": 2.3635550253504323, + "grad_norm": 6.205272197723389, + "learning_rate": 4.7220565430036985e-05, + "loss": 0.4542, + "step": 7925 + }, + { + "epoch": 2.3710110348941247, + "grad_norm": 15.588403701782227, + "learning_rate": 4.7369676726708816e-05, + "loss": 0.5109, + "step": 7950 + }, + { + "epoch": 2.3784670444378166, + "grad_norm": 10.954691886901855, + "learning_rate": 4.7518788023380654e-05, + "loss": 0.3682, + "step": 7975 + }, + { + "epoch": 2.385923053981509, + "grad_norm": 2.190718412399292, + "learning_rate": 4.766789932005249e-05, + "loss": 0.4104, + "step": 8000 + }, + { + "epoch": 2.3933790635252015, + "grad_norm": 2.808455467224121, + "learning_rate": 4.7817010616724323e-05, + "loss": 0.8046, + "step": 8025 + }, + { + "epoch": 2.4008350730688934, + "grad_norm": 17.908613204956055, + "learning_rate": 4.796612191339616e-05, + "loss": 0.6078, + "step": 8050 + }, + { + "epoch": 2.408291082612586, + "grad_norm": 28.843978881835938, + "learning_rate": 4.811523321006799e-05, + "loss": 0.3912, + "step": 8075 + }, + { + "epoch": 2.415747092156278, + "grad_norm": 31.94698715209961, + "learning_rate": 4.826434450673983e-05, + "loss": 0.452, + "step": 8100 + }, + { + "epoch": 2.42320310169997, + "grad_norm": 26.91057586669922, + "learning_rate": 4.841345580341167e-05, + "loss": 0.4344, + 
"step": 8125 + }, + { + "epoch": 2.4306591112436626, + "grad_norm": 10.383529663085938, + "learning_rate": 4.856256710008351e-05, + "loss": 0.4195, + "step": 8150 + }, + { + "epoch": 2.4381151207873546, + "grad_norm": 24.33693504333496, + "learning_rate": 4.871167839675534e-05, + "loss": 0.4018, + "step": 8175 + }, + { + "epoch": 2.445571130331047, + "grad_norm": 11.268268585205078, + "learning_rate": 4.8860789693427176e-05, + "loss": 0.4334, + "step": 8200 + }, + { + "epoch": 2.453027139874739, + "grad_norm": 41.07155990600586, + "learning_rate": 4.9009900990099014e-05, + "loss": 0.3055, + "step": 8225 + }, + { + "epoch": 2.4604831494184314, + "grad_norm": 10.522299766540527, + "learning_rate": 4.915901228677085e-05, + "loss": 0.4488, + "step": 8250 + }, + { + "epoch": 2.4679391589621233, + "grad_norm": 26.76223373413086, + "learning_rate": 4.9308123583442683e-05, + "loss": 0.3703, + "step": 8275 + }, + { + "epoch": 2.4753951685058158, + "grad_norm": 16.908048629760742, + "learning_rate": 4.945723488011452e-05, + "loss": 0.4244, + "step": 8300 + }, + { + "epoch": 2.4828511780495077, + "grad_norm": 20.12076759338379, + "learning_rate": 4.960634617678635e-05, + "loss": 0.4508, + "step": 8325 + }, + { + "epoch": 2.4903071875932, + "grad_norm": 60.24528503417969, + "learning_rate": 4.975545747345819e-05, + "loss": 0.4754, + "step": 8350 + }, + { + "epoch": 2.4977631971368925, + "grad_norm": 11.230194091796875, + "learning_rate": 4.990456877013003e-05, + "loss": 0.2251, + "step": 8375 + }, + { + "epoch": 2.5052192066805845, + "grad_norm": 7.123372554779053, + "learning_rate": 4.999403515283264e-05, + "loss": 0.528, + "step": 8400 + }, + { + "epoch": 2.512675216224277, + "grad_norm": 0.412019282579422, + "learning_rate": 4.997746613292331e-05, + "loss": 0.5279, + "step": 8425 + }, + { + "epoch": 2.520131225767969, + "grad_norm": 40.20630645751953, + "learning_rate": 4.996089711301397e-05, + "loss": 0.4045, + "step": 8450 + }, + { + "epoch": 2.5275872353116613, + "grad_norm": 71.36479949951172, + "learning_rate": 4.9944328093104634e-05, + "loss": 0.5081, + "step": 8475 + }, + { + "epoch": 2.5350432448553533, + "grad_norm": 30.50684356689453, + "learning_rate": 4.992775907319531e-05, + "loss": 0.322, + "step": 8500 + }, + { + "epoch": 2.5424992543990457, + "grad_norm": 2.659013271331787, + "learning_rate": 4.991119005328597e-05, + "loss": 0.3447, + "step": 8525 + }, + { + "epoch": 2.5499552639427376, + "grad_norm": 0.8650076985359192, + "learning_rate": 4.9894621033376634e-05, + "loss": 0.4414, + "step": 8550 + }, + { + "epoch": 2.55741127348643, + "grad_norm": 18.277727127075195, + "learning_rate": 4.9878052013467304e-05, + "loss": 0.5041, + "step": 8575 + }, + { + "epoch": 2.5648672830301225, + "grad_norm": 13.272942543029785, + "learning_rate": 4.9861482993557966e-05, + "loss": 0.4364, + "step": 8600 + }, + { + "epoch": 2.5723232925738144, + "grad_norm": 3.7234790325164795, + "learning_rate": 4.9844913973648635e-05, + "loss": 0.3505, + "step": 8625 + }, + { + "epoch": 2.579779302117507, + "grad_norm": 1.593865156173706, + "learning_rate": 4.98283449537393e-05, + "loss": 0.4215, + "step": 8650 + }, + { + "epoch": 2.587235311661199, + "grad_norm": 32.194034576416016, + "learning_rate": 4.981177593382996e-05, + "loss": 0.4747, + "step": 8675 + }, + { + "epoch": 2.594691321204891, + "grad_norm": 15.71013355255127, + "learning_rate": 4.979520691392063e-05, + "loss": 0.2939, + "step": 8700 + }, + { + "epoch": 2.6021473307485836, + "grad_norm": 13.419342041015625, + "learning_rate": 
4.97786378940113e-05, + "loss": 0.2265, + "step": 8725 + }, + { + "epoch": 2.6096033402922756, + "grad_norm": 12.071022033691406, + "learning_rate": 4.976206887410196e-05, + "loss": 0.3058, + "step": 8750 + }, + { + "epoch": 2.6170593498359676, + "grad_norm": 27.79282569885254, + "learning_rate": 4.974549985419263e-05, + "loss": 0.457, + "step": 8775 + }, + { + "epoch": 2.62451535937966, + "grad_norm": 36.58905792236328, + "learning_rate": 4.972893083428329e-05, + "loss": 0.464, + "step": 8800 + }, + { + "epoch": 2.6319713689233524, + "grad_norm": 32.43028259277344, + "learning_rate": 4.971236181437396e-05, + "loss": 0.437, + "step": 8825 + }, + { + "epoch": 2.6394273784670443, + "grad_norm": 3.9236373901367188, + "learning_rate": 4.969579279446462e-05, + "loss": 0.3969, + "step": 8850 + }, + { + "epoch": 2.6468833880107367, + "grad_norm": 20.850723266601562, + "learning_rate": 4.967922377455529e-05, + "loss": 0.4677, + "step": 8875 + }, + { + "epoch": 2.6543393975544287, + "grad_norm": 37.82166290283203, + "learning_rate": 4.9662654754645955e-05, + "loss": 0.442, + "step": 8900 + }, + { + "epoch": 2.661795407098121, + "grad_norm": 65.03778076171875, + "learning_rate": 4.964608573473662e-05, + "loss": 0.39, + "step": 8925 + }, + { + "epoch": 2.6692514166418135, + "grad_norm": 3.3093178272247314, + "learning_rate": 4.9629516714827286e-05, + "loss": 0.3141, + "step": 8950 + }, + { + "epoch": 2.6767074261855055, + "grad_norm": 0.09177004545927048, + "learning_rate": 4.9612947694917955e-05, + "loss": 0.2913, + "step": 8975 + }, + { + "epoch": 2.6841634357291975, + "grad_norm": 0.9577639698982239, + "learning_rate": 4.959637867500862e-05, + "loss": 0.2738, + "step": 9000 + }, + { + "epoch": 2.69161944527289, + "grad_norm": 26.145336151123047, + "learning_rate": 4.957980965509929e-05, + "loss": 0.1834, + "step": 9025 + }, + { + "epoch": 2.6990754548165823, + "grad_norm": 0.26785144209861755, + "learning_rate": 4.956324063518995e-05, + "loss": 0.4066, + "step": 9050 + }, + { + "epoch": 2.7065314643602743, + "grad_norm": 10.481317520141602, + "learning_rate": 4.954667161528062e-05, + "loss": 0.2305, + "step": 9075 + }, + { + "epoch": 2.7139874739039667, + "grad_norm": 13.711938858032227, + "learning_rate": 4.953010259537128e-05, + "loss": 0.3086, + "step": 9100 + }, + { + "epoch": 2.7214434834476586, + "grad_norm": 28.684236526489258, + "learning_rate": 4.951353357546194e-05, + "loss": 0.4683, + "step": 9125 + }, + { + "epoch": 2.728899492991351, + "grad_norm": 21.328371047973633, + "learning_rate": 4.949696455555261e-05, + "loss": 0.5316, + "step": 9150 + }, + { + "epoch": 2.7363555025350434, + "grad_norm": 0.19380249083042145, + "learning_rate": 4.9480395535643274e-05, + "loss": 0.3556, + "step": 9175 + }, + { + "epoch": 2.7438115120787354, + "grad_norm": 1.5455032587051392, + "learning_rate": 4.9463826515733943e-05, + "loss": 0.3856, + "step": 9200 + }, + { + "epoch": 2.751267521622428, + "grad_norm": 11.467935562133789, + "learning_rate": 4.944725749582461e-05, + "loss": 0.2703, + "step": 9225 + }, + { + "epoch": 2.75872353116612, + "grad_norm": 49.632301330566406, + "learning_rate": 4.9430688475915275e-05, + "loss": 0.4603, + "step": 9250 + }, + { + "epoch": 2.766179540709812, + "grad_norm": 1.1420247554779053, + "learning_rate": 4.9414119456005944e-05, + "loss": 0.4527, + "step": 9275 + }, + { + "epoch": 2.773635550253504, + "grad_norm": 29.728267669677734, + "learning_rate": 4.9397550436096606e-05, + "loss": 0.3515, + "step": 9300 + }, + { + "epoch": 2.7810915597971966, + "grad_norm": 
18.269168853759766, + "learning_rate": 4.938098141618727e-05, + "loss": 0.5242, + "step": 9325 + }, + { + "epoch": 2.7885475693408885, + "grad_norm": 18.85061264038086, + "learning_rate": 4.936441239627794e-05, + "loss": 0.4382, + "step": 9350 + }, + { + "epoch": 2.796003578884581, + "grad_norm": 1.695752501487732, + "learning_rate": 4.93478433763686e-05, + "loss": 0.3397, + "step": 9375 + }, + { + "epoch": 2.8034595884282734, + "grad_norm": 12.2448148727417, + "learning_rate": 4.933127435645927e-05, + "loss": 0.2823, + "step": 9400 + }, + { + "epoch": 2.8109155979719653, + "grad_norm": 10.061016082763672, + "learning_rate": 4.931470533654993e-05, + "loss": 0.2442, + "step": 9425 + }, + { + "epoch": 2.8183716075156577, + "grad_norm": 7.403264999389648, + "learning_rate": 4.92981363166406e-05, + "loss": 0.2298, + "step": 9450 + }, + { + "epoch": 2.8258276170593497, + "grad_norm": 25.536867141723633, + "learning_rate": 4.928156729673127e-05, + "loss": 0.3031, + "step": 9475 + }, + { + "epoch": 2.833283626603042, + "grad_norm": 32.47145462036133, + "learning_rate": 4.926499827682193e-05, + "loss": 0.3897, + "step": 9500 + }, + { + "epoch": 2.8407396361467345, + "grad_norm": 17.720930099487305, + "learning_rate": 4.92484292569126e-05, + "loss": 0.4836, + "step": 9525 + }, + { + "epoch": 2.8481956456904265, + "grad_norm": 21.164533615112305, + "learning_rate": 4.9231860237003264e-05, + "loss": 0.3634, + "step": 9550 + }, + { + "epoch": 2.8556516552341185, + "grad_norm": 4.224794864654541, + "learning_rate": 4.9215291217093926e-05, + "loss": 0.4028, + "step": 9575 + }, + { + "epoch": 2.863107664777811, + "grad_norm": 17.30121612548828, + "learning_rate": 4.9198722197184595e-05, + "loss": 0.363, + "step": 9600 + }, + { + "epoch": 2.8705636743215033, + "grad_norm": 0.28851985931396484, + "learning_rate": 4.918215317727526e-05, + "loss": 0.3385, + "step": 9625 + }, + { + "epoch": 2.8780196838651952, + "grad_norm": 0.16066333651542664, + "learning_rate": 4.9165584157365927e-05, + "loss": 0.2903, + "step": 9650 + }, + { + "epoch": 2.8854756934088877, + "grad_norm": 0.3555677533149719, + "learning_rate": 4.914901513745659e-05, + "loss": 0.3744, + "step": 9675 + }, + { + "epoch": 2.8929317029525796, + "grad_norm": 10.346195220947266, + "learning_rate": 4.913244611754726e-05, + "loss": 0.1898, + "step": 9700 + }, + { + "epoch": 2.900387712496272, + "grad_norm": 2.9600460529327393, + "learning_rate": 4.911587709763793e-05, + "loss": 0.5744, + "step": 9725 + }, + { + "epoch": 2.9078437220399644, + "grad_norm": 27.990514755249023, + "learning_rate": 4.909930807772859e-05, + "loss": 0.2074, + "step": 9750 + }, + { + "epoch": 2.9152997315836564, + "grad_norm": 21.985830307006836, + "learning_rate": 4.908273905781925e-05, + "loss": 0.2648, + "step": 9775 + }, + { + "epoch": 2.9227557411273484, + "grad_norm": 9.374412536621094, + "learning_rate": 4.906617003790992e-05, + "loss": 0.3699, + "step": 9800 + }, + { + "epoch": 2.930211750671041, + "grad_norm": 42.18376541137695, + "learning_rate": 4.904960101800058e-05, + "loss": 0.4782, + "step": 9825 + }, + { + "epoch": 2.937667760214733, + "grad_norm": 20.689416885375977, + "learning_rate": 4.903303199809125e-05, + "loss": 0.3298, + "step": 9850 + }, + { + "epoch": 2.945123769758425, + "grad_norm": 3.0027916431427, + "learning_rate": 4.9016462978181915e-05, + "loss": 0.2212, + "step": 9875 + }, + { + "epoch": 2.9525797793021176, + "grad_norm": 25.889209747314453, + "learning_rate": 4.899989395827258e-05, + "loss": 0.4582, + "step": 9900 + }, + { + "epoch": 
2.9600357888458095, + "grad_norm": 7.678776264190674, + "learning_rate": 4.898332493836325e-05, + "loss": 0.3316, + "step": 9925 + }, + { + "epoch": 2.967491798389502, + "grad_norm": 20.867313385009766, + "learning_rate": 4.8966755918453915e-05, + "loss": 0.1974, + "step": 9950 + }, + { + "epoch": 2.9749478079331944, + "grad_norm": 0.7414972186088562, + "learning_rate": 4.895018689854458e-05, + "loss": 0.2738, + "step": 9975 + }, + { + "epoch": 2.9824038174768863, + "grad_norm": 20.320951461791992, + "learning_rate": 4.893361787863525e-05, + "loss": 0.4032, + "step": 10000 + }, + { + "epoch": 2.9898598270205787, + "grad_norm": 16.26959800720215, + "learning_rate": 4.891704885872591e-05, + "loss": 0.3909, + "step": 10025 + }, + { + "epoch": 2.9973158365642707, + "grad_norm": 6.262996673583984, + "learning_rate": 4.890047983881658e-05, + "loss": 0.2618, + "step": 10050 + }, + { + "epoch": 3.0, + "eval_gen_len": 8.7299, + "eval_loss": 0.32486870884895325, + "eval_rouge1": 84.5703, + "eval_rouge2": 68.4496, + "eval_rougeL": 84.3448, + "eval_rougeLsum": 84.3457, + "eval_runtime": 97.2972, + "eval_samples_per_second": 17.236, + "eval_steps_per_second": 4.317, + "step": 10059 + }, + { + "epoch": 3.004771846107963, + "grad_norm": 15.485138893127441, + "learning_rate": 4.888391081890724e-05, + "loss": 0.1829, + "step": 10075 + }, + { + "epoch": 3.012227855651655, + "grad_norm": 27.907909393310547, + "learning_rate": 4.886734179899791e-05, + "loss": 0.2219, + "step": 10100 + }, + { + "epoch": 3.0196838651953475, + "grad_norm": 21.688621520996094, + "learning_rate": 4.885077277908857e-05, + "loss": 0.3546, + "step": 10125 + }, + { + "epoch": 3.0271398747390394, + "grad_norm": 13.243431091308594, + "learning_rate": 4.8834203759179234e-05, + "loss": 0.2267, + "step": 10150 + }, + { + "epoch": 3.034595884282732, + "grad_norm": 3.4912607669830322, + "learning_rate": 4.881763473926991e-05, + "loss": 0.2021, + "step": 10175 + }, + { + "epoch": 3.0420518938264243, + "grad_norm": 24.04950714111328, + "learning_rate": 4.880106571936057e-05, + "loss": 0.2818, + "step": 10200 + }, + { + "epoch": 3.0495079033701162, + "grad_norm": 7.100139141082764, + "learning_rate": 4.8784496699451235e-05, + "loss": 0.3174, + "step": 10225 + }, + { + "epoch": 3.0569639129138086, + "grad_norm": 4.30832576751709, + "learning_rate": 4.8767927679541904e-05, + "loss": 0.2278, + "step": 10250 + }, + { + "epoch": 3.0644199224575006, + "grad_norm": 18.782655715942383, + "learning_rate": 4.8751358659632566e-05, + "loss": 0.2695, + "step": 10275 + }, + { + "epoch": 3.071875932001193, + "grad_norm": 33.604618072509766, + "learning_rate": 4.8734789639723236e-05, + "loss": 0.2521, + "step": 10300 + }, + { + "epoch": 3.079331941544885, + "grad_norm": 3.4777698516845703, + "learning_rate": 4.87182206198139e-05, + "loss": 0.1609, + "step": 10325 + }, + { + "epoch": 3.0867879510885774, + "grad_norm": 0.6923168301582336, + "learning_rate": 4.870165159990456e-05, + "loss": 0.2375, + "step": 10350 + }, + { + "epoch": 3.09424396063227, + "grad_norm": 2.32352352142334, + "learning_rate": 4.868508257999523e-05, + "loss": 0.1933, + "step": 10375 + }, + { + "epoch": 3.1016999701759618, + "grad_norm": 0.3760017156600952, + "learning_rate": 4.86685135600859e-05, + "loss": 0.159, + "step": 10400 + }, + { + "epoch": 3.109155979719654, + "grad_norm": 18.588369369506836, + "learning_rate": 4.865194454017656e-05, + "loss": 0.276, + "step": 10425 + }, + { + "epoch": 3.116611989263346, + "grad_norm": 22.050600051879883, + "learning_rate": 
4.863537552026723e-05, + "loss": 0.2279, + "step": 10450 + }, + { + "epoch": 3.1240679988070386, + "grad_norm": 63.09767150878906, + "learning_rate": 4.861880650035789e-05, + "loss": 0.2141, + "step": 10475 + }, + { + "epoch": 3.1315240083507305, + "grad_norm": 11.172486305236816, + "learning_rate": 4.860223748044856e-05, + "loss": 0.1474, + "step": 10500 + }, + { + "epoch": 3.138980017894423, + "grad_norm": 13.7357816696167, + "learning_rate": 4.8585668460539224e-05, + "loss": 0.2779, + "step": 10525 + }, + { + "epoch": 3.1464360274381153, + "grad_norm": 13.144248962402344, + "learning_rate": 4.8569099440629886e-05, + "loss": 0.2409, + "step": 10550 + }, + { + "epoch": 3.1538920369818073, + "grad_norm": 0.10206674039363861, + "learning_rate": 4.8552530420720555e-05, + "loss": 0.3604, + "step": 10575 + }, + { + "epoch": 3.1613480465254997, + "grad_norm": 13.896751403808594, + "learning_rate": 4.853596140081122e-05, + "loss": 0.3053, + "step": 10600 + }, + { + "epoch": 3.1688040560691917, + "grad_norm": 4.51378870010376, + "learning_rate": 4.851939238090189e-05, + "loss": 0.195, + "step": 10625 + }, + { + "epoch": 3.176260065612884, + "grad_norm": 41.94772720336914, + "learning_rate": 4.8502823360992556e-05, + "loss": 0.2235, + "step": 10650 + }, + { + "epoch": 3.183716075156576, + "grad_norm": 0.20179374516010284, + "learning_rate": 4.848625434108322e-05, + "loss": 0.1965, + "step": 10675 + }, + { + "epoch": 3.1911720847002685, + "grad_norm": 0.6293454766273499, + "learning_rate": 4.846968532117389e-05, + "loss": 0.1513, + "step": 10700 + }, + { + "epoch": 3.1986280942439604, + "grad_norm": 3.8151888847351074, + "learning_rate": 4.845311630126455e-05, + "loss": 0.2603, + "step": 10725 + }, + { + "epoch": 3.206084103787653, + "grad_norm": 53.60226058959961, + "learning_rate": 4.843654728135522e-05, + "loss": 0.212, + "step": 10750 + }, + { + "epoch": 3.2135401133313453, + "grad_norm": 39.32175827026367, + "learning_rate": 4.841997826144588e-05, + "loss": 0.2588, + "step": 10775 + }, + { + "epoch": 3.2209961228750372, + "grad_norm": 4.778318881988525, + "learning_rate": 4.8403409241536543e-05, + "loss": 0.2521, + "step": 10800 + }, + { + "epoch": 3.2284521324187296, + "grad_norm": 16.796152114868164, + "learning_rate": 4.838684022162721e-05, + "loss": 0.1909, + "step": 10825 + }, + { + "epoch": 3.2359081419624216, + "grad_norm": 0.5204095840454102, + "learning_rate": 4.8370271201717875e-05, + "loss": 0.1942, + "step": 10850 + }, + { + "epoch": 3.243364151506114, + "grad_norm": 15.695696830749512, + "learning_rate": 4.8353702181808544e-05, + "loss": 0.2888, + "step": 10875 + }, + { + "epoch": 3.250820161049806, + "grad_norm": 24.48253059387207, + "learning_rate": 4.833713316189921e-05, + "loss": 0.3286, + "step": 10900 + }, + { + "epoch": 3.2582761705934984, + "grad_norm": 0.22145067155361176, + "learning_rate": 4.8320564141989875e-05, + "loss": 0.2459, + "step": 10925 + }, + { + "epoch": 3.2657321801371904, + "grad_norm": 23.265213012695312, + "learning_rate": 4.8303995122080545e-05, + "loss": 0.273, + "step": 10950 + }, + { + "epoch": 3.2731881896808828, + "grad_norm": 14.237789154052734, + "learning_rate": 4.828742610217121e-05, + "loss": 0.1217, + "step": 10975 + }, + { + "epoch": 3.280644199224575, + "grad_norm": 11.19775676727295, + "learning_rate": 4.827085708226187e-05, + "loss": 0.2509, + "step": 11000 + }, + { + "epoch": 3.288100208768267, + "grad_norm": 12.824684143066406, + "learning_rate": 4.825428806235254e-05, + "loss": 0.2533, + "step": 11025 + }, + { + "epoch": 
3.2955562183119596, + "grad_norm": 0.6048849821090698, + "learning_rate": 4.82377190424432e-05, + "loss": 0.2427, + "step": 11050 + }, + { + "epoch": 3.3030122278556515, + "grad_norm": 18.854202270507812, + "learning_rate": 4.822115002253387e-05, + "loss": 0.2239, + "step": 11075 + }, + { + "epoch": 3.310468237399344, + "grad_norm": 1.0037951469421387, + "learning_rate": 4.820458100262453e-05, + "loss": 0.2323, + "step": 11100 + }, + { + "epoch": 3.317924246943036, + "grad_norm": 12.175690650939941, + "learning_rate": 4.81880119827152e-05, + "loss": 0.1916, + "step": 11125 + }, + { + "epoch": 3.3253802564867283, + "grad_norm": 3.9193851947784424, + "learning_rate": 4.817144296280587e-05, + "loss": 0.1596, + "step": 11150 + }, + { + "epoch": 3.3328362660304203, + "grad_norm": 1.2663558721542358, + "learning_rate": 4.815487394289653e-05, + "loss": 0.2768, + "step": 11175 + }, + { + "epoch": 3.3402922755741127, + "grad_norm": 1.7071452140808105, + "learning_rate": 4.81383049229872e-05, + "loss": 0.33, + "step": 11200 + }, + { + "epoch": 3.347748285117805, + "grad_norm": 27.484474182128906, + "learning_rate": 4.8121735903077864e-05, + "loss": 0.4431, + "step": 11225 + }, + { + "epoch": 3.355204294661497, + "grad_norm": 30.908485412597656, + "learning_rate": 4.8105166883168527e-05, + "loss": 0.1375, + "step": 11250 + }, + { + "epoch": 3.3626603042051895, + "grad_norm": 15.244987487792969, + "learning_rate": 4.8088597863259196e-05, + "loss": 0.1436, + "step": 11275 + }, + { + "epoch": 3.3701163137488814, + "grad_norm": 5.141242504119873, + "learning_rate": 4.807202884334986e-05, + "loss": 0.2911, + "step": 11300 + }, + { + "epoch": 3.377572323292574, + "grad_norm": 5.202120304107666, + "learning_rate": 4.805545982344053e-05, + "loss": 0.1735, + "step": 11325 + }, + { + "epoch": 3.3850283328362663, + "grad_norm": 0.155669167637825, + "learning_rate": 4.803889080353119e-05, + "loss": 0.1395, + "step": 11350 + }, + { + "epoch": 3.392484342379958, + "grad_norm": 37.31657791137695, + "learning_rate": 4.802232178362186e-05, + "loss": 0.2422, + "step": 11375 + }, + { + "epoch": 3.3999403519236506, + "grad_norm": 1.6722973585128784, + "learning_rate": 4.800575276371253e-05, + "loss": 0.1709, + "step": 11400 + }, + { + "epoch": 3.4073963614673426, + "grad_norm": 41.64527130126953, + "learning_rate": 4.798918374380319e-05, + "loss": 0.2258, + "step": 11425 + }, + { + "epoch": 3.414852371011035, + "grad_norm": 12.723953247070312, + "learning_rate": 4.797261472389385e-05, + "loss": 0.1889, + "step": 11450 + }, + { + "epoch": 3.422308380554727, + "grad_norm": 35.9575080871582, + "learning_rate": 4.795604570398452e-05, + "loss": 0.18, + "step": 11475 + }, + { + "epoch": 3.4297643900984194, + "grad_norm": 1.8803658485412598, + "learning_rate": 4.7939476684075184e-05, + "loss": 0.3035, + "step": 11500 + }, + { + "epoch": 3.4372203996421113, + "grad_norm": 0.22066275775432587, + "learning_rate": 4.792290766416585e-05, + "loss": 0.1552, + "step": 11525 + }, + { + "epoch": 3.4446764091858038, + "grad_norm": 2.5760536193847656, + "learning_rate": 4.7906338644256515e-05, + "loss": 0.1945, + "step": 11550 + }, + { + "epoch": 3.452132418729496, + "grad_norm": 19.55931282043457, + "learning_rate": 4.788976962434718e-05, + "loss": 0.3879, + "step": 11575 + }, + { + "epoch": 3.459588428273188, + "grad_norm": 0.1631491780281067, + "learning_rate": 4.7873200604437854e-05, + "loss": 0.184, + "step": 11600 + }, + { + "epoch": 3.4670444378168805, + "grad_norm": 16.460819244384766, + "learning_rate": 4.7856631584528516e-05, + 
"loss": 0.2637, + "step": 11625 + }, + { + "epoch": 3.4745004473605725, + "grad_norm": 14.154874801635742, + "learning_rate": 4.784006256461918e-05, + "loss": 0.2471, + "step": 11650 + }, + { + "epoch": 3.481956456904265, + "grad_norm": 28.12816047668457, + "learning_rate": 4.782349354470985e-05, + "loss": 0.2432, + "step": 11675 + }, + { + "epoch": 3.489412466447957, + "grad_norm": 4.544245719909668, + "learning_rate": 4.780692452480051e-05, + "loss": 0.1726, + "step": 11700 + }, + { + "epoch": 3.4968684759916493, + "grad_norm": 37.361698150634766, + "learning_rate": 4.779035550489118e-05, + "loss": 0.2676, + "step": 11725 + }, + { + "epoch": 3.5043244855353413, + "grad_norm": 0.2849110960960388, + "learning_rate": 4.777378648498184e-05, + "loss": 0.1267, + "step": 11750 + }, + { + "epoch": 3.5117804950790337, + "grad_norm": 13.191654205322266, + "learning_rate": 4.7757217465072504e-05, + "loss": 0.261, + "step": 11775 + }, + { + "epoch": 3.519236504622726, + "grad_norm": 17.304588317871094, + "learning_rate": 4.774064844516317e-05, + "loss": 0.1624, + "step": 11800 + }, + { + "epoch": 3.526692514166418, + "grad_norm": 6.902221202850342, + "learning_rate": 4.7724079425253835e-05, + "loss": 0.2299, + "step": 11825 + }, + { + "epoch": 3.5341485237101105, + "grad_norm": 29.891098022460938, + "learning_rate": 4.770751040534451e-05, + "loss": 0.2008, + "step": 11850 + }, + { + "epoch": 3.5416045332538024, + "grad_norm": 1.6863594055175781, + "learning_rate": 4.769094138543517e-05, + "loss": 0.1565, + "step": 11875 + }, + { + "epoch": 3.549060542797495, + "grad_norm": 16.029647827148438, + "learning_rate": 4.7674372365525836e-05, + "loss": 0.2082, + "step": 11900 + }, + { + "epoch": 3.5565165523411872, + "grad_norm": 5.520860195159912, + "learning_rate": 4.7657803345616505e-05, + "loss": 0.2113, + "step": 11925 + }, + { + "epoch": 3.563972561884879, + "grad_norm": 10.906929016113281, + "learning_rate": 4.764123432570717e-05, + "loss": 0.2652, + "step": 11950 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 17.15738868713379, + "learning_rate": 4.7624665305797836e-05, + "loss": 0.1654, + "step": 11975 + }, + { + "epoch": 3.5788845809722636, + "grad_norm": 3.8852152824401855, + "learning_rate": 4.76080962858885e-05, + "loss": 0.3021, + "step": 12000 + }, + { + "epoch": 3.586340590515956, + "grad_norm": 3.2705817222595215, + "learning_rate": 4.759152726597916e-05, + "loss": 0.2207, + "step": 12025 + }, + { + "epoch": 3.593796600059648, + "grad_norm": 2.1786513328552246, + "learning_rate": 4.757495824606983e-05, + "loss": 0.2425, + "step": 12050 + }, + { + "epoch": 3.6012526096033404, + "grad_norm": 1.1399081945419312, + "learning_rate": 4.75583892261605e-05, + "loss": 0.1534, + "step": 12075 + }, + { + "epoch": 3.6087086191470323, + "grad_norm": 56.90084457397461, + "learning_rate": 4.754182020625116e-05, + "loss": 0.2596, + "step": 12100 + }, + { + "epoch": 3.6161646286907247, + "grad_norm": 14.946979522705078, + "learning_rate": 4.752525118634183e-05, + "loss": 0.3018, + "step": 12125 + }, + { + "epoch": 3.623620638234417, + "grad_norm": 4.24704122543335, + "learning_rate": 4.750868216643249e-05, + "loss": 0.2398, + "step": 12150 + }, + { + "epoch": 3.631076647778109, + "grad_norm": 1.877937912940979, + "learning_rate": 4.749211314652316e-05, + "loss": 0.2393, + "step": 12175 + }, + { + "epoch": 3.6385326573218015, + "grad_norm": 13.878131866455078, + "learning_rate": 4.7475544126613824e-05, + "loss": 0.269, + "step": 12200 + }, + { + "epoch": 3.6459886668654935, + "grad_norm": 
54.240055084228516, + "learning_rate": 4.745897510670449e-05, + "loss": 0.1789, + "step": 12225 + }, + { + "epoch": 3.653444676409186, + "grad_norm": 50.91316604614258, + "learning_rate": 4.7442406086795156e-05, + "loss": 0.2266, + "step": 12250 + }, + { + "epoch": 3.660900685952878, + "grad_norm": Infinity, + "learning_rate": 4.7426499827682196e-05, + "loss": 0.2616, + "step": 12275 + }, + { + "epoch": 3.6683566954965703, + "grad_norm": 18.687328338623047, + "learning_rate": 4.7409930807772865e-05, + "loss": 0.2016, + "step": 12300 + }, + { + "epoch": 3.6758127050402623, + "grad_norm": 0.21909315884113312, + "learning_rate": 4.739336178786353e-05, + "loss": 0.1519, + "step": 12325 + }, + { + "epoch": 3.6832687145839547, + "grad_norm": 31.093547821044922, + "learning_rate": 4.737679276795419e-05, + "loss": 0.1486, + "step": 12350 + }, + { + "epoch": 3.690724724127647, + "grad_norm": 19.91496467590332, + "learning_rate": 4.736022374804486e-05, + "loss": 0.1421, + "step": 12375 + }, + { + "epoch": 3.698180733671339, + "grad_norm": 0.031140485778450966, + "learning_rate": 4.734365472813552e-05, + "loss": 0.174, + "step": 12400 + }, + { + "epoch": 3.7056367432150314, + "grad_norm": 0.5245652198791504, + "learning_rate": 4.732708570822619e-05, + "loss": 0.1967, + "step": 12425 + }, + { + "epoch": 3.7130927527587234, + "grad_norm": 9.509659767150879, + "learning_rate": 4.731051668831685e-05, + "loss": 0.3401, + "step": 12450 + }, + { + "epoch": 3.720548762302416, + "grad_norm": 0.24189208447933197, + "learning_rate": 4.729394766840752e-05, + "loss": 0.1994, + "step": 12475 + }, + { + "epoch": 3.7280047718461082, + "grad_norm": 7.9083733558654785, + "learning_rate": 4.727737864849819e-05, + "loss": 0.165, + "step": 12500 + }, + { + "epoch": 3.7354607813898, + "grad_norm": 13.54222297668457, + "learning_rate": 4.7260809628588854e-05, + "loss": 0.2339, + "step": 12525 + }, + { + "epoch": 3.742916790933492, + "grad_norm": 0.5903582572937012, + "learning_rate": 4.7244240608679516e-05, + "loss": 0.2427, + "step": 12550 + }, + { + "epoch": 3.7503728004771846, + "grad_norm": 2.730581521987915, + "learning_rate": 4.7227671588770185e-05, + "loss": 0.1408, + "step": 12575 + }, + { + "epoch": 3.757828810020877, + "grad_norm": 0.4194844365119934, + "learning_rate": 4.721110256886085e-05, + "loss": 0.2342, + "step": 12600 + }, + { + "epoch": 3.765284819564569, + "grad_norm": 20.00274658203125, + "learning_rate": 4.7194533548951517e-05, + "loss": 0.0921, + "step": 12625 + }, + { + "epoch": 3.7727408291082614, + "grad_norm": 0.035322315990924835, + "learning_rate": 4.717796452904218e-05, + "loss": 0.1742, + "step": 12650 + }, + { + "epoch": 3.7801968386519533, + "grad_norm": 15.161859512329102, + "learning_rate": 4.716139550913284e-05, + "loss": 0.185, + "step": 12675 + }, + { + "epoch": 3.7876528481956457, + "grad_norm": 13.298684120178223, + "learning_rate": 4.714482648922351e-05, + "loss": 0.1626, + "step": 12700 + }, + { + "epoch": 3.795108857739338, + "grad_norm": 14.108867645263672, + "learning_rate": 4.712825746931418e-05, + "loss": 0.2064, + "step": 12725 + }, + { + "epoch": 3.80256486728303, + "grad_norm": 6.5551886558532715, + "learning_rate": 4.711168844940484e-05, + "loss": 0.1743, + "step": 12750 + }, + { + "epoch": 3.810020876826722, + "grad_norm": 1.9325826168060303, + "learning_rate": 4.709511942949551e-05, + "loss": 0.2071, + "step": 12775 + }, + { + "epoch": 3.8174768863704145, + "grad_norm": 9.065774917602539, + "learning_rate": 4.707855040958617e-05, + "loss": 0.2417, + "step": 12800 + }, + 
{ + "epoch": 3.824932895914107, + "grad_norm": 16.085844039916992, + "learning_rate": 4.706198138967684e-05, + "loss": 0.1279, + "step": 12825 + }, + { + "epoch": 3.832388905457799, + "grad_norm": 19.756853103637695, + "learning_rate": 4.7045412369767505e-05, + "loss": 0.2925, + "step": 12850 + }, + { + "epoch": 3.8398449150014913, + "grad_norm": 0.015781521797180176, + "learning_rate": 4.702884334985817e-05, + "loss": 0.2662, + "step": 12875 + }, + { + "epoch": 3.8473009245451832, + "grad_norm": 0.3410816192626953, + "learning_rate": 4.7012274329948836e-05, + "loss": 0.2415, + "step": 12900 + }, + { + "epoch": 3.8547569340888757, + "grad_norm": 10.625577926635742, + "learning_rate": 4.69957053100395e-05, + "loss": 0.1441, + "step": 12925 + }, + { + "epoch": 3.862212943632568, + "grad_norm": 28.55473518371582, + "learning_rate": 4.697913629013017e-05, + "loss": 0.2246, + "step": 12950 + }, + { + "epoch": 3.86966895317626, + "grad_norm": 18.04960060119629, + "learning_rate": 4.696256727022084e-05, + "loss": 0.1946, + "step": 12975 + }, + { + "epoch": 3.8771249627199524, + "grad_norm": 12.809388160705566, + "learning_rate": 4.69459982503115e-05, + "loss": 0.1689, + "step": 13000 + }, + { + "epoch": 3.8845809722636444, + "grad_norm": 12.735445022583008, + "learning_rate": 4.692942923040217e-05, + "loss": 0.3456, + "step": 13025 + }, + { + "epoch": 3.892036981807337, + "grad_norm": 13.631649017333984, + "learning_rate": 4.691286021049283e-05, + "loss": 0.1507, + "step": 13050 + }, + { + "epoch": 3.899492991351029, + "grad_norm": 6.858339786529541, + "learning_rate": 4.68962911905835e-05, + "loss": 0.1571, + "step": 13075 + }, + { + "epoch": 3.906949000894721, + "grad_norm": 0.8234581351280212, + "learning_rate": 4.687972217067416e-05, + "loss": 0.0485, + "step": 13100 + }, + { + "epoch": 3.914405010438413, + "grad_norm": 54.14894104003906, + "learning_rate": 4.6863153150764824e-05, + "loss": 0.1594, + "step": 13125 + }, + { + "epoch": 3.9218610199821056, + "grad_norm": 0.19911998510360718, + "learning_rate": 4.6846584130855493e-05, + "loss": 0.2208, + "step": 13150 + }, + { + "epoch": 3.929317029525798, + "grad_norm": 12.671158790588379, + "learning_rate": 4.6830015110946156e-05, + "loss": 0.1298, + "step": 13175 + }, + { + "epoch": 3.93677303906949, + "grad_norm": 5.566137313842773, + "learning_rate": 4.6813446091036825e-05, + "loss": 0.2675, + "step": 13200 + }, + { + "epoch": 3.9442290486131824, + "grad_norm": 3.0645034313201904, + "learning_rate": 4.6796877071127494e-05, + "loss": 0.1215, + "step": 13225 + }, + { + "epoch": 3.9516850581568743, + "grad_norm": 0.5267123579978943, + "learning_rate": 4.6780308051218156e-05, + "loss": 0.1125, + "step": 13250 + }, + { + "epoch": 3.9591410677005667, + "grad_norm": 6.624309062957764, + "learning_rate": 4.6763739031308826e-05, + "loss": 0.197, + "step": 13275 + }, + { + "epoch": 3.966597077244259, + "grad_norm": 15.308833122253418, + "learning_rate": 4.674717001139949e-05, + "loss": 0.0796, + "step": 13300 + }, + { + "epoch": 3.974053086787951, + "grad_norm": 1.9672225713729858, + "learning_rate": 4.673060099149015e-05, + "loss": 0.3087, + "step": 13325 + }, + { + "epoch": 3.981509096331643, + "grad_norm": 9.752317428588867, + "learning_rate": 4.671403197158082e-05, + "loss": 0.1106, + "step": 13350 + }, + { + "epoch": 3.9889651058753355, + "grad_norm": 69.8141860961914, + "learning_rate": 4.669746295167148e-05, + "loss": 0.1566, + "step": 13375 + }, + { + "epoch": 3.996421115419028, + "grad_norm": 18.129182815551758, + "learning_rate": 
4.668089393176215e-05, + "loss": 0.1422, + "step": 13400 + }, + { + "epoch": 4.0, + "eval_gen_len": 8.8205, + "eval_loss": 0.22952935099601746, + "eval_rouge1": 89.3626, + "eval_rouge2": 74.5509, + "eval_rougeL": 89.1898, + "eval_rougeLsum": 89.2039, + "eval_runtime": 105.2444, + "eval_samples_per_second": 15.934, + "eval_steps_per_second": 3.991, + "step": 13412 + }, + { + "epoch": 4.00387712496272, + "grad_norm": 4.1664204597473145, + "learning_rate": 4.666432491185281e-05, + "loss": 0.0988, + "step": 13425 + }, + { + "epoch": 4.011333134506412, + "grad_norm": 1.5676534175872803, + "learning_rate": 4.664775589194348e-05, + "loss": 0.0775, + "step": 13450 + }, + { + "epoch": 4.018789144050104, + "grad_norm": 1.9181686639785767, + "learning_rate": 4.663118687203415e-05, + "loss": 0.1013, + "step": 13475 + }, + { + "epoch": 4.026245153593797, + "grad_norm": 37.71769332885742, + "learning_rate": 4.6614617852124814e-05, + "loss": 0.1236, + "step": 13500 + }, + { + "epoch": 4.033701163137489, + "grad_norm": 12.9027099609375, + "learning_rate": 4.659804883221548e-05, + "loss": 0.2319, + "step": 13525 + }, + { + "epoch": 4.0411571726811815, + "grad_norm": 0.3448346257209778, + "learning_rate": 4.6581479812306145e-05, + "loss": 0.1614, + "step": 13550 + }, + { + "epoch": 4.048613182224873, + "grad_norm": 0.2613756060600281, + "learning_rate": 4.656491079239681e-05, + "loss": 0.0753, + "step": 13575 + }, + { + "epoch": 4.056069191768565, + "grad_norm": 12.620153427124023, + "learning_rate": 4.6549004533283855e-05, + "loss": 0.1857, + "step": 13600 + }, + { + "epoch": 4.063525201312258, + "grad_norm": 49.89236068725586, + "learning_rate": 4.653243551337452e-05, + "loss": 0.2494, + "step": 13625 + }, + { + "epoch": 4.07098121085595, + "grad_norm": 0.03416460007429123, + "learning_rate": 4.651586649346518e-05, + "loss": 0.1864, + "step": 13650 + }, + { + "epoch": 4.078437220399642, + "grad_norm": 23.923843383789062, + "learning_rate": 4.649929747355585e-05, + "loss": 0.1432, + "step": 13675 + }, + { + "epoch": 4.085893229943334, + "grad_norm": 26.50322723388672, + "learning_rate": 4.648272845364651e-05, + "loss": 0.0544, + "step": 13700 + }, + { + "epoch": 4.093349239487027, + "grad_norm": 2.477444887161255, + "learning_rate": 4.646615943373718e-05, + "loss": 0.0846, + "step": 13725 + }, + { + "epoch": 4.100805249030719, + "grad_norm": 4.058847904205322, + "learning_rate": 4.644959041382784e-05, + "loss": 0.1216, + "step": 13750 + }, + { + "epoch": 4.108261258574411, + "grad_norm": 22.271705627441406, + "learning_rate": 4.6433021393918505e-05, + "loss": 0.1599, + "step": 13775 + }, + { + "epoch": 4.115717268118103, + "grad_norm": 14.830306053161621, + "learning_rate": 4.6416452374009174e-05, + "loss": 0.1047, + "step": 13800 + }, + { + "epoch": 4.123173277661795, + "grad_norm": 20.900365829467773, + "learning_rate": 4.6399883354099836e-05, + "loss": 0.2165, + "step": 13825 + }, + { + "epoch": 4.130629287205488, + "grad_norm": 0.04809415712952614, + "learning_rate": 4.6383314334190505e-05, + "loss": 0.0797, + "step": 13850 + }, + { + "epoch": 4.13808529674918, + "grad_norm": 2.7082746028900146, + "learning_rate": 4.6366745314281174e-05, + "loss": 0.2539, + "step": 13875 + }, + { + "epoch": 4.145541306292872, + "grad_norm": 5.5213823318481445, + "learning_rate": 4.635017629437184e-05, + "loss": 0.0967, + "step": 13900 + }, + { + "epoch": 4.152997315836564, + "grad_norm": 0.5523321628570557, + "learning_rate": 4.6333607274462506e-05, + "loss": 0.115, + "step": 13925 + }, + { + "epoch": 
4.1604533253802565, + "grad_norm": 2.0624325275421143, + "learning_rate": 4.631703825455317e-05, + "loss": 0.1811, + "step": 13950 + }, + { + "epoch": 4.167909334923949, + "grad_norm": 0.551288366317749, + "learning_rate": 4.630046923464384e-05, + "loss": 0.1378, + "step": 13975 + }, + { + "epoch": 4.175365344467641, + "grad_norm": 0.5739544034004211, + "learning_rate": 4.62839002147345e-05, + "loss": 0.0912, + "step": 14000 + }, + { + "epoch": 4.182821354011333, + "grad_norm": 23.730623245239258, + "learning_rate": 4.626733119482516e-05, + "loss": 0.2853, + "step": 14025 + }, + { + "epoch": 4.190277363555025, + "grad_norm": 10.519808769226074, + "learning_rate": 4.625076217491583e-05, + "loss": 0.1026, + "step": 14050 + }, + { + "epoch": 4.197733373098718, + "grad_norm": 0.21397335827350616, + "learning_rate": 4.62341931550065e-05, + "loss": 0.0714, + "step": 14075 + }, + { + "epoch": 4.20518938264241, + "grad_norm": 1.646802306175232, + "learning_rate": 4.621762413509716e-05, + "loss": 0.1204, + "step": 14100 + }, + { + "epoch": 4.2126453921861025, + "grad_norm": 11.778765678405762, + "learning_rate": 4.620105511518783e-05, + "loss": 0.092, + "step": 14125 + }, + { + "epoch": 4.220101401729794, + "grad_norm": 22.594240188598633, + "learning_rate": 4.6184486095278494e-05, + "loss": 0.1312, + "step": 14150 + }, + { + "epoch": 4.227557411273486, + "grad_norm": 0.8660998940467834, + "learning_rate": 4.616791707536916e-05, + "loss": 0.1609, + "step": 14175 + }, + { + "epoch": 4.235013420817179, + "grad_norm": 31.39341926574707, + "learning_rate": 4.6151348055459826e-05, + "loss": 0.2667, + "step": 14200 + }, + { + "epoch": 4.242469430360871, + "grad_norm": 0.04638830944895744, + "learning_rate": 4.613477903555049e-05, + "loss": 0.1241, + "step": 14225 + }, + { + "epoch": 4.249925439904563, + "grad_norm": 1.1062257289886475, + "learning_rate": 4.611821001564116e-05, + "loss": 0.1065, + "step": 14250 + }, + { + "epoch": 4.257381449448255, + "grad_norm": 1.4014554023742676, + "learning_rate": 4.610164099573182e-05, + "loss": 0.117, + "step": 14275 + }, + { + "epoch": 4.2648374589919475, + "grad_norm": 10.264939308166504, + "learning_rate": 4.608507197582249e-05, + "loss": 0.1761, + "step": 14300 + }, + { + "epoch": 4.27229346853564, + "grad_norm": 5.444455623626709, + "learning_rate": 4.606850295591316e-05, + "loss": 0.1822, + "step": 14325 + }, + { + "epoch": 4.2797494780793315, + "grad_norm": 0.25484806299209595, + "learning_rate": 4.605193393600382e-05, + "loss": 0.0569, + "step": 14350 + }, + { + "epoch": 4.287205487623024, + "grad_norm": 13.141998291015625, + "learning_rate": 4.603536491609449e-05, + "loss": 0.1328, + "step": 14375 + }, + { + "epoch": 4.294661497166716, + "grad_norm": 0.047426436096429825, + "learning_rate": 4.601879589618515e-05, + "loss": 0.1656, + "step": 14400 + }, + { + "epoch": 4.302117506710409, + "grad_norm": 14.1134614944458, + "learning_rate": 4.6002226876275814e-05, + "loss": 0.1843, + "step": 14425 + }, + { + "epoch": 4.309573516254101, + "grad_norm": 0.0767190232872963, + "learning_rate": 4.598565785636648e-05, + "loss": 0.1311, + "step": 14450 + }, + { + "epoch": 4.317029525797793, + "grad_norm": 1.5797250270843506, + "learning_rate": 4.5969088836457145e-05, + "loss": 0.1161, + "step": 14475 + }, + { + "epoch": 4.324485535341485, + "grad_norm": 0.14433561265468597, + "learning_rate": 4.5952519816547814e-05, + "loss": 0.1597, + "step": 14500 + }, + { + "epoch": 4.3319415448851775, + "grad_norm": 18.34807014465332, + "learning_rate": 4.593595079663848e-05, + 
"loss": 0.1026, + "step": 14525 + }, + { + "epoch": 4.33939755442887, + "grad_norm": 0.033856164664030075, + "learning_rate": 4.5919381776729146e-05, + "loss": 0.1124, + "step": 14550 + }, + { + "epoch": 4.346853563972562, + "grad_norm": 3.9369986057281494, + "learning_rate": 4.5902812756819815e-05, + "loss": 0.1223, + "step": 14575 + }, + { + "epoch": 4.354309573516254, + "grad_norm": 1.2985237836837769, + "learning_rate": 4.588624373691048e-05, + "loss": 0.0947, + "step": 14600 + }, + { + "epoch": 4.361765583059946, + "grad_norm": 0.030779710039496422, + "learning_rate": 4.5869674717001146e-05, + "loss": 0.1627, + "step": 14625 + }, + { + "epoch": 4.369221592603639, + "grad_norm": 0.5389582514762878, + "learning_rate": 4.585310569709181e-05, + "loss": 0.1019, + "step": 14650 + }, + { + "epoch": 4.376677602147331, + "grad_norm": 47.4750862121582, + "learning_rate": 4.583653667718247e-05, + "loss": 0.2532, + "step": 14675 + }, + { + "epoch": 4.384133611691023, + "grad_norm": 0.20324255526065826, + "learning_rate": 4.581996765727314e-05, + "loss": 0.1099, + "step": 14700 + }, + { + "epoch": 4.391589621234715, + "grad_norm": 8.618117332458496, + "learning_rate": 4.58033986373638e-05, + "loss": 0.1625, + "step": 14725 + }, + { + "epoch": 4.399045630778407, + "grad_norm": 14.791823387145996, + "learning_rate": 4.578682961745447e-05, + "loss": 0.1611, + "step": 14750 + }, + { + "epoch": 4.4065016403221, + "grad_norm": 1.8301466703414917, + "learning_rate": 4.5770260597545134e-05, + "loss": 0.1291, + "step": 14775 + }, + { + "epoch": 4.413957649865792, + "grad_norm": 0.02634044922888279, + "learning_rate": 4.57536915776358e-05, + "loss": 0.0426, + "step": 14800 + }, + { + "epoch": 4.421413659409484, + "grad_norm": 8.729630470275879, + "learning_rate": 4.573712255772647e-05, + "loss": 0.1576, + "step": 14825 + }, + { + "epoch": 4.428869668953176, + "grad_norm": 8.50086498260498, + "learning_rate": 4.5720553537817135e-05, + "loss": 0.2184, + "step": 14850 + }, + { + "epoch": 4.4363256784968685, + "grad_norm": 2.6643874645233154, + "learning_rate": 4.57039845179078e-05, + "loss": 0.1499, + "step": 14875 + }, + { + "epoch": 4.443781688040561, + "grad_norm": 0.253903329372406, + "learning_rate": 4.5687415497998466e-05, + "loss": 0.1432, + "step": 14900 + }, + { + "epoch": 4.4512376975842525, + "grad_norm": 6.4706573486328125, + "learning_rate": 4.567084647808913e-05, + "loss": 0.1689, + "step": 14925 + }, + { + "epoch": 4.458693707127945, + "grad_norm": 3.9063432216644287, + "learning_rate": 4.56542774581798e-05, + "loss": 0.1393, + "step": 14950 + }, + { + "epoch": 4.466149716671637, + "grad_norm": 5.083062648773193, + "learning_rate": 4.563770843827046e-05, + "loss": 0.0821, + "step": 14975 + }, + { + "epoch": 4.47360572621533, + "grad_norm": 31.4875545501709, + "learning_rate": 4.562113941836112e-05, + "loss": 0.2829, + "step": 15000 + }, + { + "epoch": 4.481061735759022, + "grad_norm": 0.055298592895269394, + "learning_rate": 4.560457039845179e-05, + "loss": 0.0949, + "step": 15025 + }, + { + "epoch": 4.488517745302714, + "grad_norm": 9.65718936920166, + "learning_rate": 4.558800137854246e-05, + "loss": 0.159, + "step": 15050 + }, + { + "epoch": 4.495973754846406, + "grad_norm": 9.367506980895996, + "learning_rate": 4.557143235863312e-05, + "loss": 0.1376, + "step": 15075 + }, + { + "epoch": 4.5034297643900985, + "grad_norm": 0.32751259207725525, + "learning_rate": 4.555486333872379e-05, + "loss": 0.1999, + "step": 15100 + }, + { + "epoch": 4.510885773933791, + "grad_norm": 12.641383171081543, + 
"learning_rate": 4.5538294318814454e-05, + "loss": 0.1897, + "step": 15125 + }, + { + "epoch": 4.518341783477483, + "grad_norm": 0.09187845885753632, + "learning_rate": 4.552172529890512e-05, + "loss": 0.111, + "step": 15150 + }, + { + "epoch": 4.525797793021175, + "grad_norm": 10.361546516418457, + "learning_rate": 4.5505156278995786e-05, + "loss": 0.1307, + "step": 15175 + }, + { + "epoch": 4.533253802564867, + "grad_norm": 6.859467506408691, + "learning_rate": 4.5488587259086455e-05, + "loss": 0.0639, + "step": 15200 + }, + { + "epoch": 4.54070981210856, + "grad_norm": 9.48729419708252, + "learning_rate": 4.547201823917712e-05, + "loss": 0.1368, + "step": 15225 + }, + { + "epoch": 4.548165821652252, + "grad_norm": 0.25108686089515686, + "learning_rate": 4.545544921926778e-05, + "loss": 0.1616, + "step": 15250 + }, + { + "epoch": 4.5556218311959435, + "grad_norm": 33.95174026489258, + "learning_rate": 4.5438880199358455e-05, + "loss": 0.1568, + "step": 15275 + }, + { + "epoch": 4.563077840739636, + "grad_norm": 5.364670753479004, + "learning_rate": 4.542231117944912e-05, + "loss": 0.1048, + "step": 15300 + }, + { + "epoch": 4.570533850283328, + "grad_norm": 1.4906331300735474, + "learning_rate": 4.540574215953978e-05, + "loss": 0.1542, + "step": 15325 + }, + { + "epoch": 4.577989859827021, + "grad_norm": 21.04407501220703, + "learning_rate": 4.538917313963045e-05, + "loss": 0.0685, + "step": 15350 + }, + { + "epoch": 4.585445869370712, + "grad_norm": 11.777868270874023, + "learning_rate": 4.537260411972111e-05, + "loss": 0.1628, + "step": 15375 + }, + { + "epoch": 4.592901878914405, + "grad_norm": 13.60123348236084, + "learning_rate": 4.535603509981178e-05, + "loss": 0.19, + "step": 15400 + }, + { + "epoch": 4.600357888458097, + "grad_norm": 12.014949798583984, + "learning_rate": 4.533946607990244e-05, + "loss": 0.1144, + "step": 15425 + }, + { + "epoch": 4.6078138980017895, + "grad_norm": 21.47185516357422, + "learning_rate": 4.5322897059993105e-05, + "loss": 0.1211, + "step": 15450 + }, + { + "epoch": 4.615269907545482, + "grad_norm": 0.010081280022859573, + "learning_rate": 4.5306328040083774e-05, + "loss": 0.1062, + "step": 15475 + }, + { + "epoch": 4.6227259170891735, + "grad_norm": 0.2724536657333374, + "learning_rate": 4.528975902017444e-05, + "loss": 0.1029, + "step": 15500 + }, + { + "epoch": 4.630181926632866, + "grad_norm": 0.1282346248626709, + "learning_rate": 4.5273190000265106e-05, + "loss": 0.1125, + "step": 15525 + }, + { + "epoch": 4.637637936176558, + "grad_norm": 8.942920684814453, + "learning_rate": 4.5256620980355775e-05, + "loss": 0.1662, + "step": 15550 + }, + { + "epoch": 4.645093945720251, + "grad_norm": 9.61270809173584, + "learning_rate": 4.524005196044644e-05, + "loss": 0.0608, + "step": 15575 + }, + { + "epoch": 4.652549955263943, + "grad_norm": 0.3739064931869507, + "learning_rate": 4.5223482940537106e-05, + "loss": 0.1371, + "step": 15600 + }, + { + "epoch": 4.660005964807635, + "grad_norm": 18.457700729370117, + "learning_rate": 4.520691392062777e-05, + "loss": 0.1166, + "step": 15625 + }, + { + "epoch": 4.667461974351327, + "grad_norm": 10.917533874511719, + "learning_rate": 4.519034490071843e-05, + "loss": 0.1959, + "step": 15650 + }, + { + "epoch": 4.6749179838950194, + "grad_norm": 3.7748892307281494, + "learning_rate": 4.51737758808091e-05, + "loss": 0.076, + "step": 15675 + }, + { + "epoch": 4.682373993438712, + "grad_norm": 10.774211883544922, + "learning_rate": 4.515720686089976e-05, + "loss": 0.1744, + "step": 15700 + }, + { + "epoch": 
4.689830002982404, + "grad_norm": 4.676548480987549, + "learning_rate": 4.514063784099043e-05, + "loss": 0.0786, + "step": 15725 + }, + { + "epoch": 4.697286012526096, + "grad_norm": 0.1515558362007141, + "learning_rate": 4.51240688210811e-05, + "loss": 0.0936, + "step": 15750 + }, + { + "epoch": 4.704742022069788, + "grad_norm": 0.661237895488739, + "learning_rate": 4.510749980117176e-05, + "loss": 0.1791, + "step": 15775 + }, + { + "epoch": 4.712198031613481, + "grad_norm": 64.2503890991211, + "learning_rate": 4.509093078126243e-05, + "loss": 0.1088, + "step": 15800 + }, + { + "epoch": 4.719654041157173, + "grad_norm": 6.597042560577393, + "learning_rate": 4.5074361761353095e-05, + "loss": 0.0908, + "step": 15825 + }, + { + "epoch": 4.7271100507008645, + "grad_norm": 11.98141860961914, + "learning_rate": 4.5057792741443764e-05, + "loss": 0.1698, + "step": 15850 + }, + { + "epoch": 4.734566060244557, + "grad_norm": 102.98745727539062, + "learning_rate": 4.5041223721534426e-05, + "loss": 0.1326, + "step": 15875 + }, + { + "epoch": 4.742022069788249, + "grad_norm": 21.18263816833496, + "learning_rate": 4.502465470162509e-05, + "loss": 0.1476, + "step": 15900 + }, + { + "epoch": 4.749478079331942, + "grad_norm": 5.757909774780273, + "learning_rate": 4.500808568171576e-05, + "loss": 0.0975, + "step": 15925 + }, + { + "epoch": 4.756934088875633, + "grad_norm": 0.33464935421943665, + "learning_rate": 4.499151666180642e-05, + "loss": 0.1271, + "step": 15950 + }, + { + "epoch": 4.764390098419326, + "grad_norm": 30.5048885345459, + "learning_rate": 4.497494764189709e-05, + "loss": 0.1143, + "step": 15975 + }, + { + "epoch": 4.771846107963018, + "grad_norm": 0.5966852307319641, + "learning_rate": 4.495837862198776e-05, + "loss": 0.0551, + "step": 16000 + }, + { + "epoch": 4.7793021175067105, + "grad_norm": 19.64974594116211, + "learning_rate": 4.494180960207842e-05, + "loss": 0.1292, + "step": 16025 + }, + { + "epoch": 4.786758127050403, + "grad_norm": 18.990375518798828, + "learning_rate": 4.492524058216909e-05, + "loss": 0.0927, + "step": 16050 + }, + { + "epoch": 4.7942141365940945, + "grad_norm": 3.3394312858581543, + "learning_rate": 4.490867156225975e-05, + "loss": 0.1558, + "step": 16075 + }, + { + "epoch": 4.801670146137787, + "grad_norm": 1.1377098560333252, + "learning_rate": 4.4892102542350414e-05, + "loss": 0.1145, + "step": 16100 + }, + { + "epoch": 4.809126155681479, + "grad_norm": 35.98493957519531, + "learning_rate": 4.4875533522441083e-05, + "loss": 0.1942, + "step": 16125 + }, + { + "epoch": 4.816582165225172, + "grad_norm": 12.814781188964844, + "learning_rate": 4.4858964502531746e-05, + "loss": 0.2208, + "step": 16150 + }, + { + "epoch": 4.824038174768864, + "grad_norm": 20.98240852355957, + "learning_rate": 4.4842395482622415e-05, + "loss": 0.0748, + "step": 16175 + }, + { + "epoch": 4.831494184312556, + "grad_norm": 23.819093704223633, + "learning_rate": 4.482582646271308e-05, + "loss": 0.1131, + "step": 16200 + }, + { + "epoch": 4.838950193856248, + "grad_norm": 3.9502758979797363, + "learning_rate": 4.4809257442803746e-05, + "loss": 0.0948, + "step": 16225 + }, + { + "epoch": 4.84640620339994, + "grad_norm": 56.03023147583008, + "learning_rate": 4.4792688422894415e-05, + "loss": 0.1912, + "step": 16250 + }, + { + "epoch": 4.853862212943633, + "grad_norm": 16.82088851928711, + "learning_rate": 4.477611940298508e-05, + "loss": 0.084, + "step": 16275 + }, + { + "epoch": 4.861318222487325, + "grad_norm": 4.70719575881958, + "learning_rate": 4.475955038307574e-05, + "loss": 
0.1498, + "step": 16300 + }, + { + "epoch": 4.868774232031017, + "grad_norm": 0.018220912665128708, + "learning_rate": 4.474298136316641e-05, + "loss": 0.166, + "step": 16325 + }, + { + "epoch": 4.876230241574709, + "grad_norm": 2.55483078956604, + "learning_rate": 4.472641234325707e-05, + "loss": 0.0708, + "step": 16350 + }, + { + "epoch": 4.883686251118402, + "grad_norm": 6.328634738922119, + "learning_rate": 4.470984332334774e-05, + "loss": 0.0798, + "step": 16375 + }, + { + "epoch": 4.891142260662094, + "grad_norm": 4.580233573913574, + "learning_rate": 4.46932743034384e-05, + "loss": 0.0574, + "step": 16400 + }, + { + "epoch": 4.8985982702057855, + "grad_norm": 0.2286739945411682, + "learning_rate": 4.467670528352907e-05, + "loss": 0.1437, + "step": 16425 + }, + { + "epoch": 4.906054279749478, + "grad_norm": 0.08337617665529251, + "learning_rate": 4.4660136263619735e-05, + "loss": 0.109, + "step": 16450 + }, + { + "epoch": 4.91351028929317, + "grad_norm": 4.643275260925293, + "learning_rate": 4.4643567243710404e-05, + "loss": 0.0927, + "step": 16475 + }, + { + "epoch": 4.920966298836863, + "grad_norm": 50.29481506347656, + "learning_rate": 4.462699822380107e-05, + "loss": 0.1549, + "step": 16500 + }, + { + "epoch": 4.928422308380554, + "grad_norm": 9.458456993103027, + "learning_rate": 4.4610429203891735e-05, + "loss": 0.0536, + "step": 16525 + }, + { + "epoch": 4.935878317924247, + "grad_norm": 0.6447744369506836, + "learning_rate": 4.45938601839824e-05, + "loss": 0.0894, + "step": 16550 + }, + { + "epoch": 4.943334327467939, + "grad_norm": 3.911870241165161, + "learning_rate": 4.4577291164073067e-05, + "loss": 0.1486, + "step": 16575 + }, + { + "epoch": 4.9507903370116315, + "grad_norm": 0.10835100710391998, + "learning_rate": 4.456072214416373e-05, + "loss": 0.0847, + "step": 16600 + }, + { + "epoch": 4.958246346555324, + "grad_norm": 9.483917236328125, + "learning_rate": 4.45441531242544e-05, + "loss": 0.1105, + "step": 16625 + }, + { + "epoch": 4.9657023560990154, + "grad_norm": 10.36311149597168, + "learning_rate": 4.452758410434506e-05, + "loss": 0.0985, + "step": 16650 + }, + { + "epoch": 4.973158365642708, + "grad_norm": 0.43916112184524536, + "learning_rate": 4.451101508443572e-05, + "loss": 0.0887, + "step": 16675 + }, + { + "epoch": 4.9806143751864, + "grad_norm": 0.1739131361246109, + "learning_rate": 4.449444606452639e-05, + "loss": 0.0717, + "step": 16700 + }, + { + "epoch": 4.988070384730093, + "grad_norm": 30.31475257873535, + "learning_rate": 4.447787704461706e-05, + "loss": 0.0884, + "step": 16725 + }, + { + "epoch": 4.995526394273785, + "grad_norm": 14.318930625915527, + "learning_rate": 4.4461970785504095e-05, + "loss": 0.1252, + "step": 16750 + }, + { + "epoch": 5.0, + "eval_gen_len": 8.7686, + "eval_loss": 0.15973380208015442, + "eval_rouge1": 92.6945, + "eval_rouge2": 78.5817, + "eval_rougeL": 92.5764, + "eval_rougeLsum": 92.5683, + "eval_runtime": 99.8794, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 4.205, + "step": 16765 + }, + { + "epoch": 5.002982403817477, + "grad_norm": 0.05757031589746475, + "learning_rate": 4.4445401765594764e-05, + "loss": 0.1751, + "step": 16775 + }, + { + "epoch": 5.010438413361169, + "grad_norm": 0.6733642220497131, + "learning_rate": 4.4428832745685426e-05, + "loss": 0.1266, + "step": 16800 + }, + { + "epoch": 5.017894422904861, + "grad_norm": 0.2373235821723938, + "learning_rate": 4.4412263725776095e-05, + "loss": 0.0473, + "step": 16825 + }, + { + "epoch": 5.025350432448554, + "grad_norm": 0.2211294174194336, + 
"learning_rate": 4.439569470586676e-05, + "loss": 0.1415, + "step": 16850 + }, + { + "epoch": 5.032806441992245, + "grad_norm": 1.2345913648605347, + "learning_rate": 4.437912568595743e-05, + "loss": 0.0372, + "step": 16875 + }, + { + "epoch": 5.040262451535938, + "grad_norm": 0.27145493030548096, + "learning_rate": 4.4362556666048096e-05, + "loss": 0.0315, + "step": 16900 + }, + { + "epoch": 5.04771846107963, + "grad_norm": 7.357295513153076, + "learning_rate": 4.434598764613876e-05, + "loss": 0.0723, + "step": 16925 + }, + { + "epoch": 5.055174470623323, + "grad_norm": 5.996601581573486, + "learning_rate": 4.432941862622943e-05, + "loss": 0.0668, + "step": 16950 + }, + { + "epoch": 5.062630480167015, + "grad_norm": 0.10849784314632416, + "learning_rate": 4.431284960632009e-05, + "loss": 0.0573, + "step": 16975 + }, + { + "epoch": 5.0700864897107065, + "grad_norm": 1.5720083713531494, + "learning_rate": 4.429628058641075e-05, + "loss": 0.0592, + "step": 17000 + }, + { + "epoch": 5.077542499254399, + "grad_norm": 0.05373723804950714, + "learning_rate": 4.427971156650142e-05, + "loss": 0.1364, + "step": 17025 + }, + { + "epoch": 5.084998508798091, + "grad_norm": 0.47185277938842773, + "learning_rate": 4.4263142546592083e-05, + "loss": 0.118, + "step": 17050 + }, + { + "epoch": 5.092454518341784, + "grad_norm": 23.991382598876953, + "learning_rate": 4.424657352668275e-05, + "loss": 0.0927, + "step": 17075 + }, + { + "epoch": 5.099910527885475, + "grad_norm": 0.018344825133681297, + "learning_rate": 4.4230004506773415e-05, + "loss": 0.0498, + "step": 17100 + }, + { + "epoch": 5.107366537429168, + "grad_norm": 30.450063705444336, + "learning_rate": 4.4213435486864084e-05, + "loss": 0.0683, + "step": 17125 + }, + { + "epoch": 5.11482254697286, + "grad_norm": 0.7160254120826721, + "learning_rate": 4.419686646695475e-05, + "loss": 0.1258, + "step": 17150 + }, + { + "epoch": 5.1222785565165525, + "grad_norm": 5.697810649871826, + "learning_rate": 4.4180297447045415e-05, + "loss": 0.0782, + "step": 17175 + }, + { + "epoch": 5.129734566060245, + "grad_norm": 24.201627731323242, + "learning_rate": 4.416372842713608e-05, + "loss": 0.0378, + "step": 17200 + }, + { + "epoch": 5.137190575603936, + "grad_norm": 16.44838523864746, + "learning_rate": 4.414715940722675e-05, + "loss": 0.1738, + "step": 17225 + }, + { + "epoch": 5.144646585147629, + "grad_norm": 2.063767910003662, + "learning_rate": 4.413059038731741e-05, + "loss": 0.0564, + "step": 17250 + }, + { + "epoch": 5.152102594691321, + "grad_norm": 1.0960617065429688, + "learning_rate": 4.411402136740808e-05, + "loss": 0.106, + "step": 17275 + }, + { + "epoch": 5.159558604235014, + "grad_norm": 0.9119987487792969, + "learning_rate": 4.409745234749874e-05, + "loss": 0.1341, + "step": 17300 + }, + { + "epoch": 5.167014613778706, + "grad_norm": 25.07423210144043, + "learning_rate": 4.40808833275894e-05, + "loss": 0.0704, + "step": 17325 + }, + { + "epoch": 5.174470623322398, + "grad_norm": 3.1737473011016846, + "learning_rate": 4.406431430768008e-05, + "loss": 0.1202, + "step": 17350 + }, + { + "epoch": 5.18192663286609, + "grad_norm": 13.149360656738281, + "learning_rate": 4.404774528777074e-05, + "loss": 0.0952, + "step": 17375 + }, + { + "epoch": 5.189382642409782, + "grad_norm": 0.8912478089332581, + "learning_rate": 4.403117626786141e-05, + "loss": 0.0528, + "step": 17400 + }, + { + "epoch": 5.196838651953475, + "grad_norm": 0.5555346012115479, + "learning_rate": 4.401460724795207e-05, + "loss": 0.0441, + "step": 17425 + }, + { + "epoch": 
5.204294661497166, + "grad_norm": 0.02102746069431305, + "learning_rate": 4.3998038228042735e-05, + "loss": 0.0984, + "step": 17450 + }, + { + "epoch": 5.211750671040859, + "grad_norm": 0.06021396070718765, + "learning_rate": 4.3981469208133404e-05, + "loss": 0.0662, + "step": 17475 + }, + { + "epoch": 5.219206680584551, + "grad_norm": 0.17239326238632202, + "learning_rate": 4.3964900188224067e-05, + "loss": 0.0938, + "step": 17500 + }, + { + "epoch": 5.226662690128244, + "grad_norm": 8.74494457244873, + "learning_rate": 4.3948331168314736e-05, + "loss": 0.1012, + "step": 17525 + }, + { + "epoch": 5.234118699671936, + "grad_norm": 0.18227869272232056, + "learning_rate": 4.39317621484054e-05, + "loss": 0.0969, + "step": 17550 + }, + { + "epoch": 5.2415747092156275, + "grad_norm": 0.35041165351867676, + "learning_rate": 4.391519312849606e-05, + "loss": 0.1368, + "step": 17575 + }, + { + "epoch": 5.24903071875932, + "grad_norm": 36.02559280395508, + "learning_rate": 4.3898624108586736e-05, + "loss": 0.1031, + "step": 17600 + }, + { + "epoch": 5.256486728303012, + "grad_norm": 28.382909774780273, + "learning_rate": 4.38820550886774e-05, + "loss": 0.168, + "step": 17625 + }, + { + "epoch": 5.263942737846705, + "grad_norm": 0.79286128282547, + "learning_rate": 4.386548606876806e-05, + "loss": 0.0353, + "step": 17650 + }, + { + "epoch": 5.271398747390396, + "grad_norm": 0.4146468937397003, + "learning_rate": 4.384891704885873e-05, + "loss": 0.0466, + "step": 17675 + }, + { + "epoch": 5.278854756934089, + "grad_norm": 9.44299030303955, + "learning_rate": 4.383234802894939e-05, + "loss": 0.1078, + "step": 17700 + }, + { + "epoch": 5.286310766477781, + "grad_norm": 1.2989798784255981, + "learning_rate": 4.381577900904006e-05, + "loss": 0.0732, + "step": 17725 + }, + { + "epoch": 5.2937667760214735, + "grad_norm": 19.005168914794922, + "learning_rate": 4.3799209989130724e-05, + "loss": 0.0778, + "step": 17750 + }, + { + "epoch": 5.301222785565166, + "grad_norm": 15.506356239318848, + "learning_rate": 4.3782640969221386e-05, + "loss": 0.0421, + "step": 17775 + }, + { + "epoch": 5.308678795108857, + "grad_norm": 6.192285537719727, + "learning_rate": 4.3766071949312055e-05, + "loss": 0.0594, + "step": 17800 + }, + { + "epoch": 5.31613480465255, + "grad_norm": 0.025958608835935593, + "learning_rate": 4.3749502929402724e-05, + "loss": 0.1048, + "step": 17825 + }, + { + "epoch": 5.323590814196242, + "grad_norm": 0.09452486038208008, + "learning_rate": 4.373293390949339e-05, + "loss": 0.1083, + "step": 17850 + }, + { + "epoch": 5.331046823739935, + "grad_norm": 5.524946212768555, + "learning_rate": 4.3716364889584056e-05, + "loss": 0.0898, + "step": 17875 + }, + { + "epoch": 5.338502833283627, + "grad_norm": 14.375529289245605, + "learning_rate": 4.369979586967472e-05, + "loss": 0.0287, + "step": 17900 + }, + { + "epoch": 5.345958842827319, + "grad_norm": 13.109046936035156, + "learning_rate": 4.368322684976539e-05, + "loss": 0.047, + "step": 17925 + }, + { + "epoch": 5.353414852371011, + "grad_norm": 0.10136231780052185, + "learning_rate": 4.366665782985605e-05, + "loss": 0.1218, + "step": 17950 + }, + { + "epoch": 5.360870861914703, + "grad_norm": 0.01339312270283699, + "learning_rate": 4.365008880994671e-05, + "loss": 0.1445, + "step": 17975 + }, + { + "epoch": 5.368326871458396, + "grad_norm": 0.42180967330932617, + "learning_rate": 4.363351979003738e-05, + "loss": 0.0626, + "step": 18000 + }, + { + "epoch": 5.375782881002087, + "grad_norm": 0.004577248357236385, + "learning_rate": 
4.3616950770128044e-05, + "loss": 0.0161, + "step": 18025 + }, + { + "epoch": 5.38323889054578, + "grad_norm": 0.25045108795166016, + "learning_rate": 4.360038175021871e-05, + "loss": 0.0347, + "step": 18050 + }, + { + "epoch": 5.390694900089472, + "grad_norm": 0.6512510180473328, + "learning_rate": 4.358381273030938e-05, + "loss": 0.0538, + "step": 18075 + }, + { + "epoch": 5.398150909633165, + "grad_norm": 0.0632275938987732, + "learning_rate": 4.3567243710400044e-05, + "loss": 0.0475, + "step": 18100 + }, + { + "epoch": 5.405606919176856, + "grad_norm": 3.314922332763672, + "learning_rate": 4.355067469049071e-05, + "loss": 0.0581, + "step": 18125 + }, + { + "epoch": 5.4130629287205485, + "grad_norm": 3.7075135707855225, + "learning_rate": 4.3534105670581376e-05, + "loss": 0.1561, + "step": 18150 + }, + { + "epoch": 5.420518938264241, + "grad_norm": 1.4350308179855347, + "learning_rate": 4.3517536650672045e-05, + "loss": 0.0651, + "step": 18175 + }, + { + "epoch": 5.427974947807933, + "grad_norm": 15.598840713500977, + "learning_rate": 4.350096763076271e-05, + "loss": 0.0744, + "step": 18200 + }, + { + "epoch": 5.435430957351626, + "grad_norm": 19.05609130859375, + "learning_rate": 4.348439861085337e-05, + "loss": 0.051, + "step": 18225 + }, + { + "epoch": 5.442886966895317, + "grad_norm": 13.1383695602417, + "learning_rate": 4.346782959094404e-05, + "loss": 0.2101, + "step": 18250 + }, + { + "epoch": 5.45034297643901, + "grad_norm": 2.7254810333251953, + "learning_rate": 4.34512605710347e-05, + "loss": 0.1575, + "step": 18275 + }, + { + "epoch": 5.457798985982702, + "grad_norm": 0.05944235250353813, + "learning_rate": 4.343469155112537e-05, + "loss": 0.1236, + "step": 18300 + }, + { + "epoch": 5.4652549955263945, + "grad_norm": 0.77425616979599, + "learning_rate": 4.341812253121604e-05, + "loss": 0.0598, + "step": 18325 + }, + { + "epoch": 5.472711005070087, + "grad_norm": 13.926398277282715, + "learning_rate": 4.34015535113067e-05, + "loss": 0.0586, + "step": 18350 + }, + { + "epoch": 5.480167014613778, + "grad_norm": 0.29317107796669006, + "learning_rate": 4.338498449139737e-05, + "loss": 0.1206, + "step": 18375 + }, + { + "epoch": 5.487623024157471, + "grad_norm": 9.615321159362793, + "learning_rate": 4.336841547148803e-05, + "loss": 0.112, + "step": 18400 + }, + { + "epoch": 5.495079033701163, + "grad_norm": 3.3480887413024902, + "learning_rate": 4.3351846451578695e-05, + "loss": 0.092, + "step": 18425 + }, + { + "epoch": 5.502535043244856, + "grad_norm": 5.509705543518066, + "learning_rate": 4.3335277431669364e-05, + "loss": 0.0675, + "step": 18450 + }, + { + "epoch": 5.509991052788548, + "grad_norm": 14.98397159576416, + "learning_rate": 4.331870841176003e-05, + "loss": 0.1012, + "step": 18475 + }, + { + "epoch": 5.51744706233224, + "grad_norm": 14.701458930969238, + "learning_rate": 4.3302139391850696e-05, + "loss": 0.0987, + "step": 18500 + }, + { + "epoch": 5.524903071875932, + "grad_norm": 0.031948402523994446, + "learning_rate": 4.328557037194136e-05, + "loss": 0.1196, + "step": 18525 + }, + { + "epoch": 5.532359081419624, + "grad_norm": 0.2000201940536499, + "learning_rate": 4.326900135203203e-05, + "loss": 0.1658, + "step": 18550 + }, + { + "epoch": 5.539815090963317, + "grad_norm": 1.2646598815917969, + "learning_rate": 4.3252432332122696e-05, + "loss": 0.0475, + "step": 18575 + }, + { + "epoch": 5.547271100507008, + "grad_norm": 0.10240360349416733, + "learning_rate": 4.323586331221336e-05, + "loss": 0.0751, + "step": 18600 + }, + { + "epoch": 5.554727110050701, + 
"grad_norm": 1.4732797145843506, + "learning_rate": 4.321929429230403e-05, + "loss": 0.1033, + "step": 18625 + }, + { + "epoch": 5.562183119594393, + "grad_norm": 0.16163279116153717, + "learning_rate": 4.320272527239469e-05, + "loss": 0.0994, + "step": 18650 + }, + { + "epoch": 5.569639129138086, + "grad_norm": 36.78316116333008, + "learning_rate": 4.318615625248535e-05, + "loss": 0.133, + "step": 18675 + }, + { + "epoch": 5.577095138681777, + "grad_norm": 8.2725830078125, + "learning_rate": 4.316958723257602e-05, + "loss": 0.0815, + "step": 18700 + }, + { + "epoch": 5.5845511482254695, + "grad_norm": 0.0331939198076725, + "learning_rate": 4.3153018212666684e-05, + "loss": 0.0573, + "step": 18725 + }, + { + "epoch": 5.592007157769162, + "grad_norm": 0.17003242671489716, + "learning_rate": 4.313644919275735e-05, + "loss": 0.095, + "step": 18750 + }, + { + "epoch": 5.599463167312854, + "grad_norm": 0.3271353542804718, + "learning_rate": 4.3119880172848015e-05, + "loss": 0.1235, + "step": 18775 + }, + { + "epoch": 5.606919176856547, + "grad_norm": 1.6304597854614258, + "learning_rate": 4.3103311152938685e-05, + "loss": 0.1232, + "step": 18800 + }, + { + "epoch": 5.614375186400238, + "grad_norm": 2.366298198699951, + "learning_rate": 4.3086742133029354e-05, + "loss": 0.0471, + "step": 18825 + }, + { + "epoch": 5.621831195943931, + "grad_norm": 0.019990181550383568, + "learning_rate": 4.3070173113120016e-05, + "loss": 0.1426, + "step": 18850 + }, + { + "epoch": 5.629287205487623, + "grad_norm": 0.05211897939443588, + "learning_rate": 4.305360409321068e-05, + "loss": 0.0902, + "step": 18875 + }, + { + "epoch": 5.6367432150313155, + "grad_norm": 71.19368743896484, + "learning_rate": 4.303703507330135e-05, + "loss": 0.0776, + "step": 18900 + }, + { + "epoch": 5.644199224575008, + "grad_norm": 2.616161823272705, + "learning_rate": 4.302046605339201e-05, + "loss": 0.1408, + "step": 18925 + }, + { + "epoch": 5.651655234118699, + "grad_norm": 24.18864631652832, + "learning_rate": 4.300389703348268e-05, + "loss": 0.0636, + "step": 18950 + }, + { + "epoch": 5.659111243662392, + "grad_norm": 0.19227628409862518, + "learning_rate": 4.298732801357334e-05, + "loss": 0.049, + "step": 18975 + }, + { + "epoch": 5.666567253206084, + "grad_norm": 32.69465255737305, + "learning_rate": 4.2970758993664004e-05, + "loss": 0.0648, + "step": 19000 + }, + { + "epoch": 5.674023262749777, + "grad_norm": 0.5582588315010071, + "learning_rate": 4.295418997375468e-05, + "loss": 0.0635, + "step": 19025 + }, + { + "epoch": 5.681479272293468, + "grad_norm": 1.391935110092163, + "learning_rate": 4.293762095384534e-05, + "loss": 0.0883, + "step": 19050 + }, + { + "epoch": 5.688935281837161, + "grad_norm": 10.445085525512695, + "learning_rate": 4.2921051933936004e-05, + "loss": 0.05, + "step": 19075 + }, + { + "epoch": 5.696391291380853, + "grad_norm": 0.07640068978071213, + "learning_rate": 4.290448291402667e-05, + "loss": 0.0986, + "step": 19100 + }, + { + "epoch": 5.703847300924545, + "grad_norm": 0.01860329695045948, + "learning_rate": 4.2887913894117336e-05, + "loss": 0.106, + "step": 19125 + }, + { + "epoch": 5.711303310468237, + "grad_norm": 0.2838321626186371, + "learning_rate": 4.2871344874208005e-05, + "loss": 0.0648, + "step": 19150 + }, + { + "epoch": 5.718759320011929, + "grad_norm": 1.3784078359603882, + "learning_rate": 4.285477585429867e-05, + "loss": 0.0581, + "step": 19175 + }, + { + "epoch": 5.726215329555622, + "grad_norm": 0.13906244933605194, + "learning_rate": 4.283820683438933e-05, + "loss": 0.0994, + 
"step": 19200 + }, + { + "epoch": 5.733671339099314, + "grad_norm": 0.06269329786300659, + "learning_rate": 4.282163781448e-05, + "loss": 0.0369, + "step": 19225 + }, + { + "epoch": 5.741127348643007, + "grad_norm": 23.20893669128418, + "learning_rate": 4.280506879457066e-05, + "loss": 0.0794, + "step": 19250 + }, + { + "epoch": 5.748583358186698, + "grad_norm": 0.008146079257130623, + "learning_rate": 4.278849977466134e-05, + "loss": 0.122, + "step": 19275 + }, + { + "epoch": 5.7560393677303905, + "grad_norm": 3.9998602867126465, + "learning_rate": 4.2771930754752e-05, + "loss": 0.0627, + "step": 19300 + }, + { + "epoch": 5.763495377274083, + "grad_norm": 9.522217750549316, + "learning_rate": 4.275536173484266e-05, + "loss": 0.1013, + "step": 19325 + }, + { + "epoch": 5.770951386817775, + "grad_norm": 17.462121963500977, + "learning_rate": 4.273879271493333e-05, + "loss": 0.0862, + "step": 19350 + }, + { + "epoch": 5.778407396361468, + "grad_norm": 0.30210548639297485, + "learning_rate": 4.272222369502399e-05, + "loss": 0.049, + "step": 19375 + }, + { + "epoch": 5.785863405905159, + "grad_norm": 15.636837005615234, + "learning_rate": 4.270565467511466e-05, + "loss": 0.0787, + "step": 19400 + }, + { + "epoch": 5.793319415448852, + "grad_norm": 0.0845949798822403, + "learning_rate": 4.2689085655205324e-05, + "loss": 0.0861, + "step": 19425 + }, + { + "epoch": 5.800775424992544, + "grad_norm": 0.0145711749792099, + "learning_rate": 4.267251663529599e-05, + "loss": 0.1099, + "step": 19450 + }, + { + "epoch": 5.8082314345362365, + "grad_norm": 0.0563751682639122, + "learning_rate": 4.2655947615386656e-05, + "loss": 0.0463, + "step": 19475 + }, + { + "epoch": 5.815687444079929, + "grad_norm": 0.12543386220932007, + "learning_rate": 4.2639378595477325e-05, + "loss": 0.0881, + "step": 19500 + }, + { + "epoch": 5.82314345362362, + "grad_norm": 0.13315843045711517, + "learning_rate": 4.262280957556799e-05, + "loss": 0.0471, + "step": 19525 + }, + { + "epoch": 5.830599463167313, + "grad_norm": 0.006420004181563854, + "learning_rate": 4.2606240555658656e-05, + "loss": 0.0775, + "step": 19550 + }, + { + "epoch": 5.838055472711005, + "grad_norm": 10.033127784729004, + "learning_rate": 4.258967153574932e-05, + "loss": 0.0631, + "step": 19575 + }, + { + "epoch": 5.845511482254698, + "grad_norm": 10.855030059814453, + "learning_rate": 4.257310251583999e-05, + "loss": 0.074, + "step": 19600 + }, + { + "epoch": 5.852967491798389, + "grad_norm": 0.44868308305740356, + "learning_rate": 4.255653349593065e-05, + "loss": 0.123, + "step": 19625 + }, + { + "epoch": 5.860423501342082, + "grad_norm": 0.07777903974056244, + "learning_rate": 4.253996447602131e-05, + "loss": 0.0452, + "step": 19650 + }, + { + "epoch": 5.867879510885774, + "grad_norm": 0.0545571930706501, + "learning_rate": 4.252339545611198e-05, + "loss": 0.0985, + "step": 19675 + }, + { + "epoch": 5.875335520429466, + "grad_norm": 12.324911117553711, + "learning_rate": 4.2506826436202644e-05, + "loss": 0.1197, + "step": 19700 + }, + { + "epoch": 5.882791529973158, + "grad_norm": 0.026907717809081078, + "learning_rate": 4.249025741629331e-05, + "loss": 0.1382, + "step": 19725 + }, + { + "epoch": 5.89024753951685, + "grad_norm": 0.031041543930768967, + "learning_rate": 4.247368839638398e-05, + "loss": 0.0367, + "step": 19750 + }, + { + "epoch": 5.897703549060543, + "grad_norm": 0.1477086991071701, + "learning_rate": 4.2457119376474645e-05, + "loss": 0.0242, + "step": 19775 + }, + { + "epoch": 5.905159558604235, + "grad_norm": 0.009802543558180332, + 
"learning_rate": 4.2440550356565314e-05, + "loss": 0.1238, + "step": 19800 + }, + { + "epoch": 5.9126155681479275, + "grad_norm": 2.768169403076172, + "learning_rate": 4.2423981336655976e-05, + "loss": 0.142, + "step": 19825 + }, + { + "epoch": 5.920071577691619, + "grad_norm": 0.06762892007827759, + "learning_rate": 4.2407412316746645e-05, + "loss": 0.0501, + "step": 19850 + }, + { + "epoch": 5.9275275872353115, + "grad_norm": 0.09432929754257202, + "learning_rate": 4.239084329683731e-05, + "loss": 0.0721, + "step": 19875 + }, + { + "epoch": 5.934983596779004, + "grad_norm": 0.21210744976997375, + "learning_rate": 4.237427427692797e-05, + "loss": 0.0142, + "step": 19900 + }, + { + "epoch": 5.942439606322696, + "grad_norm": 0.15370802581310272, + "learning_rate": 4.235770525701864e-05, + "loss": 0.0276, + "step": 19925 + }, + { + "epoch": 5.949895615866389, + "grad_norm": 0.012401612475514412, + "learning_rate": 4.23411362371093e-05, + "loss": 0.0661, + "step": 19950 + }, + { + "epoch": 5.95735162541008, + "grad_norm": 0.019147785380482674, + "learning_rate": 4.232456721719997e-05, + "loss": 0.0537, + "step": 19975 + }, + { + "epoch": 5.964807634953773, + "grad_norm": 1.0347957611083984, + "learning_rate": 4.230799819729064e-05, + "loss": 0.0106, + "step": 20000 + }, + { + "epoch": 5.972263644497465, + "grad_norm": 3.7031664848327637, + "learning_rate": 4.22914291773813e-05, + "loss": 0.0296, + "step": 20025 + }, + { + "epoch": 5.9797196540411575, + "grad_norm": 0.052466195076704025, + "learning_rate": 4.227486015747197e-05, + "loss": 0.1013, + "step": 20050 + }, + { + "epoch": 5.98717566358485, + "grad_norm": 0.08741900324821472, + "learning_rate": 4.2258291137562633e-05, + "loss": 0.1086, + "step": 20075 + }, + { + "epoch": 5.994631673128541, + "grad_norm": 0.196999192237854, + "learning_rate": 4.2241722117653296e-05, + "loss": 0.03, + "step": 20100 + }, + { + "epoch": 6.0, + "eval_gen_len": 9.1741, + "eval_loss": 0.11874907463788986, + "eval_rouge1": 94.9431, + "eval_rouge2": 81.0015, + "eval_rougeL": 94.8452, + "eval_rougeLsum": 94.8177, + "eval_runtime": 105.0274, + "eval_samples_per_second": 15.967, + "eval_steps_per_second": 3.999, + "step": 20118 + }, + { + "epoch": 6.002087682672234, + "grad_norm": 0.01878800056874752, + "learning_rate": 4.2225153097743965e-05, + "loss": 0.0218, + "step": 20125 + }, + { + "epoch": 6.009543692215926, + "grad_norm": 0.08164286613464355, + "learning_rate": 4.220858407783463e-05, + "loss": 0.0477, + "step": 20150 + }, + { + "epoch": 6.016999701759619, + "grad_norm": 1.362046241760254, + "learning_rate": 4.2192015057925296e-05, + "loss": 0.1691, + "step": 20175 + }, + { + "epoch": 6.02445571130331, + "grad_norm": 0.0867367535829544, + "learning_rate": 4.217544603801596e-05, + "loss": 0.0223, + "step": 20200 + }, + { + "epoch": 6.031911720847003, + "grad_norm": 0.010886842384934425, + "learning_rate": 4.215887701810663e-05, + "loss": 0.0478, + "step": 20225 + }, + { + "epoch": 6.039367730390695, + "grad_norm": 0.7621909379959106, + "learning_rate": 4.21423079981973e-05, + "loss": 0.0103, + "step": 20250 + }, + { + "epoch": 6.046823739934387, + "grad_norm": 1.5744153261184692, + "learning_rate": 4.212573897828796e-05, + "loss": 0.0457, + "step": 20275 + }, + { + "epoch": 6.054279749478079, + "grad_norm": 0.11433689296245575, + "learning_rate": 4.210916995837862e-05, + "loss": 0.0506, + "step": 20300 + }, + { + "epoch": 6.061735759021771, + "grad_norm": 26.32907485961914, + "learning_rate": 4.209260093846929e-05, + "loss": 0.0815, + "step": 20325 + }, + { 
+ "epoch": 6.069191768565464, + "grad_norm": 1.6335394382476807, + "learning_rate": 4.207603191855995e-05, + "loss": 0.0328, + "step": 20350 + }, + { + "epoch": 6.076647778109156, + "grad_norm": 0.54072505235672, + "learning_rate": 4.205946289865062e-05, + "loss": 0.109, + "step": 20375 + }, + { + "epoch": 6.0841037876528485, + "grad_norm": 23.399791717529297, + "learning_rate": 4.2042893878741285e-05, + "loss": 0.0436, + "step": 20400 + }, + { + "epoch": 6.09155979719654, + "grad_norm": 0.04539789631962776, + "learning_rate": 4.202632485883195e-05, + "loss": 0.0483, + "step": 20425 + }, + { + "epoch": 6.0990158067402325, + "grad_norm": 0.020925790071487427, + "learning_rate": 4.2009755838922616e-05, + "loss": 0.0376, + "step": 20450 + }, + { + "epoch": 6.106471816283925, + "grad_norm": 0.5620167851448059, + "learning_rate": 4.1993186819013285e-05, + "loss": 0.0955, + "step": 20475 + }, + { + "epoch": 6.113927825827617, + "grad_norm": 1.8948103189468384, + "learning_rate": 4.1976617799103954e-05, + "loss": 0.1628, + "step": 20500 + }, + { + "epoch": 6.12138383537131, + "grad_norm": 5.853365421295166, + "learning_rate": 4.1960048779194617e-05, + "loss": 0.0407, + "step": 20525 + }, + { + "epoch": 6.128839844915001, + "grad_norm": 0.35002636909484863, + "learning_rate": 4.194347975928528e-05, + "loss": 0.0887, + "step": 20550 + }, + { + "epoch": 6.136295854458694, + "grad_norm": 0.3504277169704437, + "learning_rate": 4.192691073937595e-05, + "loss": 0.0452, + "step": 20575 + }, + { + "epoch": 6.143751864002386, + "grad_norm": 2.05971097946167, + "learning_rate": 4.191034171946661e-05, + "loss": 0.0589, + "step": 20600 + }, + { + "epoch": 6.1512078735460785, + "grad_norm": 0.01998194307088852, + "learning_rate": 4.189377269955728e-05, + "loss": 0.0412, + "step": 20625 + }, + { + "epoch": 6.15866388308977, + "grad_norm": 0.07505607604980469, + "learning_rate": 4.187720367964794e-05, + "loss": 0.0812, + "step": 20650 + }, + { + "epoch": 6.166119892633462, + "grad_norm": 0.5407578349113464, + "learning_rate": 4.1860634659738604e-05, + "loss": 0.044, + "step": 20675 + }, + { + "epoch": 6.173575902177155, + "grad_norm": 6.510289669036865, + "learning_rate": 4.184406563982928e-05, + "loss": 0.0505, + "step": 20700 + }, + { + "epoch": 6.181031911720847, + "grad_norm": 0.7390807867050171, + "learning_rate": 4.182749661991994e-05, + "loss": 0.0763, + "step": 20725 + }, + { + "epoch": 6.18848792126454, + "grad_norm": 0.037136584520339966, + "learning_rate": 4.1810927600010605e-05, + "loss": 0.0052, + "step": 20750 + }, + { + "epoch": 6.195943930808231, + "grad_norm": 0.13674665987491608, + "learning_rate": 4.1794358580101274e-05, + "loss": 0.0511, + "step": 20775 + }, + { + "epoch": 6.2033999403519235, + "grad_norm": 0.5490770936012268, + "learning_rate": 4.1777789560191936e-05, + "loss": 0.0973, + "step": 20800 + }, + { + "epoch": 6.210855949895616, + "grad_norm": 0.005674040876328945, + "learning_rate": 4.1761220540282605e-05, + "loss": 0.0257, + "step": 20825 + }, + { + "epoch": 6.218311959439308, + "grad_norm": 8.376826286315918, + "learning_rate": 4.174465152037327e-05, + "loss": 0.068, + "step": 20850 + }, + { + "epoch": 6.225767968983, + "grad_norm": 0.03015846200287342, + "learning_rate": 4.172808250046393e-05, + "loss": 0.0285, + "step": 20875 + }, + { + "epoch": 6.233223978526692, + "grad_norm": 0.01474801730364561, + "learning_rate": 4.17115134805546e-05, + "loss": 0.0302, + "step": 20900 + }, + { + "epoch": 6.240679988070385, + "grad_norm": 16.602705001831055, + "learning_rate": 
4.169494446064526e-05, + "loss": 0.0815, + "step": 20925 + }, + { + "epoch": 6.248135997614077, + "grad_norm": 2.8422799110412598, + "learning_rate": 4.167837544073593e-05, + "loss": 0.0528, + "step": 20950 + }, + { + "epoch": 6.2555920071577695, + "grad_norm": 17.248394012451172, + "learning_rate": 4.16618064208266e-05, + "loss": 0.0298, + "step": 20975 + }, + { + "epoch": 6.263048016701461, + "grad_norm": 0.010792219080030918, + "learning_rate": 4.164523740091726e-05, + "loss": 0.0098, + "step": 21000 + }, + { + "epoch": 6.2705040262451535, + "grad_norm": 0.020565340295433998, + "learning_rate": 4.162866838100793e-05, + "loss": 0.019, + "step": 21025 + }, + { + "epoch": 6.277960035788846, + "grad_norm": 0.03458723425865173, + "learning_rate": 4.1612099361098594e-05, + "loss": 0.0572, + "step": 21050 + }, + { + "epoch": 6.285416045332538, + "grad_norm": 0.5358602404594421, + "learning_rate": 4.159553034118926e-05, + "loss": 0.0625, + "step": 21075 + }, + { + "epoch": 6.292872054876231, + "grad_norm": 0.7117275595664978, + "learning_rate": 4.1578961321279925e-05, + "loss": 0.0479, + "step": 21100 + }, + { + "epoch": 6.300328064419922, + "grad_norm": 0.46912163496017456, + "learning_rate": 4.156239230137059e-05, + "loss": 0.1382, + "step": 21125 + }, + { + "epoch": 6.307784073963615, + "grad_norm": 17.23985481262207, + "learning_rate": 4.1546486042257635e-05, + "loss": 0.0598, + "step": 21150 + }, + { + "epoch": 6.315240083507307, + "grad_norm": 1.2074980735778809, + "learning_rate": 4.15299170223483e-05, + "loss": 0.0741, + "step": 21175 + }, + { + "epoch": 6.322696093050999, + "grad_norm": 2.5435595512390137, + "learning_rate": 4.151334800243896e-05, + "loss": 0.1226, + "step": 21200 + }, + { + "epoch": 6.330152102594691, + "grad_norm": 0.0682421550154686, + "learning_rate": 4.149677898252963e-05, + "loss": 0.0185, + "step": 21225 + }, + { + "epoch": 6.337608112138383, + "grad_norm": 16.758506774902344, + "learning_rate": 4.148020996262029e-05, + "loss": 0.0491, + "step": 21250 + }, + { + "epoch": 6.345064121682076, + "grad_norm": 5.860011577606201, + "learning_rate": 4.146364094271096e-05, + "loss": 0.0211, + "step": 21275 + }, + { + "epoch": 6.352520131225768, + "grad_norm": 0.9169291853904724, + "learning_rate": 4.144707192280162e-05, + "loss": 0.0625, + "step": 21300 + }, + { + "epoch": 6.35997614076946, + "grad_norm": 20.915584564208984, + "learning_rate": 4.1430502902892285e-05, + "loss": 0.0336, + "step": 21325 + }, + { + "epoch": 6.367432150313152, + "grad_norm": 0.12377249449491501, + "learning_rate": 4.141393388298296e-05, + "loss": 0.0972, + "step": 21350 + }, + { + "epoch": 6.3748881598568445, + "grad_norm": 0.2923804223537445, + "learning_rate": 4.139736486307362e-05, + "loss": 0.0771, + "step": 21375 + }, + { + "epoch": 6.382344169400537, + "grad_norm": 0.0026703316252678633, + "learning_rate": 4.1380795843164285e-05, + "loss": 0.065, + "step": 21400 + }, + { + "epoch": 6.389800178944229, + "grad_norm": 1.1541820764541626, + "learning_rate": 4.1364226823254954e-05, + "loss": 0.0171, + "step": 21425 + }, + { + "epoch": 6.397256188487921, + "grad_norm": 0.010893816128373146, + "learning_rate": 4.1347657803345617e-05, + "loss": 0.1095, + "step": 21450 + }, + { + "epoch": 6.404712198031613, + "grad_norm": 0.030142832547426224, + "learning_rate": 4.1331088783436286e-05, + "loss": 0.0563, + "step": 21475 + }, + { + "epoch": 6.412168207575306, + "grad_norm": 0.29954442381858826, + "learning_rate": 4.131451976352695e-05, + "loss": 0.0529, + "step": 21500 + }, + { + "epoch": 
6.419624217118998, + "grad_norm": 10.40064525604248, + "learning_rate": 4.129795074361762e-05, + "loss": 0.0602, + "step": 21525 + }, + { + "epoch": 6.4270802266626905, + "grad_norm": 0.021467700600624084, + "learning_rate": 4.128138172370828e-05, + "loss": 0.0142, + "step": 21550 + }, + { + "epoch": 6.434536236206382, + "grad_norm": 8.21902847290039, + "learning_rate": 4.126481270379895e-05, + "loss": 0.1391, + "step": 21575 + }, + { + "epoch": 6.4419922457500745, + "grad_norm": 0.9277015924453735, + "learning_rate": 4.124824368388962e-05, + "loss": 0.1091, + "step": 21600 + }, + { + "epoch": 6.449448255293767, + "grad_norm": 0.10828567296266556, + "learning_rate": 4.123167466398028e-05, + "loss": 0.02, + "step": 21625 + }, + { + "epoch": 6.456904264837459, + "grad_norm": 5.262417793273926, + "learning_rate": 4.121510564407094e-05, + "loss": 0.1392, + "step": 21650 + }, + { + "epoch": 6.464360274381151, + "grad_norm": 0.07524223625659943, + "learning_rate": 4.119853662416161e-05, + "loss": 0.054, + "step": 21675 + }, + { + "epoch": 6.471816283924843, + "grad_norm": 0.028518134728074074, + "learning_rate": 4.1181967604252274e-05, + "loss": 0.0425, + "step": 21700 + }, + { + "epoch": 6.479272293468536, + "grad_norm": 24.74271011352539, + "learning_rate": 4.116539858434294e-05, + "loss": 0.0752, + "step": 21725 + }, + { + "epoch": 6.486728303012228, + "grad_norm": 0.003951622173190117, + "learning_rate": 4.1148829564433605e-05, + "loss": 0.0282, + "step": 21750 + }, + { + "epoch": 6.49418431255592, + "grad_norm": 14.138594627380371, + "learning_rate": 4.113226054452427e-05, + "loss": 0.0706, + "step": 21775 + }, + { + "epoch": 6.501640322099612, + "grad_norm": 0.9741306304931641, + "learning_rate": 4.111569152461494e-05, + "loss": 0.1514, + "step": 21800 + }, + { + "epoch": 6.509096331643304, + "grad_norm": 1.9363592863082886, + "learning_rate": 4.1099122504705606e-05, + "loss": 0.0842, + "step": 21825 + }, + { + "epoch": 6.516552341186997, + "grad_norm": 20.144515991210938, + "learning_rate": 4.108255348479627e-05, + "loss": 0.1134, + "step": 21850 + }, + { + "epoch": 6.524008350730689, + "grad_norm": 3.8856167793273926, + "learning_rate": 4.106598446488694e-05, + "loss": 0.0526, + "step": 21875 + }, + { + "epoch": 6.531464360274381, + "grad_norm": 0.09439099580049515, + "learning_rate": 4.10494154449776e-05, + "loss": 0.0112, + "step": 21900 + }, + { + "epoch": 6.538920369818073, + "grad_norm": 0.07617989182472229, + "learning_rate": 4.103284642506827e-05, + "loss": 0.0834, + "step": 21925 + }, + { + "epoch": 6.5463763793617655, + "grad_norm": 19.814706802368164, + "learning_rate": 4.101627740515893e-05, + "loss": 0.0665, + "step": 21950 + }, + { + "epoch": 6.553832388905458, + "grad_norm": 0.009026892483234406, + "learning_rate": 4.0999708385249594e-05, + "loss": 0.0685, + "step": 21975 + }, + { + "epoch": 6.56128839844915, + "grad_norm": 1.3233736753463745, + "learning_rate": 4.098313936534026e-05, + "loss": 0.0855, + "step": 22000 + }, + { + "epoch": 6.568744407992842, + "grad_norm": 0.004098168108612299, + "learning_rate": 4.0966570345430925e-05, + "loss": 0.0975, + "step": 22025 + }, + { + "epoch": 6.576200417536534, + "grad_norm": 18.999271392822266, + "learning_rate": 4.0950001325521594e-05, + "loss": 0.0434, + "step": 22050 + }, + { + "epoch": 6.583656427080227, + "grad_norm": 0.08481686562299728, + "learning_rate": 4.093343230561226e-05, + "loss": 0.053, + "step": 22075 + }, + { + "epoch": 6.591112436623919, + "grad_norm": 0.03606973588466644, + "learning_rate": 
4.0916863285702926e-05, + "loss": 0.1021, + "step": 22100 + }, + { + "epoch": 6.5985684461676115, + "grad_norm": 3.600905656814575, + "learning_rate": 4.0900294265793595e-05, + "loss": 0.0289, + "step": 22125 + }, + { + "epoch": 6.606024455711303, + "grad_norm": 0.6694594025611877, + "learning_rate": 4.088372524588426e-05, + "loss": 0.0297, + "step": 22150 + }, + { + "epoch": 6.613480465254995, + "grad_norm": 0.31841492652893066, + "learning_rate": 4.0867156225974926e-05, + "loss": 0.0578, + "step": 22175 + }, + { + "epoch": 6.620936474798688, + "grad_norm": 12.335127830505371, + "learning_rate": 4.085058720606559e-05, + "loss": 0.0594, + "step": 22200 + }, + { + "epoch": 6.62839248434238, + "grad_norm": 14.127710342407227, + "learning_rate": 4.083401818615625e-05, + "loss": 0.0516, + "step": 22225 + }, + { + "epoch": 6.635848493886072, + "grad_norm": 0.029252486303448677, + "learning_rate": 4.081744916624692e-05, + "loss": 0.0599, + "step": 22250 + }, + { + "epoch": 6.643304503429764, + "grad_norm": 0.5378937721252441, + "learning_rate": 4.080088014633758e-05, + "loss": 0.1345, + "step": 22275 + }, + { + "epoch": 6.650760512973457, + "grad_norm": 0.13244609534740448, + "learning_rate": 4.078431112642825e-05, + "loss": 0.0515, + "step": 22300 + }, + { + "epoch": 6.658216522517149, + "grad_norm": 0.2065218836069107, + "learning_rate": 4.076774210651892e-05, + "loss": 0.0365, + "step": 22325 + }, + { + "epoch": 6.6656725320608405, + "grad_norm": 0.010511617176234722, + "learning_rate": 4.075117308660958e-05, + "loss": 0.0429, + "step": 22350 + }, + { + "epoch": 6.673128541604533, + "grad_norm": 0.7169992327690125, + "learning_rate": 4.073460406670025e-05, + "loss": 0.142, + "step": 22375 + }, + { + "epoch": 6.680584551148225, + "grad_norm": 12.551905632019043, + "learning_rate": 4.0718035046790914e-05, + "loss": 0.101, + "step": 22400 + }, + { + "epoch": 6.688040560691918, + "grad_norm": 0.05723453685641289, + "learning_rate": 4.070146602688158e-05, + "loss": 0.0566, + "step": 22425 + }, + { + "epoch": 6.69549657023561, + "grad_norm": 0.6166514754295349, + "learning_rate": 4.0684897006972246e-05, + "loss": 0.0914, + "step": 22450 + }, + { + "epoch": 6.702952579779302, + "grad_norm": 0.01288004219532013, + "learning_rate": 4.066832798706291e-05, + "loss": 0.03, + "step": 22475 + }, + { + "epoch": 6.710408589322994, + "grad_norm": 0.07274222373962402, + "learning_rate": 4.065175896715358e-05, + "loss": 0.0967, + "step": 22500 + }, + { + "epoch": 6.7178645988666865, + "grad_norm": 15.432951927185059, + "learning_rate": 4.063518994724424e-05, + "loss": 0.0478, + "step": 22525 + }, + { + "epoch": 6.725320608410379, + "grad_norm": 0.8928155303001404, + "learning_rate": 4.061862092733491e-05, + "loss": 0.0682, + "step": 22550 + }, + { + "epoch": 6.732776617954071, + "grad_norm": 0.044634025543928146, + "learning_rate": 4.060205190742558e-05, + "loss": 0.0232, + "step": 22575 + }, + { + "epoch": 6.740232627497763, + "grad_norm": 6.356382369995117, + "learning_rate": 4.058548288751624e-05, + "loss": 0.0411, + "step": 22600 + }, + { + "epoch": 6.747688637041455, + "grad_norm": 0.5113846659660339, + "learning_rate": 4.05689138676069e-05, + "loss": 0.0493, + "step": 22625 + }, + { + "epoch": 6.755144646585148, + "grad_norm": 3.461223602294922, + "learning_rate": 4.055234484769757e-05, + "loss": 0.0776, + "step": 22650 + }, + { + "epoch": 6.76260065612884, + "grad_norm": 28.651147842407227, + "learning_rate": 4.0535775827788234e-05, + "loss": 0.0689, + "step": 22675 + }, + { + "epoch": 
6.7700566656725325, + "grad_norm": 5.866575241088867, + "learning_rate": 4.05192068078789e-05, + "loss": 0.0753, + "step": 22700 + }, + { + "epoch": 6.777512675216224, + "grad_norm": 0.052426777780056, + "learning_rate": 4.0502637787969565e-05, + "loss": 0.0496, + "step": 22725 + }, + { + "epoch": 6.784968684759916, + "grad_norm": 8.888331413269043, + "learning_rate": 4.0486068768060235e-05, + "loss": 0.0399, + "step": 22750 + }, + { + "epoch": 6.792424694303609, + "grad_norm": 11.797131538391113, + "learning_rate": 4.0469499748150904e-05, + "loss": 0.0764, + "step": 22775 + }, + { + "epoch": 6.799880703847301, + "grad_norm": 22.82054328918457, + "learning_rate": 4.0452930728241566e-05, + "loss": 0.0664, + "step": 22800 + }, + { + "epoch": 6.807336713390993, + "grad_norm": 2.2873129844665527, + "learning_rate": 4.0436361708332235e-05, + "loss": 0.0954, + "step": 22825 + }, + { + "epoch": 6.814792722934685, + "grad_norm": 9.829035758972168, + "learning_rate": 4.04197926884229e-05, + "loss": 0.0213, + "step": 22850 + }, + { + "epoch": 6.822248732478378, + "grad_norm": 0.41712069511413574, + "learning_rate": 4.040322366851356e-05, + "loss": 0.0699, + "step": 22875 + }, + { + "epoch": 6.82970474202207, + "grad_norm": 22.233823776245117, + "learning_rate": 4.038665464860423e-05, + "loss": 0.0725, + "step": 22900 + }, + { + "epoch": 6.8371607515657615, + "grad_norm": 0.4478222131729126, + "learning_rate": 4.037008562869489e-05, + "loss": 0.043, + "step": 22925 + }, + { + "epoch": 6.844616761109454, + "grad_norm": 10.205151557922363, + "learning_rate": 4.035351660878556e-05, + "loss": 0.0923, + "step": 22950 + }, + { + "epoch": 6.852072770653146, + "grad_norm": 14.356264114379883, + "learning_rate": 4.033694758887622e-05, + "loss": 0.0779, + "step": 22975 + }, + { + "epoch": 6.859528780196839, + "grad_norm": 5.969383239746094, + "learning_rate": 4.0320378568966885e-05, + "loss": 0.0468, + "step": 23000 + }, + { + "epoch": 6.866984789740531, + "grad_norm": 0.151869997382164, + "learning_rate": 4.030380954905756e-05, + "loss": 0.0227, + "step": 23025 + }, + { + "epoch": 6.874440799284223, + "grad_norm": 12.691407203674316, + "learning_rate": 4.028724052914822e-05, + "loss": 0.0199, + "step": 23050 + }, + { + "epoch": 6.881896808827915, + "grad_norm": 0.6804265975952148, + "learning_rate": 4.0270671509238886e-05, + "loss": 0.0734, + "step": 23075 + }, + { + "epoch": 6.8893528183716075, + "grad_norm": 0.10523873567581177, + "learning_rate": 4.0254102489329555e-05, + "loss": 0.0646, + "step": 23100 + }, + { + "epoch": 6.8968088279153, + "grad_norm": 6.399729251861572, + "learning_rate": 4.023753346942022e-05, + "loss": 0.0336, + "step": 23125 + }, + { + "epoch": 6.904264837458992, + "grad_norm": 0.018156565725803375, + "learning_rate": 4.0220964449510886e-05, + "loss": 0.0434, + "step": 23150 + }, + { + "epoch": 6.911720847002684, + "grad_norm": 0.04066069424152374, + "learning_rate": 4.020439542960155e-05, + "loss": 0.0835, + "step": 23175 + }, + { + "epoch": 6.919176856546376, + "grad_norm": 0.20896196365356445, + "learning_rate": 4.018782640969221e-05, + "loss": 0.0646, + "step": 23200 + }, + { + "epoch": 6.926632866090069, + "grad_norm": 0.18396757543087006, + "learning_rate": 4.017125738978288e-05, + "loss": 0.0936, + "step": 23225 + }, + { + "epoch": 6.934088875633761, + "grad_norm": 0.08867733180522919, + "learning_rate": 4.015468836987355e-05, + "loss": 0.1058, + "step": 23250 + }, + { + "epoch": 6.9415448851774535, + "grad_norm": 0.16155028343200684, + "learning_rate": 4.013811934996422e-05, 
+ "loss": 0.1159, + "step": 23275 + }, + { + "epoch": 6.949000894721145, + "grad_norm": 10.4935941696167, + "learning_rate": 4.012155033005488e-05, + "loss": 0.0857, + "step": 23300 + }, + { + "epoch": 6.956456904264837, + "grad_norm": 11.887359619140625, + "learning_rate": 4.010498131014554e-05, + "loss": 0.0939, + "step": 23325 + }, + { + "epoch": 6.96391291380853, + "grad_norm": 0.025529278442263603, + "learning_rate": 4.008841229023621e-05, + "loss": 0.1037, + "step": 23350 + }, + { + "epoch": 6.971368923352222, + "grad_norm": 0.14388221502304077, + "learning_rate": 4.0071843270326874e-05, + "loss": 0.0505, + "step": 23375 + }, + { + "epoch": 6.978824932895914, + "grad_norm": 0.007163532543927431, + "learning_rate": 4.0055274250417544e-05, + "loss": 0.0659, + "step": 23400 + }, + { + "epoch": 6.986280942439606, + "grad_norm": 0.7755250334739685, + "learning_rate": 4.0038705230508206e-05, + "loss": 0.0595, + "step": 23425 + }, + { + "epoch": 6.993736951983299, + "grad_norm": 0.010912524536252022, + "learning_rate": 4.002213621059887e-05, + "loss": 0.0422, + "step": 23450 + }, + { + "epoch": 7.0, + "eval_gen_len": 8.7078, + "eval_loss": 0.11466038972139359, + "eval_rouge1": 95.8676, + "eval_rouge2": 81.828, + "eval_rougeL": 95.8089, + "eval_rougeLsum": 95.7724, + "eval_runtime": 99.9262, + "eval_samples_per_second": 16.782, + "eval_steps_per_second": 4.203, + "step": 23471 + }, + { + "epoch": 7.001192961526991, + "grad_norm": 2.111431837081909, + "learning_rate": 4.000556719068954e-05, + "loss": 0.0429, + "step": 23475 + }, + { + "epoch": 7.008648971070683, + "grad_norm": 0.029296431690454483, + "learning_rate": 3.9988998170780207e-05, + "loss": 0.0895, + "step": 23500 + }, + { + "epoch": 7.016104980614375, + "grad_norm": 0.07177238911390305, + "learning_rate": 3.997242915087087e-05, + "loss": 0.0412, + "step": 23525 + }, + { + "epoch": 7.023560990158067, + "grad_norm": 0.22311842441558838, + "learning_rate": 3.995586013096154e-05, + "loss": 0.0392, + "step": 23550 + }, + { + "epoch": 7.03101699970176, + "grad_norm": 15.43985366821289, + "learning_rate": 3.99392911110522e-05, + "loss": 0.0376, + "step": 23575 + }, + { + "epoch": 7.038473009245452, + "grad_norm": 0.008796346373856068, + "learning_rate": 3.992272209114287e-05, + "loss": 0.0655, + "step": 23600 + }, + { + "epoch": 7.045929018789144, + "grad_norm": 0.4163694977760315, + "learning_rate": 3.990615307123353e-05, + "loss": 0.0532, + "step": 23625 + }, + { + "epoch": 7.053385028332836, + "grad_norm": 0.447316437959671, + "learning_rate": 3.9889584051324194e-05, + "loss": 0.0332, + "step": 23650 + }, + { + "epoch": 7.0608410378765285, + "grad_norm": 0.3781053423881531, + "learning_rate": 3.987301503141486e-05, + "loss": 0.0359, + "step": 23675 + }, + { + "epoch": 7.068297047420221, + "grad_norm": 0.03129143640398979, + "learning_rate": 3.9856446011505526e-05, + "loss": 0.0626, + "step": 23700 + }, + { + "epoch": 7.075753056963913, + "grad_norm": 15.764562606811523, + "learning_rate": 3.9839876991596195e-05, + "loss": 0.0376, + "step": 23725 + }, + { + "epoch": 7.083209066507605, + "grad_norm": 0.12252432852983475, + "learning_rate": 3.9823307971686864e-05, + "loss": 0.003, + "step": 23750 + }, + { + "epoch": 7.090665076051297, + "grad_norm": 0.2217705249786377, + "learning_rate": 3.9806738951777526e-05, + "loss": 0.0337, + "step": 23775 + }, + { + "epoch": 7.09812108559499, + "grad_norm": 1.2270210981369019, + "learning_rate": 3.9790169931868195e-05, + "loss": 0.0035, + "step": 23800 + }, + { + "epoch": 7.105577095138682, + 
"grad_norm": 0.040991149842739105, + "learning_rate": 3.977360091195886e-05, + "loss": 0.0075, + "step": 23825 + }, + { + "epoch": 7.113033104682374, + "grad_norm": 0.00795942172408104, + "learning_rate": 3.975703189204952e-05, + "loss": 0.0672, + "step": 23850 + }, + { + "epoch": 7.120489114226066, + "grad_norm": 18.399192810058594, + "learning_rate": 3.974046287214019e-05, + "loss": 0.039, + "step": 23875 + }, + { + "epoch": 7.127945123769758, + "grad_norm": 0.009747338481247425, + "learning_rate": 3.972389385223085e-05, + "loss": 0.0342, + "step": 23900 + }, + { + "epoch": 7.135401133313451, + "grad_norm": 0.12228219211101532, + "learning_rate": 3.970732483232152e-05, + "loss": 0.0089, + "step": 23925 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.5154474377632141, + "learning_rate": 3.969075581241218e-05, + "loss": 0.0216, + "step": 23950 + }, + { + "epoch": 7.150313152400835, + "grad_norm": 48.93901824951172, + "learning_rate": 3.967418679250285e-05, + "loss": 0.0758, + "step": 23975 + }, + { + "epoch": 7.157769161944527, + "grad_norm": 0.3987184464931488, + "learning_rate": 3.965761777259352e-05, + "loss": 0.032, + "step": 24000 + }, + { + "epoch": 7.16522517148822, + "grad_norm": 6.687148094177246, + "learning_rate": 3.9641048752684183e-05, + "loss": 0.0444, + "step": 24025 + }, + { + "epoch": 7.172681181031912, + "grad_norm": 0.00944861862808466, + "learning_rate": 3.962447973277485e-05, + "loss": 0.0525, + "step": 24050 + }, + { + "epoch": 7.1801371905756035, + "grad_norm": 0.3815286457538605, + "learning_rate": 3.9607910712865515e-05, + "loss": 0.0218, + "step": 24075 + }, + { + "epoch": 7.187593200119296, + "grad_norm": 0.008050195872783661, + "learning_rate": 3.959134169295618e-05, + "loss": 0.093, + "step": 24100 + }, + { + "epoch": 7.195049209662988, + "grad_norm": 13.898192405700684, + "learning_rate": 3.9574772673046846e-05, + "loss": 0.04, + "step": 24125 + }, + { + "epoch": 7.202505219206681, + "grad_norm": 0.0883483812212944, + "learning_rate": 3.955820365313751e-05, + "loss": 0.0139, + "step": 24150 + }, + { + "epoch": 7.209961228750373, + "grad_norm": 2.729593515396118, + "learning_rate": 3.954163463322818e-05, + "loss": 0.0189, + "step": 24175 + }, + { + "epoch": 7.217417238294065, + "grad_norm": 24.50420570373535, + "learning_rate": 3.952506561331884e-05, + "loss": 0.0689, + "step": 24200 + }, + { + "epoch": 7.224873247837757, + "grad_norm": 0.010202550329267979, + "learning_rate": 3.950849659340951e-05, + "loss": 0.0345, + "step": 24225 + }, + { + "epoch": 7.2323292573814495, + "grad_norm": 0.20225048065185547, + "learning_rate": 3.949259033429655e-05, + "loss": 0.0682, + "step": 24250 + }, + { + "epoch": 7.239785266925142, + "grad_norm": 18.914243698120117, + "learning_rate": 3.947602131438721e-05, + "loss": 0.0571, + "step": 24275 + }, + { + "epoch": 7.247241276468834, + "grad_norm": 0.03825189918279648, + "learning_rate": 3.9459452294477874e-05, + "loss": 0.026, + "step": 24300 + }, + { + "epoch": 7.254697286012526, + "grad_norm": 16.95638656616211, + "learning_rate": 3.9442883274568544e-05, + "loss": 0.1054, + "step": 24325 + }, + { + "epoch": 7.262153295556218, + "grad_norm": 0.016385719180107117, + "learning_rate": 3.9426314254659206e-05, + "loss": 0.0445, + "step": 24350 + }, + { + "epoch": 7.269609305099911, + "grad_norm": 0.0067216139286756516, + "learning_rate": 3.940974523474988e-05, + "loss": 0.0768, + "step": 24375 + }, + { + "epoch": 7.277065314643603, + "grad_norm": 0.004308663308620453, + "learning_rate": 3.9393176214840544e-05, + "loss": 
0.0128, + "step": 24400 + }, + { + "epoch": 7.284521324187295, + "grad_norm": 0.029721522703766823, + "learning_rate": 3.9376607194931207e-05, + "loss": 0.0407, + "step": 24425 + }, + { + "epoch": 7.291977333730987, + "grad_norm": 0.32637977600097656, + "learning_rate": 3.9360038175021876e-05, + "loss": 0.0234, + "step": 24450 + }, + { + "epoch": 7.299433343274679, + "grad_norm": 0.228067547082901, + "learning_rate": 3.934346915511254e-05, + "loss": 0.0528, + "step": 24475 + }, + { + "epoch": 7.306889352818372, + "grad_norm": 0.0018244112143293023, + "learning_rate": 3.932690013520321e-05, + "loss": 0.0208, + "step": 24500 + }, + { + "epoch": 7.314345362362064, + "grad_norm": 0.01213071309030056, + "learning_rate": 3.931033111529387e-05, + "loss": 0.0363, + "step": 24525 + }, + { + "epoch": 7.321801371905756, + "grad_norm": 4.35712194442749, + "learning_rate": 3.929376209538453e-05, + "loss": 0.0419, + "step": 24550 + }, + { + "epoch": 7.329257381449448, + "grad_norm": 0.019649688154459, + "learning_rate": 3.92771930754752e-05, + "loss": 0.011, + "step": 24575 + }, + { + "epoch": 7.336713390993141, + "grad_norm": 0.2837681770324707, + "learning_rate": 3.926062405556586e-05, + "loss": 0.0448, + "step": 24600 + }, + { + "epoch": 7.344169400536833, + "grad_norm": 0.0030554018449038267, + "learning_rate": 3.924405503565653e-05, + "loss": 0.0038, + "step": 24625 + }, + { + "epoch": 7.3516254100805245, + "grad_norm": 0.3149765431880951, + "learning_rate": 3.92274860157472e-05, + "loss": 0.0096, + "step": 24650 + }, + { + "epoch": 7.359081419624217, + "grad_norm": 0.08300528675317764, + "learning_rate": 3.9210916995837864e-05, + "loss": 0.0606, + "step": 24675 + }, + { + "epoch": 7.366537429167909, + "grad_norm": 0.7167775630950928, + "learning_rate": 3.919434797592853e-05, + "loss": 0.0071, + "step": 24700 + }, + { + "epoch": 7.373993438711602, + "grad_norm": 0.042454127222299576, + "learning_rate": 3.9177778956019195e-05, + "loss": 0.1068, + "step": 24725 + }, + { + "epoch": 7.381449448255294, + "grad_norm": 0.003952869679778814, + "learning_rate": 3.916120993610986e-05, + "loss": 0.0236, + "step": 24750 + }, + { + "epoch": 7.388905457798986, + "grad_norm": 0.006623697001487017, + "learning_rate": 3.914464091620053e-05, + "loss": 0.0432, + "step": 24775 + }, + { + "epoch": 7.396361467342678, + "grad_norm": 0.2648797631263733, + "learning_rate": 3.912807189629119e-05, + "loss": 0.1026, + "step": 24800 + }, + { + "epoch": 7.4038174768863705, + "grad_norm": 0.038570746779441833, + "learning_rate": 3.911150287638186e-05, + "loss": 0.0174, + "step": 24825 + }, + { + "epoch": 7.411273486430063, + "grad_norm": 0.11163607984781265, + "learning_rate": 3.909493385647253e-05, + "loss": 0.0581, + "step": 24850 + }, + { + "epoch": 7.418729495973755, + "grad_norm": 18.279457092285156, + "learning_rate": 3.907836483656319e-05, + "loss": 0.0378, + "step": 24875 + }, + { + "epoch": 7.426185505517447, + "grad_norm": 14.651384353637695, + "learning_rate": 3.906179581665386e-05, + "loss": 0.0884, + "step": 24900 + }, + { + "epoch": 7.433641515061139, + "grad_norm": 0.08544855564832687, + "learning_rate": 3.904522679674452e-05, + "loss": 0.043, + "step": 24925 + }, + { + "epoch": 7.441097524604832, + "grad_norm": 0.5073758363723755, + "learning_rate": 3.9028657776835183e-05, + "loss": 0.0312, + "step": 24950 + }, + { + "epoch": 7.448553534148524, + "grad_norm": 0.7192637920379639, + "learning_rate": 3.901208875692585e-05, + "loss": 0.0272, + "step": 24975 + }, + { + "epoch": 7.456009543692216, + "grad_norm": 
7.956757068634033, + "learning_rate": 3.8995519737016515e-05, + "loss": 0.0965, + "step": 25000 + }, + { + "epoch": 7.463465553235908, + "grad_norm": 0.006529694423079491, + "learning_rate": 3.8978950717107184e-05, + "loss": 0.0162, + "step": 25025 + }, + { + "epoch": 7.4709215627796, + "grad_norm": 0.005817771423608065, + "learning_rate": 3.8962381697197846e-05, + "loss": 0.0576, + "step": 25050 + }, + { + "epoch": 7.478377572323293, + "grad_norm": 0.005918608978390694, + "learning_rate": 3.8945812677288516e-05, + "loss": 0.0196, + "step": 25075 + }, + { + "epoch": 7.485833581866984, + "grad_norm": 0.0037487195804715157, + "learning_rate": 3.8929243657379185e-05, + "loss": 0.0313, + "step": 25100 + }, + { + "epoch": 7.493289591410677, + "grad_norm": 0.044559087604284286, + "learning_rate": 3.891267463746985e-05, + "loss": 0.0353, + "step": 25125 + }, + { + "epoch": 7.500745600954369, + "grad_norm": 0.0025634621270000935, + "learning_rate": 3.8896105617560516e-05, + "loss": 0.0253, + "step": 25150 + }, + { + "epoch": 7.508201610498062, + "grad_norm": 18.160240173339844, + "learning_rate": 3.887953659765118e-05, + "loss": 0.0418, + "step": 25175 + }, + { + "epoch": 7.515657620041754, + "grad_norm": 0.011235961690545082, + "learning_rate": 3.886296757774184e-05, + "loss": 0.0758, + "step": 25200 + }, + { + "epoch": 7.5231136295854455, + "grad_norm": 6.458895683288574, + "learning_rate": 3.884639855783251e-05, + "loss": 0.0425, + "step": 25225 + }, + { + "epoch": 7.530569639129138, + "grad_norm": 0.8765074610710144, + "learning_rate": 3.882982953792317e-05, + "loss": 0.0208, + "step": 25250 + }, + { + "epoch": 7.53802564867283, + "grad_norm": 12.993375778198242, + "learning_rate": 3.881326051801384e-05, + "loss": 0.0333, + "step": 25275 + }, + { + "epoch": 7.545481658216523, + "grad_norm": 0.10430316627025604, + "learning_rate": 3.8796691498104504e-05, + "loss": 0.2056, + "step": 25300 + }, + { + "epoch": 7.552937667760215, + "grad_norm": 0.07876244187355042, + "learning_rate": 3.878012247819517e-05, + "loss": 0.0489, + "step": 25325 + }, + { + "epoch": 7.560393677303907, + "grad_norm": 16.660846710205078, + "learning_rate": 3.876355345828584e-05, + "loss": 0.2723, + "step": 25350 + }, + { + "epoch": 7.567849686847599, + "grad_norm": 0.11059949547052383, + "learning_rate": 3.8746984438376504e-05, + "loss": 0.0338, + "step": 25375 + }, + { + "epoch": 7.5753056963912915, + "grad_norm": 1.6685644388198853, + "learning_rate": 3.873041541846717e-05, + "loss": 0.0309, + "step": 25400 + }, + { + "epoch": 7.582761705934984, + "grad_norm": 0.3328234851360321, + "learning_rate": 3.8713846398557836e-05, + "loss": 0.0713, + "step": 25425 + }, + { + "epoch": 7.590217715478676, + "grad_norm": 10.004615783691406, + "learning_rate": 3.86972773786485e-05, + "loss": 0.0459, + "step": 25450 + }, + { + "epoch": 7.597673725022368, + "grad_norm": 8.179096221923828, + "learning_rate": 3.868070835873917e-05, + "loss": 0.02, + "step": 25475 + }, + { + "epoch": 7.60512973456606, + "grad_norm": 0.036157842725515366, + "learning_rate": 3.866413933882983e-05, + "loss": 0.0704, + "step": 25500 + }, + { + "epoch": 7.612585744109753, + "grad_norm": 0.1007198840379715, + "learning_rate": 3.864757031892049e-05, + "loss": 0.019, + "step": 25525 + }, + { + "epoch": 7.620041753653445, + "grad_norm": 0.057942017912864685, + "learning_rate": 3.863100129901116e-05, + "loss": 0.0588, + "step": 25550 + }, + { + "epoch": 7.627497763197137, + "grad_norm": 1.3576432466506958, + "learning_rate": 3.861443227910183e-05, + "loss": 0.0124, + 
"step": 25575 + }, + { + "epoch": 7.634953772740829, + "grad_norm": 0.00611503841355443, + "learning_rate": 3.85978632591925e-05, + "loss": 0.0646, + "step": 25600 + }, + { + "epoch": 7.642409782284521, + "grad_norm": 0.041820377111434937, + "learning_rate": 3.858129423928316e-05, + "loss": 0.0667, + "step": 25625 + }, + { + "epoch": 7.649865791828214, + "grad_norm": 4.039573669433594, + "learning_rate": 3.8564725219373824e-05, + "loss": 0.0215, + "step": 25650 + }, + { + "epoch": 7.657321801371905, + "grad_norm": 7.467697620391846, + "learning_rate": 3.854815619946449e-05, + "loss": 0.0054, + "step": 25675 + }, + { + "epoch": 7.664777810915598, + "grad_norm": 0.01705421693623066, + "learning_rate": 3.8531587179555155e-05, + "loss": 0.0225, + "step": 25700 + }, + { + "epoch": 7.67223382045929, + "grad_norm": 0.049625929445028305, + "learning_rate": 3.8515018159645825e-05, + "loss": 0.0338, + "step": 25725 + }, + { + "epoch": 7.6796898300029826, + "grad_norm": 0.2381235659122467, + "learning_rate": 3.849844913973649e-05, + "loss": 0.043, + "step": 25750 + }, + { + "epoch": 7.687145839546675, + "grad_norm": 17.741899490356445, + "learning_rate": 3.848188011982715e-05, + "loss": 0.0511, + "step": 25775 + }, + { + "epoch": 7.6946018490903665, + "grad_norm": 0.02506117708981037, + "learning_rate": 3.846531109991782e-05, + "loss": 0.0804, + "step": 25800 + }, + { + "epoch": 7.702057858634059, + "grad_norm": 0.018649157136678696, + "learning_rate": 3.844874208000849e-05, + "loss": 0.0479, + "step": 25825 + }, + { + "epoch": 7.709513868177751, + "grad_norm": 0.1378479301929474, + "learning_rate": 3.843217306009915e-05, + "loss": 0.0695, + "step": 25850 + }, + { + "epoch": 7.716969877721444, + "grad_norm": 0.040336690843105316, + "learning_rate": 3.841560404018982e-05, + "loss": 0.0016, + "step": 25875 + }, + { + "epoch": 7.724425887265136, + "grad_norm": 0.6665530800819397, + "learning_rate": 3.839903502028048e-05, + "loss": 0.0375, + "step": 25900 + }, + { + "epoch": 7.731881896808828, + "grad_norm": 20.52546501159668, + "learning_rate": 3.838246600037115e-05, + "loss": 0.0571, + "step": 25925 + }, + { + "epoch": 7.73933790635252, + "grad_norm": 2.0904996395111084, + "learning_rate": 3.836589698046181e-05, + "loss": 0.0273, + "step": 25950 + }, + { + "epoch": 7.7467939158962125, + "grad_norm": 10.46048355102539, + "learning_rate": 3.8349327960552475e-05, + "loss": 0.089, + "step": 25975 + }, + { + "epoch": 7.754249925439905, + "grad_norm": 0.045314982533454895, + "learning_rate": 3.8332758940643144e-05, + "loss": 0.0376, + "step": 26000 + }, + { + "epoch": 7.761705934983596, + "grad_norm": 11.569523811340332, + "learning_rate": 3.8316189920733807e-05, + "loss": 0.07, + "step": 26025 + }, + { + "epoch": 7.769161944527289, + "grad_norm": 0.2440641224384308, + "learning_rate": 3.8299620900824476e-05, + "loss": 0.0491, + "step": 26050 + }, + { + "epoch": 7.776617954070981, + "grad_norm": 5.467372894287109, + "learning_rate": 3.8283051880915145e-05, + "loss": 0.0103, + "step": 26075 + }, + { + "epoch": 7.784073963614674, + "grad_norm": 0.07725433260202408, + "learning_rate": 3.826648286100581e-05, + "loss": 0.0621, + "step": 26100 + }, + { + "epoch": 7.791529973158365, + "grad_norm": 0.26000112295150757, + "learning_rate": 3.8249913841096476e-05, + "loss": 0.0456, + "step": 26125 + }, + { + "epoch": 7.798985982702058, + "grad_norm": 0.05612451583147049, + "learning_rate": 3.823334482118714e-05, + "loss": 0.0386, + "step": 26150 + }, + { + "epoch": 7.80644199224575, + "grad_norm": 0.3714951276779175, + 
"learning_rate": 3.821677580127781e-05, + "loss": 0.0403, + "step": 26175 + }, + { + "epoch": 7.813898001789442, + "grad_norm": 0.0035816803574562073, + "learning_rate": 3.820020678136847e-05, + "loss": 0.0083, + "step": 26200 + }, + { + "epoch": 7.821354011333135, + "grad_norm": 10.295557975769043, + "learning_rate": 3.818363776145913e-05, + "loss": 0.0669, + "step": 26225 + }, + { + "epoch": 7.828810020876826, + "grad_norm": 0.02752000279724598, + "learning_rate": 3.81670687415498e-05, + "loss": 0.0259, + "step": 26250 + }, + { + "epoch": 7.836266030420519, + "grad_norm": 0.008338281884789467, + "learning_rate": 3.8150499721640464e-05, + "loss": 0.0456, + "step": 26275 + }, + { + "epoch": 7.843722039964211, + "grad_norm": 0.04406023770570755, + "learning_rate": 3.813393070173113e-05, + "loss": 0.0075, + "step": 26300 + }, + { + "epoch": 7.8511780495079035, + "grad_norm": 3.312938928604126, + "learning_rate": 3.81173616818218e-05, + "loss": 0.0602, + "step": 26325 + }, + { + "epoch": 7.858634059051596, + "grad_norm": 14.139311790466309, + "learning_rate": 3.8100792661912464e-05, + "loss": 0.0345, + "step": 26350 + }, + { + "epoch": 7.8660900685952875, + "grad_norm": 0.5736969113349915, + "learning_rate": 3.8084223642003134e-05, + "loss": 0.0441, + "step": 26375 + }, + { + "epoch": 7.87354607813898, + "grad_norm": 1.0199308395385742, + "learning_rate": 3.8067654622093796e-05, + "loss": 0.096, + "step": 26400 + }, + { + "epoch": 7.881002087682672, + "grad_norm": 36.68028259277344, + "learning_rate": 3.805108560218446e-05, + "loss": 0.0979, + "step": 26425 + }, + { + "epoch": 7.888458097226365, + "grad_norm": 0.0884442925453186, + "learning_rate": 3.8035179343071505e-05, + "loss": 0.0646, + "step": 26450 + }, + { + "epoch": 7.895914106770057, + "grad_norm": 4.98067045211792, + "learning_rate": 3.801861032316217e-05, + "loss": 0.0605, + "step": 26475 + }, + { + "epoch": 7.903370116313749, + "grad_norm": 0.042504098266363144, + "learning_rate": 3.800204130325283e-05, + "loss": 0.0354, + "step": 26500 + }, + { + "epoch": 7.910826125857441, + "grad_norm": 0.008564049378037453, + "learning_rate": 3.79854722833435e-05, + "loss": 0.0323, + "step": 26525 + }, + { + "epoch": 7.9182821354011335, + "grad_norm": 0.30509287118911743, + "learning_rate": 3.796890326343416e-05, + "loss": 0.0237, + "step": 26550 + }, + { + "epoch": 7.925738144944826, + "grad_norm": 11.284287452697754, + "learning_rate": 3.795233424352483e-05, + "loss": 0.0339, + "step": 26575 + }, + { + "epoch": 7.933194154488517, + "grad_norm": 0.008676270954310894, + "learning_rate": 3.793576522361549e-05, + "loss": 0.04, + "step": 26600 + }, + { + "epoch": 7.94065016403221, + "grad_norm": 1.3062283992767334, + "learning_rate": 3.7919196203706155e-05, + "loss": 0.0101, + "step": 26625 + }, + { + "epoch": 7.948106173575902, + "grad_norm": 0.21713188290596008, + "learning_rate": 3.7902627183796825e-05, + "loss": 0.1142, + "step": 26650 + }, + { + "epoch": 7.955562183119595, + "grad_norm": 0.06104138121008873, + "learning_rate": 3.788605816388749e-05, + "loss": 0.0221, + "step": 26675 + }, + { + "epoch": 7.963018192663286, + "grad_norm": 0.0560293085873127, + "learning_rate": 3.786948914397816e-05, + "loss": 0.0229, + "step": 26700 + }, + { + "epoch": 7.9704742022069786, + "grad_norm": 0.013059995137155056, + "learning_rate": 3.7852920124068825e-05, + "loss": 0.0233, + "step": 26725 + }, + { + "epoch": 7.977930211750671, + "grad_norm": 13.456666946411133, + "learning_rate": 3.783635110415949e-05, + "loss": 0.0191, + "step": 26750 + }, + { + 
"epoch": 7.985386221294363, + "grad_norm": 11.048165321350098, + "learning_rate": 3.7819782084250157e-05, + "loss": 0.0974, + "step": 26775 + }, + { + "epoch": 7.992842230838056, + "grad_norm": 0.036631595343351364, + "learning_rate": 3.780321306434082e-05, + "loss": 0.0245, + "step": 26800 + }, + { + "epoch": 8.0, + "eval_gen_len": 8.6917, + "eval_loss": 0.10363117605447769, + "eval_rouge1": 95.9929, + "eval_rouge2": 82.7618, + "eval_rougeL": 95.9431, + "eval_rougeLsum": 95.9431, + "eval_runtime": 96.085, + "eval_samples_per_second": 17.453, + "eval_steps_per_second": 4.371, + "step": 26824 + }, + { + "epoch": 8.000298240381747, + "grad_norm": 1.841543197631836, + "learning_rate": 3.778664404443149e-05, + "loss": 0.0217, + "step": 26825 + }, + { + "epoch": 8.00775424992544, + "grad_norm": 0.07992962747812271, + "learning_rate": 3.777007502452215e-05, + "loss": 0.0396, + "step": 26850 + }, + { + "epoch": 8.015210259469132, + "grad_norm": 0.06587328016757965, + "learning_rate": 3.775350600461281e-05, + "loss": 0.0426, + "step": 26875 + }, + { + "epoch": 8.022666269012824, + "grad_norm": 0.03052508272230625, + "learning_rate": 3.773693698470348e-05, + "loss": 0.0516, + "step": 26900 + }, + { + "epoch": 8.030122278556517, + "grad_norm": 1.1213061809539795, + "learning_rate": 3.772036796479415e-05, + "loss": 0.0276, + "step": 26925 + }, + { + "epoch": 8.037578288100208, + "grad_norm": 5.817966938018799, + "learning_rate": 3.770379894488481e-05, + "loss": 0.0083, + "step": 26950 + }, + { + "epoch": 8.045034297643902, + "grad_norm": 0.009192215278744698, + "learning_rate": 3.768722992497548e-05, + "loss": 0.0139, + "step": 26975 + }, + { + "epoch": 8.052490307187593, + "grad_norm": 0.2627590000629425, + "learning_rate": 3.7670660905066145e-05, + "loss": 0.0077, + "step": 27000 + }, + { + "epoch": 8.059946316731285, + "grad_norm": 11.902652740478516, + "learning_rate": 3.7654091885156814e-05, + "loss": 0.0139, + "step": 27025 + }, + { + "epoch": 8.067402326274978, + "grad_norm": 0.4330286979675293, + "learning_rate": 3.7637522865247476e-05, + "loss": 0.0087, + "step": 27050 + }, + { + "epoch": 8.07485833581867, + "grad_norm": 10.134819984436035, + "learning_rate": 3.762095384533814e-05, + "loss": 0.0643, + "step": 27075 + }, + { + "epoch": 8.082314345362363, + "grad_norm": 0.8864652514457703, + "learning_rate": 3.760438482542881e-05, + "loss": 0.0159, + "step": 27100 + }, + { + "epoch": 8.089770354906054, + "grad_norm": 0.05642193183302879, + "learning_rate": 3.758781580551947e-05, + "loss": 0.0017, + "step": 27125 + }, + { + "epoch": 8.097226364449746, + "grad_norm": 0.07227706164121628, + "learning_rate": 3.757124678561014e-05, + "loss": 0.0028, + "step": 27150 + }, + { + "epoch": 8.10468237399344, + "grad_norm": 0.06436634063720703, + "learning_rate": 3.755467776570081e-05, + "loss": 0.0141, + "step": 27175 + }, + { + "epoch": 8.11213838353713, + "grad_norm": 0.09210552275180817, + "learning_rate": 3.753810874579147e-05, + "loss": 0.0365, + "step": 27200 + }, + { + "epoch": 8.119594393080822, + "grad_norm": 0.0035904215183109045, + "learning_rate": 3.752153972588214e-05, + "loss": 0.0304, + "step": 27225 + }, + { + "epoch": 8.127050402624516, + "grad_norm": 67.14472198486328, + "learning_rate": 3.75049707059728e-05, + "loss": 0.0539, + "step": 27250 + }, + { + "epoch": 8.134506412168207, + "grad_norm": 0.2653847336769104, + "learning_rate": 3.748840168606347e-05, + "loss": 0.0454, + "step": 27275 + }, + { + "epoch": 8.1419624217119, + "grad_norm": 0.029458891600370407, + "learning_rate": 
3.7471832666154134e-05, + "loss": 0.0262, + "step": 27300 + }, + { + "epoch": 8.149418431255592, + "grad_norm": 0.08708936721086502, + "learning_rate": 3.7455263646244796e-05, + "loss": 0.0798, + "step": 27325 + }, + { + "epoch": 8.156874440799283, + "grad_norm": 0.065219946205616, + "learning_rate": 3.7438694626335465e-05, + "loss": 0.022, + "step": 27350 + }, + { + "epoch": 8.164330450342977, + "grad_norm": 0.05592311546206474, + "learning_rate": 3.742212560642613e-05, + "loss": 0.039, + "step": 27375 + }, + { + "epoch": 8.171786459886668, + "grad_norm": 5.338498592376709, + "learning_rate": 3.7405556586516796e-05, + "loss": 0.0417, + "step": 27400 + }, + { + "epoch": 8.179242469430362, + "grad_norm": 0.0057130069471895695, + "learning_rate": 3.7388987566607466e-05, + "loss": 0.0232, + "step": 27425 + }, + { + "epoch": 8.186698478974053, + "grad_norm": 14.00313949584961, + "learning_rate": 3.737241854669813e-05, + "loss": 0.0464, + "step": 27450 + }, + { + "epoch": 8.194154488517745, + "grad_norm": 0.15262223780155182, + "learning_rate": 3.735651228758517e-05, + "loss": 0.0663, + "step": 27475 + }, + { + "epoch": 8.201610498061438, + "grad_norm": 0.01996547356247902, + "learning_rate": 3.733994326767583e-05, + "loss": 0.0176, + "step": 27500 + }, + { + "epoch": 8.20906650760513, + "grad_norm": 0.022581912577152252, + "learning_rate": 3.732337424776649e-05, + "loss": 0.0375, + "step": 27525 + }, + { + "epoch": 8.216522517148823, + "grad_norm": 0.019385678693652153, + "learning_rate": 3.730680522785716e-05, + "loss": 0.0203, + "step": 27550 + }, + { + "epoch": 8.223978526692514, + "grad_norm": 0.02384263090789318, + "learning_rate": 3.729023620794783e-05, + "loss": 0.0365, + "step": 27575 + }, + { + "epoch": 8.231434536236206, + "grad_norm": 0.004445483908057213, + "learning_rate": 3.7273667188038494e-05, + "loss": 0.0422, + "step": 27600 + }, + { + "epoch": 8.238890545779899, + "grad_norm": 0.005683319177478552, + "learning_rate": 3.725709816812916e-05, + "loss": 0.0154, + "step": 27625 + }, + { + "epoch": 8.24634655532359, + "grad_norm": 0.013689364306628704, + "learning_rate": 3.7240529148219825e-05, + "loss": 0.044, + "step": 27650 + }, + { + "epoch": 8.253802564867282, + "grad_norm": 0.01799396611750126, + "learning_rate": 3.7223960128310494e-05, + "loss": 0.0294, + "step": 27675 + }, + { + "epoch": 8.261258574410975, + "grad_norm": 0.0023462409153580666, + "learning_rate": 3.7207391108401157e-05, + "loss": 0.0425, + "step": 27700 + }, + { + "epoch": 8.268714583954667, + "grad_norm": 0.02442399598658085, + "learning_rate": 3.7190822088491826e-05, + "loss": 0.003, + "step": 27725 + }, + { + "epoch": 8.27617059349836, + "grad_norm": 0.022840287536382675, + "learning_rate": 3.717425306858249e-05, + "loss": 0.0277, + "step": 27750 + }, + { + "epoch": 8.283626603042052, + "grad_norm": 0.3185962736606598, + "learning_rate": 3.715768404867315e-05, + "loss": 0.0601, + "step": 27775 + }, + { + "epoch": 8.291082612585743, + "grad_norm": 0.03330976143479347, + "learning_rate": 3.714111502876382e-05, + "loss": 0.0329, + "step": 27800 + }, + { + "epoch": 8.298538622129437, + "grad_norm": 0.036625757813453674, + "learning_rate": 3.712454600885449e-05, + "loss": 0.0315, + "step": 27825 + }, + { + "epoch": 8.305994631673128, + "grad_norm": 0.5965529084205627, + "learning_rate": 3.710797698894515e-05, + "loss": 0.0215, + "step": 27850 + }, + { + "epoch": 8.313450641216821, + "grad_norm": 0.04731619358062744, + "learning_rate": 3.709140796903582e-05, + "loss": 0.0041, + "step": 27875 + }, + { + 
"epoch": 8.320906650760513, + "grad_norm": 0.20525537431240082, + "learning_rate": 3.707483894912648e-05, + "loss": 0.0223, + "step": 27900 + }, + { + "epoch": 8.328362660304204, + "grad_norm": 4.396222114562988, + "learning_rate": 3.705826992921715e-05, + "loss": 0.0133, + "step": 27925 + }, + { + "epoch": 8.335818669847898, + "grad_norm": 0.1394113004207611, + "learning_rate": 3.7041700909307814e-05, + "loss": 0.0265, + "step": 27950 + }, + { + "epoch": 8.34327467939159, + "grad_norm": 81.92708587646484, + "learning_rate": 3.7025131889398476e-05, + "loss": 0.0188, + "step": 27975 + }, + { + "epoch": 8.350730688935283, + "grad_norm": 0.15940868854522705, + "learning_rate": 3.7008562869489145e-05, + "loss": 0.0396, + "step": 28000 + }, + { + "epoch": 8.358186698478974, + "grad_norm": 0.8631670475006104, + "learning_rate": 3.699199384957981e-05, + "loss": 0.0064, + "step": 28025 + }, + { + "epoch": 8.365642708022666, + "grad_norm": 0.06859345734119415, + "learning_rate": 3.697542482967048e-05, + "loss": 0.0251, + "step": 28050 + }, + { + "epoch": 8.373098717566359, + "grad_norm": 0.007882620207965374, + "learning_rate": 3.6958855809761146e-05, + "loss": 0.0147, + "step": 28075 + }, + { + "epoch": 8.38055472711005, + "grad_norm": 0.0038322643376886845, + "learning_rate": 3.694228678985181e-05, + "loss": 0.1021, + "step": 28100 + }, + { + "epoch": 8.388010736653744, + "grad_norm": 1.0349613428115845, + "learning_rate": 3.692571776994248e-05, + "loss": 0.071, + "step": 28125 + }, + { + "epoch": 8.395466746197435, + "grad_norm": 6.251139163970947, + "learning_rate": 3.690914875003314e-05, + "loss": 0.0183, + "step": 28150 + }, + { + "epoch": 8.402922755741127, + "grad_norm": 31.23532485961914, + "learning_rate": 3.68925797301238e-05, + "loss": 0.0176, + "step": 28175 + }, + { + "epoch": 8.41037876528482, + "grad_norm": 0.018022626638412476, + "learning_rate": 3.687601071021447e-05, + "loss": 0.0045, + "step": 28200 + }, + { + "epoch": 8.417834774828512, + "grad_norm": 0.022061647847294807, + "learning_rate": 3.6859441690305134e-05, + "loss": 0.0017, + "step": 28225 + }, + { + "epoch": 8.425290784372205, + "grad_norm": 7.409425258636475, + "learning_rate": 3.68428726703958e-05, + "loss": 0.0454, + "step": 28250 + }, + { + "epoch": 8.432746793915896, + "grad_norm": 0.879426896572113, + "learning_rate": 3.6826303650486465e-05, + "loss": 0.002, + "step": 28275 + }, + { + "epoch": 8.440202803459588, + "grad_norm": 0.8661222457885742, + "learning_rate": 3.6809734630577134e-05, + "loss": 0.0513, + "step": 28300 + }, + { + "epoch": 8.447658813003281, + "grad_norm": 0.04469098895788193, + "learning_rate": 3.67931656106678e-05, + "loss": 0.0269, + "step": 28325 + }, + { + "epoch": 8.455114822546973, + "grad_norm": 0.0013134300243109465, + "learning_rate": 3.6776596590758466e-05, + "loss": 0.0306, + "step": 28350 + }, + { + "epoch": 8.462570832090664, + "grad_norm": 0.17150649428367615, + "learning_rate": 3.6760027570849135e-05, + "loss": 0.0687, + "step": 28375 + }, + { + "epoch": 8.470026841634358, + "grad_norm": 0.03961332514882088, + "learning_rate": 3.67434585509398e-05, + "loss": 0.007, + "step": 28400 + }, + { + "epoch": 8.47748285117805, + "grad_norm": 0.6865983009338379, + "learning_rate": 3.672688953103046e-05, + "loss": 0.0586, + "step": 28425 + }, + { + "epoch": 8.484938860721742, + "grad_norm": 0.21008718013763428, + "learning_rate": 3.671032051112113e-05, + "loss": 0.0705, + "step": 28450 + }, + { + "epoch": 8.492394870265434, + "grad_norm": 12.806836128234863, + "learning_rate": 
3.669375149121179e-05, + "loss": 0.0316, + "step": 28475 + }, + { + "epoch": 8.499850879809125, + "grad_norm": 0.010018163360655308, + "learning_rate": 3.667718247130246e-05, + "loss": 0.0202, + "step": 28500 + }, + { + "epoch": 8.507306889352819, + "grad_norm": 0.01900198683142662, + "learning_rate": 3.666061345139313e-05, + "loss": 0.0562, + "step": 28525 + }, + { + "epoch": 8.51476289889651, + "grad_norm": 0.003071998944506049, + "learning_rate": 3.664404443148379e-05, + "loss": 0.0144, + "step": 28550 + }, + { + "epoch": 8.522218908440204, + "grad_norm": 0.02706441655755043, + "learning_rate": 3.662747541157446e-05, + "loss": 0.0166, + "step": 28575 + }, + { + "epoch": 8.529674917983895, + "grad_norm": 9.968490600585938, + "learning_rate": 3.661090639166512e-05, + "loss": 0.0216, + "step": 28600 + }, + { + "epoch": 8.537130927527587, + "grad_norm": 0.0901758223772049, + "learning_rate": 3.6594337371755785e-05, + "loss": 0.0452, + "step": 28625 + }, + { + "epoch": 8.54458693707128, + "grad_norm": 0.07754506915807724, + "learning_rate": 3.6577768351846454e-05, + "loss": 0.0099, + "step": 28650 + }, + { + "epoch": 8.552042946614971, + "grad_norm": 0.004118072800338268, + "learning_rate": 3.656119933193712e-05, + "loss": 0.022, + "step": 28675 + }, + { + "epoch": 8.559498956158663, + "grad_norm": 0.017649231478571892, + "learning_rate": 3.6544630312027786e-05, + "loss": 0.0456, + "step": 28700 + }, + { + "epoch": 8.566954965702356, + "grad_norm": 0.0039413124322891235, + "learning_rate": 3.652806129211845e-05, + "loss": 0.0079, + "step": 28725 + }, + { + "epoch": 8.574410975246048, + "grad_norm": 0.003399114590138197, + "learning_rate": 3.651149227220911e-05, + "loss": 0.0245, + "step": 28750 + }, + { + "epoch": 8.581866984789741, + "grad_norm": 44.0549430847168, + "learning_rate": 3.6494923252299786e-05, + "loss": 0.0432, + "step": 28775 + }, + { + "epoch": 8.589322994333433, + "grad_norm": 0.0016269719926640391, + "learning_rate": 3.647835423239045e-05, + "loss": 0.0119, + "step": 28800 + }, + { + "epoch": 8.596779003877124, + "grad_norm": 0.00740943755954504, + "learning_rate": 3.646178521248111e-05, + "loss": 0.0185, + "step": 28825 + }, + { + "epoch": 8.604235013420817, + "grad_norm": 0.24125465750694275, + "learning_rate": 3.644521619257178e-05, + "loss": 0.0082, + "step": 28850 + }, + { + "epoch": 8.611691022964509, + "grad_norm": 2.92952299118042, + "learning_rate": 3.642864717266244e-05, + "loss": 0.0682, + "step": 28875 + }, + { + "epoch": 8.619147032508202, + "grad_norm": 10.032451629638672, + "learning_rate": 3.641207815275311e-05, + "loss": 0.0066, + "step": 28900 + }, + { + "epoch": 8.626603042051894, + "grad_norm": 0.9171352982521057, + "learning_rate": 3.6395509132843774e-05, + "loss": 0.0286, + "step": 28925 + }, + { + "epoch": 8.634059051595585, + "grad_norm": 0.013421298936009407, + "learning_rate": 3.637894011293444e-05, + "loss": 0.0504, + "step": 28950 + }, + { + "epoch": 8.641515061139279, + "grad_norm": 0.06524740159511566, + "learning_rate": 3.6362371093025105e-05, + "loss": 0.0185, + "step": 28975 + }, + { + "epoch": 8.64897107068297, + "grad_norm": 0.033882249146699905, + "learning_rate": 3.6345802073115775e-05, + "loss": 0.0157, + "step": 29000 + }, + { + "epoch": 8.656427080226663, + "grad_norm": 0.06428802013397217, + "learning_rate": 3.6329233053206444e-05, + "loss": 0.0338, + "step": 29025 + }, + { + "epoch": 8.663883089770355, + "grad_norm": 0.01688966527581215, + "learning_rate": 3.6312664033297106e-05, + "loss": 0.0063, + "step": 29050 + }, + { + "epoch": 
8.671339099314046, + "grad_norm": 23.71369743347168, + "learning_rate": 3.629609501338777e-05, + "loss": 0.031, + "step": 29075 + }, + { + "epoch": 8.67879510885774, + "grad_norm": 0.0019492580322548747, + "learning_rate": 3.627952599347844e-05, + "loss": 0.0749, + "step": 29100 + }, + { + "epoch": 8.686251118401431, + "grad_norm": 15.162439346313477, + "learning_rate": 3.62629569735691e-05, + "loss": 0.0487, + "step": 29125 + }, + { + "epoch": 8.693707127945125, + "grad_norm": 0.5042064189910889, + "learning_rate": 3.624638795365977e-05, + "loss": 0.023, + "step": 29150 + }, + { + "epoch": 8.701163137488816, + "grad_norm": 3.6821882724761963, + "learning_rate": 3.622981893375043e-05, + "loss": 0.0714, + "step": 29175 + }, + { + "epoch": 8.708619147032508, + "grad_norm": 6.568551540374756, + "learning_rate": 3.6213249913841094e-05, + "loss": 0.0341, + "step": 29200 + }, + { + "epoch": 8.716075156576201, + "grad_norm": 0.3266686201095581, + "learning_rate": 3.619668089393176e-05, + "loss": 0.0676, + "step": 29225 + }, + { + "epoch": 8.723531166119892, + "grad_norm": 0.014193633571267128, + "learning_rate": 3.618011187402243e-05, + "loss": 0.0166, + "step": 29250 + }, + { + "epoch": 8.730987175663586, + "grad_norm": 0.0672958567738533, + "learning_rate": 3.6163542854113094e-05, + "loss": 0.0032, + "step": 29275 + }, + { + "epoch": 8.738443185207277, + "grad_norm": 0.007883368991315365, + "learning_rate": 3.614697383420376e-05, + "loss": 0.0449, + "step": 29300 + }, + { + "epoch": 8.745899194750969, + "grad_norm": 20.07937240600586, + "learning_rate": 3.6130404814294426e-05, + "loss": 0.0445, + "step": 29325 + }, + { + "epoch": 8.753355204294662, + "grad_norm": 5.050912857055664, + "learning_rate": 3.6113835794385095e-05, + "loss": 0.0058, + "step": 29350 + }, + { + "epoch": 8.760811213838354, + "grad_norm": 0.019583450630307198, + "learning_rate": 3.609726677447576e-05, + "loss": 0.0092, + "step": 29375 + }, + { + "epoch": 8.768267223382045, + "grad_norm": 6.366359710693359, + "learning_rate": 3.608069775456642e-05, + "loss": 0.0361, + "step": 29400 + }, + { + "epoch": 8.775723232925738, + "grad_norm": 0.012687691487371922, + "learning_rate": 3.606412873465709e-05, + "loss": 0.0471, + "step": 29425 + }, + { + "epoch": 8.78317924246943, + "grad_norm": 0.04783850535750389, + "learning_rate": 3.604755971474775e-05, + "loss": 0.0375, + "step": 29450 + }, + { + "epoch": 8.790635252013123, + "grad_norm": 0.058531004935503006, + "learning_rate": 3.603099069483842e-05, + "loss": 0.0109, + "step": 29475 + }, + { + "epoch": 8.798091261556815, + "grad_norm": 0.006072982680052519, + "learning_rate": 3.601442167492909e-05, + "loss": 0.0275, + "step": 29500 + }, + { + "epoch": 8.805547271100506, + "grad_norm": 0.03280794247984886, + "learning_rate": 3.599785265501975e-05, + "loss": 0.0142, + "step": 29525 + }, + { + "epoch": 8.8130032806442, + "grad_norm": 0.36898598074913025, + "learning_rate": 3.598128363511042e-05, + "loss": 0.0063, + "step": 29550 + }, + { + "epoch": 8.820459290187891, + "grad_norm": 2.096160888671875, + "learning_rate": 3.596471461520108e-05, + "loss": 0.012, + "step": 29575 + }, + { + "epoch": 8.827915299731584, + "grad_norm": 0.62769615650177, + "learning_rate": 3.594814559529175e-05, + "loss": 0.0109, + "step": 29600 + }, + { + "epoch": 8.835371309275276, + "grad_norm": 9.831634521484375, + "learning_rate": 3.5931576575382414e-05, + "loss": 0.0797, + "step": 29625 + }, + { + "epoch": 8.842827318818967, + "grad_norm": 9.49618911743164, + "learning_rate": 3.591500755547308e-05, + 
"loss": 0.0206, + "step": 29650 + }, + { + "epoch": 8.85028332836266, + "grad_norm": 73.54988098144531, + "learning_rate": 3.5898438535563746e-05, + "loss": 0.0308, + "step": 29675 + }, + { + "epoch": 8.857739337906352, + "grad_norm": 0.0006858339766040444, + "learning_rate": 3.588186951565441e-05, + "loss": 0.02, + "step": 29700 + }, + { + "epoch": 8.865195347450046, + "grad_norm": 0.012461572885513306, + "learning_rate": 3.586530049574508e-05, + "loss": 0.0373, + "step": 29725 + }, + { + "epoch": 8.872651356993737, + "grad_norm": 0.04995008185505867, + "learning_rate": 3.5848731475835747e-05, + "loss": 0.0527, + "step": 29750 + }, + { + "epoch": 8.880107366537429, + "grad_norm": 2.493194103240967, + "learning_rate": 3.583216245592641e-05, + "loss": 0.0208, + "step": 29775 + }, + { + "epoch": 8.887563376081122, + "grad_norm": 0.01812615804374218, + "learning_rate": 3.581559343601708e-05, + "loss": 0.0165, + "step": 29800 + }, + { + "epoch": 8.895019385624813, + "grad_norm": 0.01689509116113186, + "learning_rate": 3.579902441610774e-05, + "loss": 0.0085, + "step": 29825 + }, + { + "epoch": 8.902475395168505, + "grad_norm": 0.011237064376473427, + "learning_rate": 3.57824553961984e-05, + "loss": 0.0257, + "step": 29850 + }, + { + "epoch": 8.909931404712198, + "grad_norm": 0.009161061607301235, + "learning_rate": 3.576588637628907e-05, + "loss": 0.0231, + "step": 29875 + }, + { + "epoch": 8.91738741425589, + "grad_norm": 0.9806844592094421, + "learning_rate": 3.5749317356379734e-05, + "loss": 0.0841, + "step": 29900 + }, + { + "epoch": 8.924843423799583, + "grad_norm": 0.01535357441753149, + "learning_rate": 3.57327483364704e-05, + "loss": 0.0207, + "step": 29925 + }, + { + "epoch": 8.932299433343275, + "grad_norm": 0.054871998727321625, + "learning_rate": 3.5716179316561066e-05, + "loss": 0.0079, + "step": 29950 + }, + { + "epoch": 8.939755442886966, + "grad_norm": 0.011252072639763355, + "learning_rate": 3.5699610296651735e-05, + "loss": 0.05, + "step": 29975 + }, + { + "epoch": 8.94721145243066, + "grad_norm": 0.03132103383541107, + "learning_rate": 3.5683041276742404e-05, + "loss": 0.0051, + "step": 30000 + }, + { + "epoch": 8.954667461974351, + "grad_norm": 0.07881677895784378, + "learning_rate": 3.5666472256833066e-05, + "loss": 0.0274, + "step": 30025 + }, + { + "epoch": 8.962123471518044, + "grad_norm": 0.00470997067168355, + "learning_rate": 3.564990323692373e-05, + "loss": 0.029, + "step": 30050 + }, + { + "epoch": 8.969579481061736, + "grad_norm": 4.142818450927734, + "learning_rate": 3.56333342170144e-05, + "loss": 0.0153, + "step": 30075 + }, + { + "epoch": 8.977035490605427, + "grad_norm": 1.2706577777862549, + "learning_rate": 3.561676519710506e-05, + "loss": 0.0767, + "step": 30100 + }, + { + "epoch": 8.98449150014912, + "grad_norm": 0.01267695240676403, + "learning_rate": 3.560019617719573e-05, + "loss": 0.0846, + "step": 30125 + }, + { + "epoch": 8.991947509692812, + "grad_norm": 7.920314788818359, + "learning_rate": 3.558362715728639e-05, + "loss": 0.0442, + "step": 30150 + }, + { + "epoch": 8.999403519236505, + "grad_norm": 34.99578094482422, + "learning_rate": 3.556705813737706e-05, + "loss": 0.0273, + "step": 30175 + }, + { + "epoch": 9.0, + "eval_gen_len": 8.7102, + "eval_loss": 0.083512082695961, + "eval_rouge1": 97.0896, + "eval_rouge2": 84.4122, + "eval_rougeL": 97.0507, + "eval_rougeLsum": 97.0591, + "eval_runtime": 101.145, + "eval_samples_per_second": 16.58, + "eval_steps_per_second": 4.152, + "step": 30177 + }, + { + "epoch": 9.006859528780197, + "grad_norm": 
0.011261310428380966, + "learning_rate": 3.555048911746773e-05, + "loss": 0.0355, + "step": 30200 + }, + { + "epoch": 9.014315538323888, + "grad_norm": 0.014239492826163769, + "learning_rate": 3.553392009755839e-05, + "loss": 0.0229, + "step": 30225 + }, + { + "epoch": 9.021771547867582, + "grad_norm": 0.02628404088318348, + "learning_rate": 3.551735107764906e-05, + "loss": 0.0127, + "step": 30250 + }, + { + "epoch": 9.029227557411273, + "grad_norm": 0.0303230881690979, + "learning_rate": 3.5500782057739723e-05, + "loss": 0.0052, + "step": 30275 + }, + { + "epoch": 9.036683566954967, + "grad_norm": 7.405888080596924, + "learning_rate": 3.5484213037830386e-05, + "loss": 0.0111, + "step": 30300 + }, + { + "epoch": 9.044139576498658, + "grad_norm": 2.99228835105896, + "learning_rate": 3.5467644017921055e-05, + "loss": 0.034, + "step": 30325 + }, + { + "epoch": 9.05159558604235, + "grad_norm": 0.05243349075317383, + "learning_rate": 3.545107499801172e-05, + "loss": 0.017, + "step": 30350 + }, + { + "epoch": 9.059051595586043, + "grad_norm": 0.019075891003012657, + "learning_rate": 3.5434505978102386e-05, + "loss": 0.004, + "step": 30375 + }, + { + "epoch": 9.066507605129734, + "grad_norm": 0.004510013852268457, + "learning_rate": 3.541793695819305e-05, + "loss": 0.01, + "step": 30400 + }, + { + "epoch": 9.073963614673426, + "grad_norm": 0.47157201170921326, + "learning_rate": 3.540136793828371e-05, + "loss": 0.0433, + "step": 30425 + }, + { + "epoch": 9.08141962421712, + "grad_norm": 0.05149286240339279, + "learning_rate": 3.538479891837439e-05, + "loss": 0.0823, + "step": 30450 + }, + { + "epoch": 9.08887563376081, + "grad_norm": 9.489185333251953, + "learning_rate": 3.536822989846505e-05, + "loss": 0.013, + "step": 30475 + }, + { + "epoch": 9.096331643304504, + "grad_norm": 32.13996505737305, + "learning_rate": 3.535166087855571e-05, + "loss": 0.0236, + "step": 30500 + }, + { + "epoch": 9.103787652848196, + "grad_norm": 0.18573585152626038, + "learning_rate": 3.533509185864638e-05, + "loss": 0.0031, + "step": 30525 + }, + { + "epoch": 9.111243662391887, + "grad_norm": 24.975109100341797, + "learning_rate": 3.531852283873704e-05, + "loss": 0.0096, + "step": 30550 + }, + { + "epoch": 9.11869967193558, + "grad_norm": 0.7033310532569885, + "learning_rate": 3.530195381882771e-05, + "loss": 0.0272, + "step": 30575 + }, + { + "epoch": 9.126155681479272, + "grad_norm": 0.0882677286863327, + "learning_rate": 3.5285384798918375e-05, + "loss": 0.0099, + "step": 30600 + }, + { + "epoch": 9.133611691022965, + "grad_norm": 2.4805209636688232, + "learning_rate": 3.526881577900904e-05, + "loss": 0.0021, + "step": 30625 + }, + { + "epoch": 9.141067700566657, + "grad_norm": 0.05557962879538536, + "learning_rate": 3.5252246759099706e-05, + "loss": 0.015, + "step": 30650 + }, + { + "epoch": 9.148523710110348, + "grad_norm": 2.1198525428771973, + "learning_rate": 3.5235677739190375e-05, + "loss": 0.0088, + "step": 30675 + }, + { + "epoch": 9.155979719654042, + "grad_norm": 87.86044311523438, + "learning_rate": 3.5219108719281044e-05, + "loss": 0.0189, + "step": 30700 + }, + { + "epoch": 9.163435729197733, + "grad_norm": 0.35490408539772034, + "learning_rate": 3.520253969937171e-05, + "loss": 0.0026, + "step": 30725 + }, + { + "epoch": 9.170891738741426, + "grad_norm": 0.006703643128275871, + "learning_rate": 3.518597067946237e-05, + "loss": 0.0296, + "step": 30750 + }, + { + "epoch": 9.178347748285118, + "grad_norm": 0.014011339284479618, + "learning_rate": 3.516940165955304e-05, + "loss": 0.03, + "step": 30775 
+ }, + { + "epoch": 9.18580375782881, + "grad_norm": 4.419519901275635, + "learning_rate": 3.51528326396437e-05, + "loss": 0.089, + "step": 30800 + }, + { + "epoch": 9.193259767372503, + "grad_norm": 0.008075419813394547, + "learning_rate": 3.513626361973437e-05, + "loss": 0.005, + "step": 30825 + }, + { + "epoch": 9.200715776916194, + "grad_norm": 0.0033533978275954723, + "learning_rate": 3.511969459982503e-05, + "loss": 0.0052, + "step": 30850 + }, + { + "epoch": 9.208171786459888, + "grad_norm": 0.022313714027404785, + "learning_rate": 3.5103125579915694e-05, + "loss": 0.0247, + "step": 30875 + }, + { + "epoch": 9.215627796003579, + "grad_norm": 0.16721826791763306, + "learning_rate": 3.508655656000636e-05, + "loss": 0.0271, + "step": 30900 + }, + { + "epoch": 9.22308380554727, + "grad_norm": 0.0008401995291933417, + "learning_rate": 3.506998754009703e-05, + "loss": 0.0128, + "step": 30925 + }, + { + "epoch": 9.230539815090964, + "grad_norm": 0.008809903636574745, + "learning_rate": 3.5053418520187695e-05, + "loss": 0.0095, + "step": 30950 + }, + { + "epoch": 9.237995824634655, + "grad_norm": 0.013414003886282444, + "learning_rate": 3.5036849500278364e-05, + "loss": 0.0012, + "step": 30975 + }, + { + "epoch": 9.245451834178347, + "grad_norm": 0.009434329345822334, + "learning_rate": 3.5020280480369026e-05, + "loss": 0.0576, + "step": 31000 + }, + { + "epoch": 9.25290784372204, + "grad_norm": 0.014045946300029755, + "learning_rate": 3.5003711460459695e-05, + "loss": 0.0188, + "step": 31025 + }, + { + "epoch": 9.260363853265732, + "grad_norm": 0.044111546128988266, + "learning_rate": 3.498714244055036e-05, + "loss": 0.0047, + "step": 31050 + }, + { + "epoch": 9.267819862809425, + "grad_norm": 0.03687797114253044, + "learning_rate": 3.497057342064102e-05, + "loss": 0.0169, + "step": 31075 + }, + { + "epoch": 9.275275872353117, + "grad_norm": 0.012163372710347176, + "learning_rate": 3.495400440073169e-05, + "loss": 0.0256, + "step": 31100 + }, + { + "epoch": 9.282731881896808, + "grad_norm": 0.020992450416088104, + "learning_rate": 3.493743538082235e-05, + "loss": 0.0166, + "step": 31125 + }, + { + "epoch": 9.290187891440501, + "grad_norm": 0.00904077384620905, + "learning_rate": 3.492086636091302e-05, + "loss": 0.0555, + "step": 31150 + }, + { + "epoch": 9.297643900984193, + "grad_norm": 0.13449884951114655, + "learning_rate": 3.490429734100369e-05, + "loss": 0.0331, + "step": 31175 + }, + { + "epoch": 9.305099910527886, + "grad_norm": 0.013724715448915958, + "learning_rate": 3.488772832109435e-05, + "loss": 0.025, + "step": 31200 + }, + { + "epoch": 9.312555920071578, + "grad_norm": 0.006786949001252651, + "learning_rate": 3.487115930118502e-05, + "loss": 0.0283, + "step": 31225 + }, + { + "epoch": 9.32001192961527, + "grad_norm": 0.006112619303166866, + "learning_rate": 3.4854590281275684e-05, + "loss": 0.0127, + "step": 31250 + }, + { + "epoch": 9.327467939158963, + "grad_norm": 0.018585694953799248, + "learning_rate": 3.4838021261366346e-05, + "loss": 0.0153, + "step": 31275 + }, + { + "epoch": 9.334923948702654, + "grad_norm": 0.0026641006115823984, + "learning_rate": 3.4821452241457015e-05, + "loss": 0.0022, + "step": 31300 + }, + { + "epoch": 9.342379958246347, + "grad_norm": 0.20816652476787567, + "learning_rate": 3.480488322154768e-05, + "loss": 0.009, + "step": 31325 + }, + { + "epoch": 9.349835967790039, + "grad_norm": 0.0028136121109128, + "learning_rate": 3.4788314201638347e-05, + "loss": 0.0331, + "step": 31350 + }, + { + "epoch": 9.35729197733373, + "grad_norm": 
0.02150745317339897, + "learning_rate": 3.477174518172901e-05, + "loss": 0.0458, + "step": 31375 + }, + { + "epoch": 9.364747986877424, + "grad_norm": 17.131425857543945, + "learning_rate": 3.475517616181968e-05, + "loss": 0.0879, + "step": 31400 + }, + { + "epoch": 9.372203996421115, + "grad_norm": 0.10048453509807587, + "learning_rate": 3.473860714191035e-05, + "loss": 0.0168, + "step": 31425 + }, + { + "epoch": 9.379660005964809, + "grad_norm": 0.005317870993167162, + "learning_rate": 3.472203812200101e-05, + "loss": 0.0402, + "step": 31450 + }, + { + "epoch": 9.3871160155085, + "grad_norm": 0.11025191098451614, + "learning_rate": 3.470546910209168e-05, + "loss": 0.0188, + "step": 31475 + }, + { + "epoch": 9.394572025052192, + "grad_norm": 0.024496447294950485, + "learning_rate": 3.468890008218234e-05, + "loss": 0.0118, + "step": 31500 + }, + { + "epoch": 9.402028034595885, + "grad_norm": 6.208194732666016, + "learning_rate": 3.4672331062273e-05, + "loss": 0.0385, + "step": 31525 + }, + { + "epoch": 9.409484044139576, + "grad_norm": 0.18992580473423004, + "learning_rate": 3.465576204236367e-05, + "loss": 0.0086, + "step": 31550 + }, + { + "epoch": 9.416940053683268, + "grad_norm": 24.698410034179688, + "learning_rate": 3.4639193022454335e-05, + "loss": 0.0273, + "step": 31575 + }, + { + "epoch": 9.424396063226961, + "grad_norm": 3.067220687866211, + "learning_rate": 3.4622624002545004e-05, + "loss": 0.0416, + "step": 31600 + }, + { + "epoch": 9.431852072770653, + "grad_norm": 0.010118064470589161, + "learning_rate": 3.4606054982635666e-05, + "loss": 0.0051, + "step": 31625 + }, + { + "epoch": 9.439308082314346, + "grad_norm": 0.014647743664681911, + "learning_rate": 3.4589485962726335e-05, + "loss": 0.0412, + "step": 31650 + }, + { + "epoch": 9.446764091858038, + "grad_norm": 0.017802445217967033, + "learning_rate": 3.4572916942817004e-05, + "loss": 0.0313, + "step": 31675 + }, + { + "epoch": 9.454220101401729, + "grad_norm": 0.0036217891611158848, + "learning_rate": 3.455634792290767e-05, + "loss": 0.0569, + "step": 31700 + }, + { + "epoch": 9.461676110945422, + "grad_norm": 13.685757637023926, + "learning_rate": 3.453977890299833e-05, + "loss": 0.0208, + "step": 31725 + }, + { + "epoch": 9.469132120489114, + "grad_norm": 0.021553946658968925, + "learning_rate": 3.4523209883089e-05, + "loss": 0.0549, + "step": 31750 + }, + { + "epoch": 9.476588130032807, + "grad_norm": 0.2268332988023758, + "learning_rate": 3.450664086317966e-05, + "loss": 0.0073, + "step": 31775 + }, + { + "epoch": 9.484044139576499, + "grad_norm": 0.022020496428012848, + "learning_rate": 3.449007184327033e-05, + "loss": 0.05, + "step": 31800 + }, + { + "epoch": 9.49150014912019, + "grad_norm": 0.0033605294302105904, + "learning_rate": 3.447350282336099e-05, + "loss": 0.0224, + "step": 31825 + }, + { + "epoch": 9.498956158663884, + "grad_norm": 0.0036527118645608425, + "learning_rate": 3.4456933803451654e-05, + "loss": 0.0974, + "step": 31850 + }, + { + "epoch": 9.506412168207575, + "grad_norm": 0.6867750287055969, + "learning_rate": 3.444036478354233e-05, + "loss": 0.0149, + "step": 31875 + }, + { + "epoch": 9.513868177751267, + "grad_norm": 0.02161332778632641, + "learning_rate": 3.442379576363299e-05, + "loss": 0.028, + "step": 31900 + }, + { + "epoch": 9.52132418729496, + "grad_norm": 0.021856382489204407, + "learning_rate": 3.440722674372366e-05, + "loss": 0.0012, + "step": 31925 + }, + { + "epoch": 9.528780196838651, + "grad_norm": 0.5613261461257935, + "learning_rate": 3.4390657723814324e-05, + "loss": 0.0201, 
+ "step": 31950 + }, + { + "epoch": 9.536236206382345, + "grad_norm": 0.010432520881295204, + "learning_rate": 3.4374088703904986e-05, + "loss": 0.0159, + "step": 31975 + }, + { + "epoch": 9.543692215926036, + "grad_norm": 0.5688920021057129, + "learning_rate": 3.4357519683995656e-05, + "loss": 0.047, + "step": 32000 + }, + { + "epoch": 9.551148225469728, + "grad_norm": 0.00614794809371233, + "learning_rate": 3.434095066408632e-05, + "loss": 0.0608, + "step": 32025 + }, + { + "epoch": 9.558604235013421, + "grad_norm": 0.6179513931274414, + "learning_rate": 3.432438164417699e-05, + "loss": 0.0079, + "step": 32050 + }, + { + "epoch": 9.566060244557113, + "grad_norm": 0.5426047444343567, + "learning_rate": 3.430781262426765e-05, + "loss": 0.0386, + "step": 32075 + }, + { + "epoch": 9.573516254100806, + "grad_norm": 0.3112524747848511, + "learning_rate": 3.429124360435831e-05, + "loss": 0.0048, + "step": 32100 + }, + { + "epoch": 9.580972263644497, + "grad_norm": 1.4666061401367188, + "learning_rate": 3.427467458444899e-05, + "loss": 0.0011, + "step": 32125 + }, + { + "epoch": 9.588428273188189, + "grad_norm": 0.025465501472353935, + "learning_rate": 3.425810556453965e-05, + "loss": 0.0112, + "step": 32150 + }, + { + "epoch": 9.595884282731882, + "grad_norm": 0.23355644941329956, + "learning_rate": 3.424153654463031e-05, + "loss": 0.0242, + "step": 32175 + }, + { + "epoch": 9.603340292275574, + "grad_norm": 0.0009142689523287117, + "learning_rate": 3.422496752472098e-05, + "loss": 0.005, + "step": 32200 + }, + { + "epoch": 9.610796301819267, + "grad_norm": 0.06617454439401627, + "learning_rate": 3.4208398504811644e-05, + "loss": 0.0086, + "step": 32225 + }, + { + "epoch": 9.618252311362959, + "grad_norm": 0.006406477652490139, + "learning_rate": 3.419182948490231e-05, + "loss": 0.0039, + "step": 32250 + }, + { + "epoch": 9.62570832090665, + "grad_norm": 1.2433578968048096, + "learning_rate": 3.4175260464992975e-05, + "loss": 0.02, + "step": 32275 + }, + { + "epoch": 9.633164330450343, + "grad_norm": 0.021782483905553818, + "learning_rate": 3.415869144508364e-05, + "loss": 0.0145, + "step": 32300 + }, + { + "epoch": 9.640620339994035, + "grad_norm": 0.009916610084474087, + "learning_rate": 3.414212242517431e-05, + "loss": 0.046, + "step": 32325 + }, + { + "epoch": 9.648076349537728, + "grad_norm": 0.008093140088021755, + "learning_rate": 3.412621616606135e-05, + "loss": 0.0117, + "step": 32350 + }, + { + "epoch": 9.65553235908142, + "grad_norm": 0.025815371423959732, + "learning_rate": 3.4109647146152016e-05, + "loss": 0.0066, + "step": 32375 + }, + { + "epoch": 9.662988368625111, + "grad_norm": 20.948083877563477, + "learning_rate": 3.409307812624268e-05, + "loss": 0.0199, + "step": 32400 + }, + { + "epoch": 9.670444378168805, + "grad_norm": 0.0008856813074089587, + "learning_rate": 3.407650910633334e-05, + "loss": 0.0092, + "step": 32425 + }, + { + "epoch": 9.677900387712496, + "grad_norm": 14.290371894836426, + "learning_rate": 3.405994008642401e-05, + "loss": 0.0845, + "step": 32450 + }, + { + "epoch": 9.68535639725619, + "grad_norm": 0.29901570081710815, + "learning_rate": 3.404337106651467e-05, + "loss": 0.0091, + "step": 32475 + }, + { + "epoch": 9.69281240679988, + "grad_norm": 1.902666687965393, + "learning_rate": 3.402680204660534e-05, + "loss": 0.0014, + "step": 32500 + }, + { + "epoch": 9.700268416343572, + "grad_norm": 5.127783298492432, + "learning_rate": 3.401023302669601e-05, + "loss": 0.0075, + "step": 32525 + }, + { + "epoch": 9.707724425887266, + "grad_norm": 
0.020370395854115486, + "learning_rate": 3.399366400678667e-05, + "loss": 0.0011, + "step": 32550 + }, + { + "epoch": 9.715180435430957, + "grad_norm": 0.3405543565750122, + "learning_rate": 3.397709498687734e-05, + "loss": 0.0062, + "step": 32575 + }, + { + "epoch": 9.722636444974649, + "grad_norm": 0.06298086047172546, + "learning_rate": 3.3960525966968004e-05, + "loss": 0.0021, + "step": 32600 + }, + { + "epoch": 9.730092454518342, + "grad_norm": 0.14343926310539246, + "learning_rate": 3.394395694705867e-05, + "loss": 0.042, + "step": 32625 + }, + { + "epoch": 9.737548464062034, + "grad_norm": 0.02531730756163597, + "learning_rate": 3.3927387927149336e-05, + "loss": 0.0063, + "step": 32650 + }, + { + "epoch": 9.745004473605727, + "grad_norm": 0.04750616475939751, + "learning_rate": 3.391081890724e-05, + "loss": 0.0276, + "step": 32675 + }, + { + "epoch": 9.752460483149418, + "grad_norm": 11.076581954956055, + "learning_rate": 3.389424988733067e-05, + "loss": 0.0445, + "step": 32700 + }, + { + "epoch": 9.75991649269311, + "grad_norm": 0.025029189884662628, + "learning_rate": 3.387768086742133e-05, + "loss": 0.0137, + "step": 32725 + }, + { + "epoch": 9.767372502236803, + "grad_norm": 0.040830034762620926, + "learning_rate": 3.386111184751199e-05, + "loss": 0.016, + "step": 32750 + }, + { + "epoch": 9.774828511780495, + "grad_norm": 0.01969616673886776, + "learning_rate": 3.384454282760267e-05, + "loss": 0.0174, + "step": 32775 + }, + { + "epoch": 9.782284521324188, + "grad_norm": 0.44630536437034607, + "learning_rate": 3.382797380769333e-05, + "loss": 0.012, + "step": 32800 + }, + { + "epoch": 9.78974053086788, + "grad_norm": 0.4425484836101532, + "learning_rate": 3.381140478778399e-05, + "loss": 0.0531, + "step": 32825 + }, + { + "epoch": 9.797196540411571, + "grad_norm": 0.02152046374976635, + "learning_rate": 3.379549852867103e-05, + "loss": 0.0148, + "step": 32850 + }, + { + "epoch": 9.804652549955264, + "grad_norm": 0.0015139617025852203, + "learning_rate": 3.3778929508761695e-05, + "loss": 0.0025, + "step": 32875 + }, + { + "epoch": 9.812108559498956, + "grad_norm": 25.084253311157227, + "learning_rate": 3.3762360488852365e-05, + "loss": 0.0937, + "step": 32900 + }, + { + "epoch": 9.81956456904265, + "grad_norm": 0.060568299144506454, + "learning_rate": 3.3745791468943034e-05, + "loss": 0.01, + "step": 32925 + }, + { + "epoch": 9.82702057858634, + "grad_norm": 0.004180490970611572, + "learning_rate": 3.3729222449033696e-05, + "loss": 0.0175, + "step": 32950 + }, + { + "epoch": 9.834476588130032, + "grad_norm": 17.96302032470703, + "learning_rate": 3.3712653429124365e-05, + "loss": 0.0473, + "step": 32975 + }, + { + "epoch": 9.841932597673726, + "grad_norm": 0.015103708952665329, + "learning_rate": 3.369608440921503e-05, + "loss": 0.057, + "step": 33000 + }, + { + "epoch": 9.849388607217417, + "grad_norm": 0.014436209574341774, + "learning_rate": 3.3679515389305697e-05, + "loss": 0.0202, + "step": 33025 + }, + { + "epoch": 9.856844616761109, + "grad_norm": 48.11013412475586, + "learning_rate": 3.366294636939636e-05, + "loss": 0.0201, + "step": 33050 + }, + { + "epoch": 9.864300626304802, + "grad_norm": 2.6800339221954346, + "learning_rate": 3.364637734948702e-05, + "loss": 0.0421, + "step": 33075 + }, + { + "epoch": 9.871756635848493, + "grad_norm": 0.014932164922356606, + "learning_rate": 3.362980832957769e-05, + "loss": 0.0651, + "step": 33100 + }, + { + "epoch": 9.879212645392187, + "grad_norm": 0.0073710051365196705, + "learning_rate": 3.361323930966835e-05, + "loss": 0.0468, + 
"step": 33125 + }, + { + "epoch": 9.886668654935878, + "grad_norm": 0.00784632284194231, + "learning_rate": 3.359667028975902e-05, + "loss": 0.065, + "step": 33150 + }, + { + "epoch": 9.89412466447957, + "grad_norm": 0.044525280594825745, + "learning_rate": 3.358010126984969e-05, + "loss": 0.0249, + "step": 33175 + }, + { + "epoch": 9.901580674023263, + "grad_norm": 0.011400883086025715, + "learning_rate": 3.356353224994035e-05, + "loss": 0.0155, + "step": 33200 + }, + { + "epoch": 9.909036683566955, + "grad_norm": 0.01919802837073803, + "learning_rate": 3.354696323003102e-05, + "loss": 0.0059, + "step": 33225 + }, + { + "epoch": 9.916492693110648, + "grad_norm": 0.0051742009818553925, + "learning_rate": 3.3530394210121685e-05, + "loss": 0.034, + "step": 33250 + }, + { + "epoch": 9.92394870265434, + "grad_norm": 0.00105185154825449, + "learning_rate": 3.351382519021235e-05, + "loss": 0.0175, + "step": 33275 + }, + { + "epoch": 9.931404712198031, + "grad_norm": 0.006342086009681225, + "learning_rate": 3.3497256170303016e-05, + "loss": 0.052, + "step": 33300 + }, + { + "epoch": 9.938860721741724, + "grad_norm": 0.01718548871576786, + "learning_rate": 3.348068715039368e-05, + "loss": 0.0029, + "step": 33325 + }, + { + "epoch": 9.946316731285416, + "grad_norm": 3.649813413619995, + "learning_rate": 3.346411813048435e-05, + "loss": 0.0411, + "step": 33350 + }, + { + "epoch": 9.953772740829109, + "grad_norm": 9.431161880493164, + "learning_rate": 3.344754911057501e-05, + "loss": 0.0482, + "step": 33375 + }, + { + "epoch": 9.9612287503728, + "grad_norm": 0.0046329195611178875, + "learning_rate": 3.343098009066568e-05, + "loss": 0.0231, + "step": 33400 + }, + { + "epoch": 9.968684759916492, + "grad_norm": 5.078273296356201, + "learning_rate": 3.341441107075635e-05, + "loss": 0.0056, + "step": 33425 + }, + { + "epoch": 9.976140769460185, + "grad_norm": 0.02794848009943962, + "learning_rate": 3.339784205084701e-05, + "loss": 0.0175, + "step": 33450 + }, + { + "epoch": 9.983596779003877, + "grad_norm": 0.06508468836545944, + "learning_rate": 3.338127303093768e-05, + "loss": 0.0421, + "step": 33475 + }, + { + "epoch": 9.99105278854757, + "grad_norm": 0.950072169303894, + "learning_rate": 3.336470401102834e-05, + "loss": 0.0948, + "step": 33500 + }, + { + "epoch": 9.998508798091262, + "grad_norm": 0.0035361736081540585, + "learning_rate": 3.3348134991119004e-05, + "loss": 0.0623, + "step": 33525 + }, + { + "epoch": 10.0, + "eval_gen_len": 8.703, + "eval_loss": 0.08018776774406433, + "eval_rouge1": 97.4139, + "eval_rouge2": 84.4925, + "eval_rougeL": 97.3164, + "eval_rougeLsum": 97.3129, + "eval_runtime": 99.5841, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 4.218, + "step": 33530 + }, + { + "epoch": 10.005964807634953, + "grad_norm": 0.003375839442014694, + "learning_rate": 3.3331565971209674e-05, + "loss": 0.0126, + "step": 33550 + }, + { + "epoch": 10.013420817178647, + "grad_norm": 0.009126723743975163, + "learning_rate": 3.3314996951300336e-05, + "loss": 0.0245, + "step": 33575 + }, + { + "epoch": 10.020876826722338, + "grad_norm": 1.7665882110595703, + "learning_rate": 3.3298427931391005e-05, + "loss": 0.0474, + "step": 33600 + }, + { + "epoch": 10.02833283626603, + "grad_norm": 0.022015083581209183, + "learning_rate": 3.328185891148167e-05, + "loss": 0.0093, + "step": 33625 + }, + { + "epoch": 10.035788845809723, + "grad_norm": 0.3044419288635254, + "learning_rate": 3.3265289891572336e-05, + "loss": 0.0031, + "step": 33650 + }, + { + "epoch": 10.043244855353414, + "grad_norm": 
0.131506085395813, + "learning_rate": 3.3248720871663006e-05, + "loss": 0.0342, + "step": 33675 + }, + { + "epoch": 10.050700864897108, + "grad_norm": 9.862656593322754, + "learning_rate": 3.323215185175367e-05, + "loss": 0.0135, + "step": 33700 + }, + { + "epoch": 10.0581568744408, + "grad_norm": 0.00710965134203434, + "learning_rate": 3.321558283184433e-05, + "loss": 0.0154, + "step": 33725 + }, + { + "epoch": 10.06561288398449, + "grad_norm": 0.7810943722724915, + "learning_rate": 3.3199013811935e-05, + "loss": 0.0095, + "step": 33750 + }, + { + "epoch": 10.073068893528184, + "grad_norm": 0.16734705865383148, + "learning_rate": 3.318244479202566e-05, + "loss": 0.0051, + "step": 33775 + }, + { + "epoch": 10.080524903071876, + "grad_norm": 7.479839324951172, + "learning_rate": 3.316587577211633e-05, + "loss": 0.0197, + "step": 33800 + }, + { + "epoch": 10.087980912615569, + "grad_norm": 0.019815055653452873, + "learning_rate": 3.314930675220699e-05, + "loss": 0.0021, + "step": 33825 + }, + { + "epoch": 10.09543692215926, + "grad_norm": 0.01897830329835415, + "learning_rate": 3.3132737732297656e-05, + "loss": 0.0948, + "step": 33850 + }, + { + "epoch": 10.102892931702952, + "grad_norm": 15.485894203186035, + "learning_rate": 3.311616871238833e-05, + "loss": 0.0269, + "step": 33875 + }, + { + "epoch": 10.110348941246645, + "grad_norm": 0.07259183377027512, + "learning_rate": 3.3099599692478994e-05, + "loss": 0.0008, + "step": 33900 + }, + { + "epoch": 10.117804950790337, + "grad_norm": 0.11839132755994797, + "learning_rate": 3.3083030672569656e-05, + "loss": 0.011, + "step": 33925 + }, + { + "epoch": 10.12526096033403, + "grad_norm": 0.022647298872470856, + "learning_rate": 3.3066461652660325e-05, + "loss": 0.0161, + "step": 33950 + }, + { + "epoch": 10.132716969877722, + "grad_norm": 0.008973998948931694, + "learning_rate": 3.304989263275099e-05, + "loss": 0.0214, + "step": 33975 + }, + { + "epoch": 10.140172979421413, + "grad_norm": 0.30917757749557495, + "learning_rate": 3.303332361284166e-05, + "loss": 0.0331, + "step": 34000 + }, + { + "epoch": 10.147628988965106, + "grad_norm": 1.5753026008605957, + "learning_rate": 3.301675459293232e-05, + "loss": 0.0074, + "step": 34025 + }, + { + "epoch": 10.155084998508798, + "grad_norm": 0.23202548921108246, + "learning_rate": 3.300018557302299e-05, + "loss": 0.0055, + "step": 34050 + }, + { + "epoch": 10.162541008052491, + "grad_norm": 0.12300365418195724, + "learning_rate": 3.298361655311365e-05, + "loss": 0.0053, + "step": 34075 + }, + { + "epoch": 10.169997017596183, + "grad_norm": 0.031884919852018356, + "learning_rate": 3.296704753320431e-05, + "loss": 0.0046, + "step": 34100 + }, + { + "epoch": 10.177453027139874, + "grad_norm": 0.02119879052042961, + "learning_rate": 3.295047851329499e-05, + "loss": 0.0124, + "step": 34125 + }, + { + "epoch": 10.184909036683568, + "grad_norm": 2.8341071605682373, + "learning_rate": 3.293390949338565e-05, + "loss": 0.0475, + "step": 34150 + }, + { + "epoch": 10.192365046227259, + "grad_norm": 0.02603074349462986, + "learning_rate": 3.2917340473476313e-05, + "loss": 0.0049, + "step": 34175 + }, + { + "epoch": 10.19982105577095, + "grad_norm": 0.006299956236034632, + "learning_rate": 3.290077145356698e-05, + "loss": 0.0507, + "step": 34200 + }, + { + "epoch": 10.207277065314644, + "grad_norm": 0.023150190711021423, + "learning_rate": 3.2884202433657645e-05, + "loss": 0.0034, + "step": 34225 + }, + { + "epoch": 10.214733074858335, + "grad_norm": 0.0036992118693888187, + "learning_rate": 
3.2867633413748314e-05, + "loss": 0.0061, + "step": 34250 + }, + { + "epoch": 10.222189084402029, + "grad_norm": 0.012014524079859257, + "learning_rate": 3.2851064393838976e-05, + "loss": 0.0076, + "step": 34275 + }, + { + "epoch": 10.22964509394572, + "grad_norm": 0.015969576314091682, + "learning_rate": 3.283449537392964e-05, + "loss": 0.0266, + "step": 34300 + }, + { + "epoch": 10.237101103489412, + "grad_norm": 0.006832475308328867, + "learning_rate": 3.281792635402031e-05, + "loss": 0.0479, + "step": 34325 + }, + { + "epoch": 10.244557113033105, + "grad_norm": 0.004264793358743191, + "learning_rate": 3.280135733411098e-05, + "loss": 0.0272, + "step": 34350 + }, + { + "epoch": 10.252013122576797, + "grad_norm": 0.7217514514923096, + "learning_rate": 3.278478831420164e-05, + "loss": 0.0426, + "step": 34375 + }, + { + "epoch": 10.25946913212049, + "grad_norm": 0.029282154515385628, + "learning_rate": 3.276821929429231e-05, + "loss": 0.0081, + "step": 34400 + }, + { + "epoch": 10.266925141664181, + "grad_norm": 0.0008106474415399134, + "learning_rate": 3.275165027438297e-05, + "loss": 0.0165, + "step": 34425 + }, + { + "epoch": 10.274381151207873, + "grad_norm": 0.1722215712070465, + "learning_rate": 3.273508125447364e-05, + "loss": 0.0262, + "step": 34450 + }, + { + "epoch": 10.281837160751566, + "grad_norm": 9.704647064208984, + "learning_rate": 3.27185122345643e-05, + "loss": 0.0199, + "step": 34475 + }, + { + "epoch": 10.289293170295258, + "grad_norm": 0.022019336000084877, + "learning_rate": 3.2701943214654965e-05, + "loss": 0.0056, + "step": 34500 + }, + { + "epoch": 10.296749179838951, + "grad_norm": 0.0859612226486206, + "learning_rate": 3.2685374194745634e-05, + "loss": 0.0049, + "step": 34525 + }, + { + "epoch": 10.304205189382643, + "grad_norm": 0.015857083722949028, + "learning_rate": 3.2668805174836296e-05, + "loss": 0.0071, + "step": 34550 + }, + { + "epoch": 10.311661198926334, + "grad_norm": 0.7513738870620728, + "learning_rate": 3.2652236154926965e-05, + "loss": 0.0117, + "step": 34575 + }, + { + "epoch": 10.319117208470027, + "grad_norm": 0.30653056502342224, + "learning_rate": 3.2635667135017634e-05, + "loss": 0.0176, + "step": 34600 + }, + { + "epoch": 10.326573218013719, + "grad_norm": 0.09023960679769516, + "learning_rate": 3.2619098115108297e-05, + "loss": 0.0043, + "step": 34625 + }, + { + "epoch": 10.334029227557412, + "grad_norm": 1.2998549938201904, + "learning_rate": 3.2602529095198966e-05, + "loss": 0.0227, + "step": 34650 + }, + { + "epoch": 10.341485237101104, + "grad_norm": 63.218666076660156, + "learning_rate": 3.258596007528963e-05, + "loss": 0.0163, + "step": 34675 + }, + { + "epoch": 10.348941246644795, + "grad_norm": 5.3180251121521, + "learning_rate": 3.25693910553803e-05, + "loss": 0.015, + "step": 34700 + }, + { + "epoch": 10.356397256188488, + "grad_norm": 0.038112230598926544, + "learning_rate": 3.255282203547096e-05, + "loss": 0.0281, + "step": 34725 + }, + { + "epoch": 10.36385326573218, + "grad_norm": 3.67035174369812, + "learning_rate": 3.253625301556162e-05, + "loss": 0.0441, + "step": 34750 + }, + { + "epoch": 10.371309275275872, + "grad_norm": 0.013775043189525604, + "learning_rate": 3.251968399565229e-05, + "loss": 0.0089, + "step": 34775 + }, + { + "epoch": 10.378765284819565, + "grad_norm": 0.06075282394886017, + "learning_rate": 3.250311497574295e-05, + "loss": 0.0006, + "step": 34800 + }, + { + "epoch": 10.386221294363256, + "grad_norm": 28.066097259521484, + "learning_rate": 3.248654595583362e-05, + "loss": 0.052, + "step": 34825 + }, 
+ { + "epoch": 10.39367730390695, + "grad_norm": 0.009753179736435413, + "learning_rate": 3.246997693592429e-05, + "loss": 0.0187, + "step": 34850 + }, + { + "epoch": 10.401133313450641, + "grad_norm": 0.005144761875271797, + "learning_rate": 3.2453407916014954e-05, + "loss": 0.0399, + "step": 34875 + }, + { + "epoch": 10.408589322994333, + "grad_norm": 2.791918992996216, + "learning_rate": 3.243683889610562e-05, + "loss": 0.005, + "step": 34900 + }, + { + "epoch": 10.416045332538026, + "grad_norm": 0.004900413099676371, + "learning_rate": 3.2420269876196285e-05, + "loss": 0.0072, + "step": 34925 + }, + { + "epoch": 10.423501342081718, + "grad_norm": 0.009620816446840763, + "learning_rate": 3.240370085628695e-05, + "loss": 0.0016, + "step": 34950 + }, + { + "epoch": 10.43095735162541, + "grad_norm": 1.331263780593872, + "learning_rate": 3.238713183637762e-05, + "loss": 0.0202, + "step": 34975 + }, + { + "epoch": 10.438413361169102, + "grad_norm": 0.10611845552921295, + "learning_rate": 3.237056281646828e-05, + "loss": 0.018, + "step": 35000 + }, + { + "epoch": 10.445869370712794, + "grad_norm": 0.5065842270851135, + "learning_rate": 3.235399379655895e-05, + "loss": 0.0153, + "step": 35025 + }, + { + "epoch": 10.453325380256487, + "grad_norm": 0.11372986435890198, + "learning_rate": 3.233742477664961e-05, + "loss": 0.0048, + "step": 35050 + }, + { + "epoch": 10.460781389800179, + "grad_norm": 0.8633256554603577, + "learning_rate": 3.232085575674028e-05, + "loss": 0.0014, + "step": 35075 + }, + { + "epoch": 10.468237399343872, + "grad_norm": 0.003506710985675454, + "learning_rate": 3.230428673683095e-05, + "loss": 0.0411, + "step": 35100 + }, + { + "epoch": 10.475693408887564, + "grad_norm": 0.018955357372760773, + "learning_rate": 3.228771771692161e-05, + "loss": 0.0014, + "step": 35125 + }, + { + "epoch": 10.483149418431255, + "grad_norm": 0.005029291845858097, + "learning_rate": 3.2271148697012274e-05, + "loss": 0.0193, + "step": 35150 + }, + { + "epoch": 10.490605427974948, + "grad_norm": 25.563217163085938, + "learning_rate": 3.225457967710294e-05, + "loss": 0.0691, + "step": 35175 + }, + { + "epoch": 10.49806143751864, + "grad_norm": 1.60642409324646, + "learning_rate": 3.2238010657193605e-05, + "loss": 0.0017, + "step": 35200 + }, + { + "epoch": 10.505517447062331, + "grad_norm": 0.017324600368738174, + "learning_rate": 3.2221441637284274e-05, + "loss": 0.0247, + "step": 35225 + }, + { + "epoch": 10.512973456606025, + "grad_norm": 0.1269446760416031, + "learning_rate": 3.2204872617374936e-05, + "loss": 0.0024, + "step": 35250 + }, + { + "epoch": 10.520429466149716, + "grad_norm": 0.14126725494861603, + "learning_rate": 3.2188303597465606e-05, + "loss": 0.0822, + "step": 35275 + }, + { + "epoch": 10.52788547569341, + "grad_norm": 0.0053797028958797455, + "learning_rate": 3.217173457755627e-05, + "loss": 0.0634, + "step": 35300 + }, + { + "epoch": 10.535341485237101, + "grad_norm": 0.01621091552078724, + "learning_rate": 3.215516555764694e-05, + "loss": 0.0914, + "step": 35325 + }, + { + "epoch": 10.542797494780793, + "grad_norm": 0.00323239853605628, + "learning_rate": 3.2138596537737606e-05, + "loss": 0.0158, + "step": 35350 + }, + { + "epoch": 10.550253504324486, + "grad_norm": 34.818946838378906, + "learning_rate": 3.212202751782827e-05, + "loss": 0.0173, + "step": 35375 + }, + { + "epoch": 10.557709513868177, + "grad_norm": 0.008614491671323776, + "learning_rate": 3.210545849791893e-05, + "loss": 0.0271, + "step": 35400 + }, + { + "epoch": 10.56516552341187, + "grad_norm": 
0.001824019942432642, + "learning_rate": 3.20888894780096e-05, + "loss": 0.0395, + "step": 35425 + }, + { + "epoch": 10.572621532955562, + "grad_norm": 0.007367938291281462, + "learning_rate": 3.207232045810026e-05, + "loss": 0.0011, + "step": 35450 + }, + { + "epoch": 10.580077542499254, + "grad_norm": 1.3378117084503174, + "learning_rate": 3.205575143819093e-05, + "loss": 0.0146, + "step": 35475 + }, + { + "epoch": 10.587533552042947, + "grad_norm": 0.015617124736309052, + "learning_rate": 3.2039182418281594e-05, + "loss": 0.0826, + "step": 35500 + }, + { + "epoch": 10.594989561586639, + "grad_norm": 0.003436797996982932, + "learning_rate": 3.2022613398372256e-05, + "loss": 0.0084, + "step": 35525 + }, + { + "epoch": 10.602445571130332, + "grad_norm": 0.31922221183776855, + "learning_rate": 3.200604437846293e-05, + "loss": 0.0991, + "step": 35550 + }, + { + "epoch": 10.609901580674023, + "grad_norm": 0.8636718988418579, + "learning_rate": 3.1989475358553594e-05, + "loss": 0.0387, + "step": 35575 + }, + { + "epoch": 10.617357590217715, + "grad_norm": 0.04692121967673302, + "learning_rate": 3.197290633864426e-05, + "loss": 0.0072, + "step": 35600 + }, + { + "epoch": 10.624813599761408, + "grad_norm": 0.007798209320753813, + "learning_rate": 3.1956337318734926e-05, + "loss": 0.0464, + "step": 35625 + }, + { + "epoch": 10.6322696093051, + "grad_norm": 51.005245208740234, + "learning_rate": 3.193976829882559e-05, + "loss": 0.0759, + "step": 35650 + }, + { + "epoch": 10.639725618848793, + "grad_norm": 4.934067726135254, + "learning_rate": 3.192319927891626e-05, + "loss": 0.035, + "step": 35675 + }, + { + "epoch": 10.647181628392484, + "grad_norm": 0.1517515480518341, + "learning_rate": 3.190663025900692e-05, + "loss": 0.0236, + "step": 35700 + }, + { + "epoch": 10.654637637936176, + "grad_norm": 0.04861300066113472, + "learning_rate": 3.189006123909758e-05, + "loss": 0.0624, + "step": 35725 + }, + { + "epoch": 10.66209364747987, + "grad_norm": 0.010758434422314167, + "learning_rate": 3.187349221918825e-05, + "loss": 0.0222, + "step": 35750 + }, + { + "epoch": 10.66954965702356, + "grad_norm": 2.7868688106536865, + "learning_rate": 3.1856923199278913e-05, + "loss": 0.0418, + "step": 35775 + }, + { + "epoch": 10.677005666567254, + "grad_norm": 13.515581130981445, + "learning_rate": 3.184035417936958e-05, + "loss": 0.0125, + "step": 35800 + }, + { + "epoch": 10.684461676110946, + "grad_norm": 0.10708614438772202, + "learning_rate": 3.182378515946025e-05, + "loss": 0.06, + "step": 35825 + }, + { + "epoch": 10.691917685654637, + "grad_norm": 0.0015107860090211034, + "learning_rate": 3.1807216139550914e-05, + "loss": 0.1163, + "step": 35850 + }, + { + "epoch": 10.69937369519833, + "grad_norm": 0.032355859875679016, + "learning_rate": 3.179064711964158e-05, + "loss": 0.0372, + "step": 35875 + }, + { + "epoch": 10.706829704742022, + "grad_norm": 21.268131256103516, + "learning_rate": 3.1774078099732245e-05, + "loss": 0.0237, + "step": 35900 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 21.97075080871582, + "learning_rate": 3.1757509079822915e-05, + "loss": 0.0661, + "step": 35925 + }, + { + "epoch": 10.721741723829407, + "grad_norm": 0.10698114335536957, + "learning_rate": 3.174094005991358e-05, + "loss": 0.0203, + "step": 35950 + }, + { + "epoch": 10.729197733373098, + "grad_norm": 0.009910419583320618, + "learning_rate": 3.172437104000424e-05, + "loss": 0.0022, + "step": 35975 + }, + { + "epoch": 10.736653742916792, + "grad_norm": 0.007027831859886646, + "learning_rate": 
3.170780202009491e-05, + "loss": 0.0578, + "step": 36000 + }, + { + "epoch": 10.744109752460483, + "grad_norm": 0.024194374680519104, + "learning_rate": 3.169123300018558e-05, + "loss": 0.0021, + "step": 36025 + }, + { + "epoch": 10.751565762004175, + "grad_norm": 0.018712317571043968, + "learning_rate": 3.167466398027624e-05, + "loss": 0.0096, + "step": 36050 + }, + { + "epoch": 10.759021771547868, + "grad_norm": 3.4336538314819336, + "learning_rate": 3.165809496036691e-05, + "loss": 0.0158, + "step": 36075 + }, + { + "epoch": 10.76647778109156, + "grad_norm": 9.927470207214355, + "learning_rate": 3.164152594045757e-05, + "loss": 0.066, + "step": 36100 + }, + { + "epoch": 10.773933790635253, + "grad_norm": 0.14512130618095398, + "learning_rate": 3.162495692054824e-05, + "loss": 0.0242, + "step": 36125 + }, + { + "epoch": 10.781389800178944, + "grad_norm": 0.05796957015991211, + "learning_rate": 3.16083879006389e-05, + "loss": 0.0366, + "step": 36150 + }, + { + "epoch": 10.788845809722636, + "grad_norm": 0.034359026700258255, + "learning_rate": 3.1591818880729565e-05, + "loss": 0.0064, + "step": 36175 + }, + { + "epoch": 10.79630181926633, + "grad_norm": 15.594103813171387, + "learning_rate": 3.1575249860820234e-05, + "loss": 0.0084, + "step": 36200 + }, + { + "epoch": 10.80375782881002, + "grad_norm": 0.1458369642496109, + "learning_rate": 3.1558680840910897e-05, + "loss": 0.0132, + "step": 36225 + }, + { + "epoch": 10.811213838353712, + "grad_norm": 0.034678563475608826, + "learning_rate": 3.1542111821001566e-05, + "loss": 0.0017, + "step": 36250 + }, + { + "epoch": 10.818669847897405, + "grad_norm": 10.139859199523926, + "learning_rate": 3.1525542801092235e-05, + "loss": 0.0353, + "step": 36275 + }, + { + "epoch": 10.826125857441097, + "grad_norm": 0.013228428550064564, + "learning_rate": 3.15089737811829e-05, + "loss": 0.0478, + "step": 36300 + }, + { + "epoch": 10.83358186698479, + "grad_norm": 0.234503373503685, + "learning_rate": 3.1492404761273566e-05, + "loss": 0.0034, + "step": 36325 + }, + { + "epoch": 10.841037876528482, + "grad_norm": 0.043588753789663315, + "learning_rate": 3.147583574136423e-05, + "loss": 0.0014, + "step": 36350 + }, + { + "epoch": 10.848493886072173, + "grad_norm": 0.0065058995969593525, + "learning_rate": 3.145926672145489e-05, + "loss": 0.0328, + "step": 36375 + }, + { + "epoch": 10.855949895615867, + "grad_norm": 0.07438325136899948, + "learning_rate": 3.144269770154556e-05, + "loss": 0.0166, + "step": 36400 + }, + { + "epoch": 10.863405905159558, + "grad_norm": 1.323083519935608, + "learning_rate": 3.142612868163622e-05, + "loss": 0.0564, + "step": 36425 + }, + { + "epoch": 10.870861914703251, + "grad_norm": 0.1142137348651886, + "learning_rate": 3.140955966172689e-05, + "loss": 0.0619, + "step": 36450 + }, + { + "epoch": 10.878317924246943, + "grad_norm": 0.04162459075450897, + "learning_rate": 3.1392990641817554e-05, + "loss": 0.02, + "step": 36475 + }, + { + "epoch": 10.885773933790635, + "grad_norm": 0.020116539672017097, + "learning_rate": 3.137642162190822e-05, + "loss": 0.0265, + "step": 36500 + }, + { + "epoch": 10.893229943334328, + "grad_norm": 0.04184208810329437, + "learning_rate": 3.135985260199889e-05, + "loss": 0.0149, + "step": 36525 + }, + { + "epoch": 10.90068595287802, + "grad_norm": 0.04820263385772705, + "learning_rate": 3.1343283582089554e-05, + "loss": 0.0125, + "step": 36550 + }, + { + "epoch": 10.908141962421713, + "grad_norm": 0.0024502065498381853, + "learning_rate": 3.1326714562180224e-05, + "loss": 0.0111, + "step": 36575 + 
}, + { + "epoch": 10.915597971965404, + "grad_norm": 2.155289649963379, + "learning_rate": 3.1310145542270886e-05, + "loss": 0.0035, + "step": 36600 + }, + { + "epoch": 10.923053981509096, + "grad_norm": 0.014378400519490242, + "learning_rate": 3.129357652236155e-05, + "loss": 0.0551, + "step": 36625 + }, + { + "epoch": 10.930509991052789, + "grad_norm": 2.1999733448028564, + "learning_rate": 3.127700750245222e-05, + "loss": 0.0459, + "step": 36650 + }, + { + "epoch": 10.93796600059648, + "grad_norm": 26.96295928955078, + "learning_rate": 3.126043848254288e-05, + "loss": 0.0074, + "step": 36675 + }, + { + "epoch": 10.945422010140174, + "grad_norm": 4.9573845863342285, + "learning_rate": 3.124386946263355e-05, + "loss": 0.0271, + "step": 36700 + }, + { + "epoch": 10.952878019683865, + "grad_norm": 0.9740040302276611, + "learning_rate": 3.122730044272421e-05, + "loss": 0.0067, + "step": 36725 + }, + { + "epoch": 10.960334029227557, + "grad_norm": 0.10786978155374527, + "learning_rate": 3.121073142281488e-05, + "loss": 0.0063, + "step": 36750 + }, + { + "epoch": 10.96779003877125, + "grad_norm": 0.9033737182617188, + "learning_rate": 3.119416240290555e-05, + "loss": 0.0241, + "step": 36775 + }, + { + "epoch": 10.975246048314942, + "grad_norm": 0.8337206840515137, + "learning_rate": 3.117759338299621e-05, + "loss": 0.0423, + "step": 36800 + }, + { + "epoch": 10.982702057858635, + "grad_norm": 34.99683380126953, + "learning_rate": 3.1161024363086874e-05, + "loss": 0.0323, + "step": 36825 + }, + { + "epoch": 10.990158067402326, + "grad_norm": 0.3933325707912445, + "learning_rate": 3.114445534317754e-05, + "loss": 0.0058, + "step": 36850 + }, + { + "epoch": 10.997614076946018, + "grad_norm": 0.019569693133234978, + "learning_rate": 3.1127886323268206e-05, + "loss": 0.0019, + "step": 36875 + }, + { + "epoch": 11.0, + "eval_gen_len": 8.7108, + "eval_loss": 0.0731603354215622, + "eval_rouge1": 97.3528, + "eval_rouge2": 84.9895, + "eval_rougeL": 97.3143, + "eval_rougeLsum": 97.3154, + "eval_runtime": 98.1335, + "eval_samples_per_second": 17.089, + "eval_steps_per_second": 4.28, + "step": 36883 + }, + { + "epoch": 11.005070086489711, + "grad_norm": 0.00565438624471426, + "learning_rate": 3.1111317303358875e-05, + "loss": 0.004, + "step": 36900 + }, + { + "epoch": 11.012526096033403, + "grad_norm": 0.008675969205796719, + "learning_rate": 3.109474828344954e-05, + "loss": 0.1224, + "step": 36925 + }, + { + "epoch": 11.019982105577094, + "grad_norm": 0.2460847944021225, + "learning_rate": 3.10781792635402e-05, + "loss": 0.0458, + "step": 36950 + }, + { + "epoch": 11.027438115120788, + "grad_norm": 0.015279813669621944, + "learning_rate": 3.106161024363087e-05, + "loss": 0.0285, + "step": 36975 + }, + { + "epoch": 11.03489412466448, + "grad_norm": 0.01354867685586214, + "learning_rate": 3.104504122372154e-05, + "loss": 0.02, + "step": 37000 + }, + { + "epoch": 11.042350134208172, + "grad_norm": 0.009907645173370838, + "learning_rate": 3.10284722038122e-05, + "loss": 0.1091, + "step": 37025 + }, + { + "epoch": 11.049806143751864, + "grad_norm": 0.033154286444187164, + "learning_rate": 3.101190318390287e-05, + "loss": 0.0064, + "step": 37050 + }, + { + "epoch": 11.057262153295556, + "grad_norm": 0.04053553566336632, + "learning_rate": 3.099533416399353e-05, + "loss": 0.0103, + "step": 37075 + }, + { + "epoch": 11.064718162839249, + "grad_norm": 0.05914434418082237, + "learning_rate": 3.09787651440842e-05, + "loss": 0.0343, + "step": 37100 + }, + { + "epoch": 11.07217417238294, + "grad_norm": 
0.021367311477661133, + "learning_rate": 3.096219612417486e-05, + "loss": 0.0057, + "step": 37125 + }, + { + "epoch": 11.079630181926634, + "grad_norm": 0.001664644107222557, + "learning_rate": 3.094562710426553e-05, + "loss": 0.0005, + "step": 37150 + }, + { + "epoch": 11.087086191470325, + "grad_norm": 0.12640069425106049, + "learning_rate": 3.0929058084356194e-05, + "loss": 0.0003, + "step": 37175 + }, + { + "epoch": 11.094542201014017, + "grad_norm": 0.004810268059372902, + "learning_rate": 3.091248906444686e-05, + "loss": 0.0119, + "step": 37200 + }, + { + "epoch": 11.10199821055771, + "grad_norm": 1.4785408973693848, + "learning_rate": 3.089592004453753e-05, + "loss": 0.0008, + "step": 37225 + }, + { + "epoch": 11.109454220101401, + "grad_norm": 0.0022451153490692377, + "learning_rate": 3.0879351024628195e-05, + "loss": 0.0123, + "step": 37250 + }, + { + "epoch": 11.116910229645095, + "grad_norm": 0.1429530680179596, + "learning_rate": 3.086278200471886e-05, + "loss": 0.0115, + "step": 37275 + }, + { + "epoch": 11.124366239188786, + "grad_norm": 0.03411104530096054, + "learning_rate": 3.0846212984809526e-05, + "loss": 0.0212, + "step": 37300 + }, + { + "epoch": 11.131822248732478, + "grad_norm": 0.8815001845359802, + "learning_rate": 3.082964396490019e-05, + "loss": 0.0005, + "step": 37325 + }, + { + "epoch": 11.139278258276171, + "grad_norm": 0.008912123739719391, + "learning_rate": 3.081307494499086e-05, + "loss": 0.0029, + "step": 37350 + }, + { + "epoch": 11.146734267819863, + "grad_norm": 0.021167289465665817, + "learning_rate": 3.079650592508152e-05, + "loss": 0.016, + "step": 37375 + }, + { + "epoch": 11.154190277363554, + "grad_norm": 0.043260324746370316, + "learning_rate": 3.077993690517218e-05, + "loss": 0.0195, + "step": 37400 + }, + { + "epoch": 11.161646286907247, + "grad_norm": 0.0005007116124033928, + "learning_rate": 3.076336788526285e-05, + "loss": 0.001, + "step": 37425 + }, + { + "epoch": 11.169102296450939, + "grad_norm": 0.003123112255707383, + "learning_rate": 3.0746798865353514e-05, + "loss": 0.0003, + "step": 37450 + }, + { + "epoch": 11.176558305994632, + "grad_norm": 0.1323997676372528, + "learning_rate": 3.073022984544418e-05, + "loss": 0.0171, + "step": 37475 + }, + { + "epoch": 11.184014315538324, + "grad_norm": 0.00898793339729309, + "learning_rate": 3.071366082553485e-05, + "loss": 0.0087, + "step": 37500 + }, + { + "epoch": 11.191470325082015, + "grad_norm": 0.004661829676479101, + "learning_rate": 3.0697091805625515e-05, + "loss": 0.0006, + "step": 37525 + }, + { + "epoch": 11.198926334625709, + "grad_norm": 0.0023466881830245256, + "learning_rate": 3.0680522785716184e-05, + "loss": 0.0259, + "step": 37550 + }, + { + "epoch": 11.2063823441694, + "grad_norm": 0.016425127163529396, + "learning_rate": 3.0663953765806846e-05, + "loss": 0.0683, + "step": 37575 + }, + { + "epoch": 11.213838353713093, + "grad_norm": 0.12483184039592743, + "learning_rate": 3.064738474589751e-05, + "loss": 0.0005, + "step": 37600 + }, + { + "epoch": 11.221294363256785, + "grad_norm": 0.013576678931713104, + "learning_rate": 3.063081572598818e-05, + "loss": 0.0043, + "step": 37625 + }, + { + "epoch": 11.228750372800476, + "grad_norm": 0.025191502645611763, + "learning_rate": 3.061424670607884e-05, + "loss": 0.0029, + "step": 37650 + }, + { + "epoch": 11.23620638234417, + "grad_norm": 0.007174923084676266, + "learning_rate": 3.059767768616951e-05, + "loss": 0.043, + "step": 37675 + }, + { + "epoch": 11.243662391887861, + "grad_norm": 27.290576934814453, + "learning_rate": 
3.058110866626018e-05, + "loss": 0.0149, + "step": 37700 + }, + { + "epoch": 11.251118401431555, + "grad_norm": 0.13593660295009613, + "learning_rate": 3.056453964635084e-05, + "loss": 0.0061, + "step": 37725 + }, + { + "epoch": 11.258574410975246, + "grad_norm": 1.3596652746200562, + "learning_rate": 3.054797062644151e-05, + "loss": 0.0196, + "step": 37750 + }, + { + "epoch": 11.266030420518938, + "grad_norm": 0.007876387797296047, + "learning_rate": 3.053140160653217e-05, + "loss": 0.0035, + "step": 37775 + }, + { + "epoch": 11.273486430062631, + "grad_norm": 0.016628660261631012, + "learning_rate": 3.0514832586622838e-05, + "loss": 0.0015, + "step": 37800 + }, + { + "epoch": 11.280942439606322, + "grad_norm": 19.705419540405273, + "learning_rate": 3.0498926327509875e-05, + "loss": 0.0885, + "step": 37825 + }, + { + "epoch": 11.288398449150016, + "grad_norm": 0.007640378549695015, + "learning_rate": 3.048235730760054e-05, + "loss": 0.0057, + "step": 37850 + }, + { + "epoch": 11.295854458693707, + "grad_norm": 0.044530533254146576, + "learning_rate": 3.046578828769121e-05, + "loss": 0.0026, + "step": 37875 + }, + { + "epoch": 11.303310468237399, + "grad_norm": 0.008475979790091515, + "learning_rate": 3.0449219267781875e-05, + "loss": 0.0099, + "step": 37900 + }, + { + "epoch": 11.310766477781092, + "grad_norm": 0.008451344445347786, + "learning_rate": 3.043265024787254e-05, + "loss": 0.0013, + "step": 37925 + }, + { + "epoch": 11.318222487324784, + "grad_norm": 0.0787351205945015, + "learning_rate": 3.0416081227963207e-05, + "loss": 0.0322, + "step": 37950 + }, + { + "epoch": 11.325678496868475, + "grad_norm": 0.13200482726097107, + "learning_rate": 3.0399512208053872e-05, + "loss": 0.0002, + "step": 37975 + }, + { + "epoch": 11.333134506412168, + "grad_norm": 0.15154162049293518, + "learning_rate": 3.0382943188144535e-05, + "loss": 0.0274, + "step": 38000 + }, + { + "epoch": 11.34059051595586, + "grad_norm": 0.18614114820957184, + "learning_rate": 3.03663741682352e-05, + "loss": 0.0056, + "step": 38025 + }, + { + "epoch": 11.348046525499553, + "grad_norm": 0.02761516161262989, + "learning_rate": 3.0349805148325866e-05, + "loss": 0.0028, + "step": 38050 + }, + { + "epoch": 11.355502535043245, + "grad_norm": 0.002439249772578478, + "learning_rate": 3.0333236128416532e-05, + "loss": 0.0038, + "step": 38075 + }, + { + "epoch": 11.362958544586936, + "grad_norm": 0.03615495190024376, + "learning_rate": 3.0316667108507198e-05, + "loss": 0.0226, + "step": 38100 + }, + { + "epoch": 11.37041455413063, + "grad_norm": 0.007680293172597885, + "learning_rate": 3.0300098088597867e-05, + "loss": 0.0394, + "step": 38125 + }, + { + "epoch": 11.377870563674321, + "grad_norm": 0.010748565196990967, + "learning_rate": 3.0283529068688533e-05, + "loss": 0.0169, + "step": 38150 + }, + { + "epoch": 11.385326573218014, + "grad_norm": 0.008955719880759716, + "learning_rate": 3.0266960048779198e-05, + "loss": 0.0071, + "step": 38175 + }, + { + "epoch": 11.392782582761706, + "grad_norm": 0.03065124712884426, + "learning_rate": 3.025039102886986e-05, + "loss": 0.0277, + "step": 38200 + }, + { + "epoch": 11.400238592305397, + "grad_norm": 0.0015641790814697742, + "learning_rate": 3.0234484769756898e-05, + "loss": 0.0472, + "step": 38225 + }, + { + "epoch": 11.40769460184909, + "grad_norm": 0.012706178240478039, + "learning_rate": 3.0217915749847563e-05, + "loss": 0.0035, + "step": 38250 + }, + { + "epoch": 11.415150611392782, + "grad_norm": 0.058647219091653824, + "learning_rate": 3.0201346729938236e-05, + "loss": 
0.0022, + "step": 38275 + }, + { + "epoch": 11.422606620936476, + "grad_norm": 0.32084959745407104, + "learning_rate": 3.0184777710028898e-05, + "loss": 0.0311, + "step": 38300 + }, + { + "epoch": 11.430062630480167, + "grad_norm": 0.001740424195304513, + "learning_rate": 3.0168208690119564e-05, + "loss": 0.001, + "step": 38325 + }, + { + "epoch": 11.437518640023859, + "grad_norm": 0.01977064087986946, + "learning_rate": 3.015163967021023e-05, + "loss": 0.0064, + "step": 38350 + }, + { + "epoch": 11.444974649567552, + "grad_norm": 0.012652015313506126, + "learning_rate": 3.0135070650300895e-05, + "loss": 0.0038, + "step": 38375 + }, + { + "epoch": 11.452430659111243, + "grad_norm": 0.029624788090586662, + "learning_rate": 3.011850163039156e-05, + "loss": 0.0294, + "step": 38400 + }, + { + "epoch": 11.459886668654935, + "grad_norm": 0.729141891002655, + "learning_rate": 3.0101932610482224e-05, + "loss": 0.005, + "step": 38425 + }, + { + "epoch": 11.467342678198628, + "grad_norm": 0.21458280086517334, + "learning_rate": 3.008536359057289e-05, + "loss": 0.0201, + "step": 38450 + }, + { + "epoch": 11.47479868774232, + "grad_norm": 0.018610941246151924, + "learning_rate": 3.0068794570663555e-05, + "loss": 0.0118, + "step": 38475 + }, + { + "epoch": 11.482254697286013, + "grad_norm": 3.399533987045288, + "learning_rate": 3.005222555075422e-05, + "loss": 0.0311, + "step": 38500 + }, + { + "epoch": 11.489710706829705, + "grad_norm": 0.16793139278888702, + "learning_rate": 3.003565653084489e-05, + "loss": 0.0095, + "step": 38525 + }, + { + "epoch": 11.497166716373396, + "grad_norm": 0.3863959312438965, + "learning_rate": 3.0019087510935556e-05, + "loss": 0.0268, + "step": 38550 + }, + { + "epoch": 11.50462272591709, + "grad_norm": 0.005762874614447355, + "learning_rate": 3.0003181251822593e-05, + "loss": 0.0849, + "step": 38575 + }, + { + "epoch": 11.512078735460781, + "grad_norm": 0.061609476804733276, + "learning_rate": 2.998661223191326e-05, + "loss": 0.0359, + "step": 38600 + }, + { + "epoch": 11.519534745004474, + "grad_norm": 0.2524660527706146, + "learning_rate": 2.9970043212003924e-05, + "loss": 0.0463, + "step": 38625 + }, + { + "epoch": 11.526990754548166, + "grad_norm": 0.03191876783967018, + "learning_rate": 2.995347419209459e-05, + "loss": 0.0268, + "step": 38650 + }, + { + "epoch": 11.534446764091857, + "grad_norm": 0.32526031136512756, + "learning_rate": 2.993690517218526e-05, + "loss": 0.0362, + "step": 38675 + }, + { + "epoch": 11.54190277363555, + "grad_norm": 0.0009662279044277966, + "learning_rate": 2.9920336152275925e-05, + "loss": 0.0105, + "step": 38700 + }, + { + "epoch": 11.549358783179242, + "grad_norm": 0.9750592112541199, + "learning_rate": 2.990376713236659e-05, + "loss": 0.0283, + "step": 38725 + }, + { + "epoch": 11.556814792722935, + "grad_norm": 0.028177186846733093, + "learning_rate": 2.9887198112457253e-05, + "loss": 0.0169, + "step": 38750 + }, + { + "epoch": 11.564270802266627, + "grad_norm": 0.02503358945250511, + "learning_rate": 2.987062909254792e-05, + "loss": 0.0387, + "step": 38775 + }, + { + "epoch": 11.571726811810318, + "grad_norm": 79.97856903076172, + "learning_rate": 2.9854060072638584e-05, + "loss": 0.0216, + "step": 38800 + }, + { + "epoch": 11.579182821354012, + "grad_norm": 0.03171137720346451, + "learning_rate": 2.983749105272925e-05, + "loss": 0.0476, + "step": 38825 + }, + { + "epoch": 11.586638830897703, + "grad_norm": 0.011583199724555016, + "learning_rate": 2.9820922032819916e-05, + "loss": 0.0549, + "step": 38850 + }, + { + "epoch": 
11.594094840441397, + "grad_norm": 0.04316306859254837, + "learning_rate": 2.9804353012910578e-05, + "loss": 0.0004, + "step": 38875 + }, + { + "epoch": 11.601550849985088, + "grad_norm": 0.12726345658302307, + "learning_rate": 2.9787783993001244e-05, + "loss": 0.0019, + "step": 38900 + }, + { + "epoch": 11.60900685952878, + "grad_norm": 0.0003953992563765496, + "learning_rate": 2.9771214973091916e-05, + "loss": 0.008, + "step": 38925 + }, + { + "epoch": 11.616462869072473, + "grad_norm": 0.003280686680227518, + "learning_rate": 2.9754645953182582e-05, + "loss": 0.0462, + "step": 38950 + }, + { + "epoch": 11.623918878616164, + "grad_norm": 0.09019768238067627, + "learning_rate": 2.9738076933273244e-05, + "loss": 0.0135, + "step": 38975 + }, + { + "epoch": 11.631374888159858, + "grad_norm": 0.009447668679058552, + "learning_rate": 2.972150791336391e-05, + "loss": 0.0133, + "step": 39000 + }, + { + "epoch": 11.63883089770355, + "grad_norm": 0.00495525635778904, + "learning_rate": 2.9704938893454576e-05, + "loss": 0.0032, + "step": 39025 + }, + { + "epoch": 11.64628690724724, + "grad_norm": 0.3824717402458191, + "learning_rate": 2.968836987354524e-05, + "loss": 0.0177, + "step": 39050 + }, + { + "epoch": 11.653742916790934, + "grad_norm": 0.013165218755602837, + "learning_rate": 2.9671800853635907e-05, + "loss": 0.0058, + "step": 39075 + }, + { + "epoch": 11.661198926334626, + "grad_norm": 0.016493534669280052, + "learning_rate": 2.965523183372657e-05, + "loss": 0.012, + "step": 39100 + }, + { + "epoch": 11.668654935878317, + "grad_norm": 0.02720525860786438, + "learning_rate": 2.9638662813817235e-05, + "loss": 0.0259, + "step": 39125 + }, + { + "epoch": 11.67611094542201, + "grad_norm": 0.012515093199908733, + "learning_rate": 2.9622093793907908e-05, + "loss": 0.0312, + "step": 39150 + }, + { + "epoch": 11.683566954965702, + "grad_norm": 0.011289069429039955, + "learning_rate": 2.9605524773998574e-05, + "loss": 0.0499, + "step": 39175 + }, + { + "epoch": 11.691022964509395, + "grad_norm": 8.175752639770508, + "learning_rate": 2.9588955754089236e-05, + "loss": 0.0021, + "step": 39200 + }, + { + "epoch": 11.698478974053087, + "grad_norm": 0.00964757427573204, + "learning_rate": 2.95723867341799e-05, + "loss": 0.0068, + "step": 39225 + }, + { + "epoch": 11.705934983596778, + "grad_norm": 1.0415167808532715, + "learning_rate": 2.9555817714270567e-05, + "loss": 0.0199, + "step": 39250 + }, + { + "epoch": 11.713390993140472, + "grad_norm": 0.3101750612258911, + "learning_rate": 2.9539248694361233e-05, + "loss": 0.0235, + "step": 39275 + }, + { + "epoch": 11.720847002684163, + "grad_norm": 0.002900744555518031, + "learning_rate": 2.95226796744519e-05, + "loss": 0.0138, + "step": 39300 + }, + { + "epoch": 11.728303012227856, + "grad_norm": 0.907034158706665, + "learning_rate": 2.950611065454256e-05, + "loss": 0.0401, + "step": 39325 + }, + { + "epoch": 11.735759021771548, + "grad_norm": 0.24928085505962372, + "learning_rate": 2.9489541634633227e-05, + "loss": 0.0124, + "step": 39350 + }, + { + "epoch": 11.74321503131524, + "grad_norm": 0.009192919358611107, + "learning_rate": 2.9472972614723893e-05, + "loss": 0.0254, + "step": 39375 + }, + { + "epoch": 11.750671040858933, + "grad_norm": 0.11580682545900345, + "learning_rate": 2.9456403594814562e-05, + "loss": 0.0157, + "step": 39400 + }, + { + "epoch": 11.758127050402624, + "grad_norm": 0.0013650426408275962, + "learning_rate": 2.9439834574905228e-05, + "loss": 0.0004, + "step": 39425 + }, + { + "epoch": 11.765583059946316, + "grad_norm": 
0.02957271784543991, + "learning_rate": 2.9423265554995893e-05, + "loss": 0.0122, + "step": 39450 + }, + { + "epoch": 11.77303906949001, + "grad_norm": 0.004242660012096167, + "learning_rate": 2.940669653508656e-05, + "loss": 0.0008, + "step": 39475 + }, + { + "epoch": 11.7804950790337, + "grad_norm": 0.04420284926891327, + "learning_rate": 2.9390127515177225e-05, + "loss": 0.0015, + "step": 39500 + }, + { + "epoch": 11.787951088577394, + "grad_norm": 0.0010515704052522779, + "learning_rate": 2.937355849526789e-05, + "loss": 0.0013, + "step": 39525 + }, + { + "epoch": 11.795407098121085, + "grad_norm": 0.2385762482881546, + "learning_rate": 2.9356989475358553e-05, + "loss": 0.0434, + "step": 39550 + }, + { + "epoch": 11.802863107664777, + "grad_norm": 0.021942714229226112, + "learning_rate": 2.934042045544922e-05, + "loss": 0.0005, + "step": 39575 + }, + { + "epoch": 11.81031911720847, + "grad_norm": 0.02746811881661415, + "learning_rate": 2.9323851435539884e-05, + "loss": 0.0014, + "step": 39600 + }, + { + "epoch": 11.817775126752162, + "grad_norm": 0.013176214881241322, + "learning_rate": 2.9307282415630553e-05, + "loss": 0.0065, + "step": 39625 + }, + { + "epoch": 11.825231136295855, + "grad_norm": 0.0018259455682709813, + "learning_rate": 2.929071339572122e-05, + "loss": 0.0247, + "step": 39650 + }, + { + "epoch": 11.832687145839547, + "grad_norm": 0.0467984676361084, + "learning_rate": 2.9274144375811885e-05, + "loss": 0.0253, + "step": 39675 + }, + { + "epoch": 11.840143155383238, + "grad_norm": 0.0014794999733567238, + "learning_rate": 2.925757535590255e-05, + "loss": 0.0002, + "step": 39700 + }, + { + "epoch": 11.847599164926931, + "grad_norm": 0.7367884516716003, + "learning_rate": 2.9241006335993216e-05, + "loss": 0.0039, + "step": 39725 + }, + { + "epoch": 11.855055174470623, + "grad_norm": 0.0043684993870556355, + "learning_rate": 2.922443731608388e-05, + "loss": 0.0408, + "step": 39750 + }, + { + "epoch": 11.862511184014316, + "grad_norm": 13.370290756225586, + "learning_rate": 2.9207868296174544e-05, + "loss": 0.0133, + "step": 39775 + }, + { + "epoch": 11.869967193558008, + "grad_norm": 0.04149283468723297, + "learning_rate": 2.919129927626521e-05, + "loss": 0.009, + "step": 39800 + }, + { + "epoch": 11.8774232031017, + "grad_norm": 0.0070778988301754, + "learning_rate": 2.9174730256355876e-05, + "loss": 0.0003, + "step": 39825 + }, + { + "epoch": 11.884879212645393, + "grad_norm": 0.13461697101593018, + "learning_rate": 2.915816123644654e-05, + "loss": 0.0176, + "step": 39850 + }, + { + "epoch": 11.892335222189084, + "grad_norm": 0.06489887088537216, + "learning_rate": 2.914159221653721e-05, + "loss": 0.0796, + "step": 39875 + }, + { + "epoch": 11.899791231732777, + "grad_norm": 0.25228577852249146, + "learning_rate": 2.9125023196627876e-05, + "loss": 0.0176, + "step": 39900 + }, + { + "epoch": 11.907247241276469, + "grad_norm": 0.02081029862165451, + "learning_rate": 2.9108454176718542e-05, + "loss": 0.0103, + "step": 39925 + }, + { + "epoch": 11.91470325082016, + "grad_norm": 0.0007288819178938866, + "learning_rate": 2.9091885156809208e-05, + "loss": 0.0224, + "step": 39950 + }, + { + "epoch": 11.922159260363854, + "grad_norm": 0.45741400122642517, + "learning_rate": 2.907531613689987e-05, + "loss": 0.0431, + "step": 39975 + }, + { + "epoch": 11.929615269907545, + "grad_norm": 0.0039330171421170235, + "learning_rate": 2.9058747116990536e-05, + "loss": 0.005, + "step": 40000 + }, + { + "epoch": 11.937071279451239, + "grad_norm": 0.046019960194826126, + "learning_rate": 
2.90421780970812e-05, + "loss": 0.0065, + "step": 40025 + }, + { + "epoch": 11.94452728899493, + "grad_norm": 0.059516049921512604, + "learning_rate": 2.9025609077171867e-05, + "loss": 0.0031, + "step": 40050 + }, + { + "epoch": 11.951983298538622, + "grad_norm": 0.005144066177308559, + "learning_rate": 2.9009040057262533e-05, + "loss": 0.0752, + "step": 40075 + }, + { + "epoch": 11.959439308082315, + "grad_norm": 0.0025731483474373817, + "learning_rate": 2.8992471037353195e-05, + "loss": 0.0682, + "step": 40100 + }, + { + "epoch": 11.966895317626006, + "grad_norm": 0.0010169928427785635, + "learning_rate": 2.8975902017443868e-05, + "loss": 0.0004, + "step": 40125 + }, + { + "epoch": 11.974351327169698, + "grad_norm": 0.1493087112903595, + "learning_rate": 2.8959332997534534e-05, + "loss": 0.0745, + "step": 40150 + }, + { + "epoch": 11.981807336713391, + "grad_norm": 0.024421213194727898, + "learning_rate": 2.89427639776252e-05, + "loss": 0.0019, + "step": 40175 + }, + { + "epoch": 11.989263346257083, + "grad_norm": 0.01383435633033514, + "learning_rate": 2.8926194957715862e-05, + "loss": 0.0031, + "step": 40200 + }, + { + "epoch": 11.996719355800776, + "grad_norm": 0.015260148793458939, + "learning_rate": 2.8909625937806528e-05, + "loss": 0.0206, + "step": 40225 + }, + { + "epoch": 12.0, + "eval_gen_len": 8.7358, + "eval_loss": 0.07119181752204895, + "eval_rouge1": 97.3656, + "eval_rouge2": 84.7781, + "eval_rougeL": 97.3473, + "eval_rougeLsum": 97.3476, + "eval_runtime": 100.8839, + "eval_samples_per_second": 16.623, + "eval_steps_per_second": 4.163, + "step": 40236 + }, + { + "epoch": 12.004175365344468, + "grad_norm": 0.02059837430715561, + "learning_rate": 2.8893056917897193e-05, + "loss": 0.017, + "step": 40250 + }, + { + "epoch": 12.01163137488816, + "grad_norm": 13.741105079650879, + "learning_rate": 2.887648789798786e-05, + "loss": 0.0102, + "step": 40275 + }, + { + "epoch": 12.019087384431852, + "grad_norm": 0.038666047155857086, + "learning_rate": 2.8859918878078525e-05, + "loss": 0.0228, + "step": 40300 + }, + { + "epoch": 12.026543393975544, + "grad_norm": 0.004178918898105621, + "learning_rate": 2.8843349858169187e-05, + "loss": 0.0009, + "step": 40325 + }, + { + "epoch": 12.033999403519237, + "grad_norm": 0.0011557178804650903, + "learning_rate": 2.882678083825986e-05, + "loss": 0.0194, + "step": 40350 + }, + { + "epoch": 12.041455413062929, + "grad_norm": 0.11613950878381729, + "learning_rate": 2.8810211818350525e-05, + "loss": 0.0164, + "step": 40375 + }, + { + "epoch": 12.04891142260662, + "grad_norm": 0.0020942033734172583, + "learning_rate": 2.879364279844119e-05, + "loss": 0.0246, + "step": 40400 + }, + { + "epoch": 12.056367432150314, + "grad_norm": 0.0037937331944704056, + "learning_rate": 2.8777073778531853e-05, + "loss": 0.0229, + "step": 40425 + }, + { + "epoch": 12.063823441694005, + "grad_norm": 0.47289687395095825, + "learning_rate": 2.876050475862252e-05, + "loss": 0.0003, + "step": 40450 + }, + { + "epoch": 12.071279451237698, + "grad_norm": 0.0333111509680748, + "learning_rate": 2.8743935738713185e-05, + "loss": 0.0011, + "step": 40475 + }, + { + "epoch": 12.07873546078139, + "grad_norm": 0.3923133909702301, + "learning_rate": 2.872736671880385e-05, + "loss": 0.0037, + "step": 40500 + }, + { + "epoch": 12.086191470325081, + "grad_norm": 0.006680936552584171, + "learning_rate": 2.8710797698894516e-05, + "loss": 0.021, + "step": 40525 + }, + { + "epoch": 12.093647479868775, + "grad_norm": 0.010220406576991081, + "learning_rate": 2.869422867898518e-05, + "loss": 
0.0013, + "step": 40550 + }, + { + "epoch": 12.101103489412466, + "grad_norm": 0.026769593358039856, + "learning_rate": 2.8677659659075844e-05, + "loss": 0.0036, + "step": 40575 + }, + { + "epoch": 12.108559498956158, + "grad_norm": 0.006459955126047134, + "learning_rate": 2.8661090639166517e-05, + "loss": 0.001, + "step": 40600 + }, + { + "epoch": 12.116015508499851, + "grad_norm": 0.5409096479415894, + "learning_rate": 2.864452161925718e-05, + "loss": 0.02, + "step": 40625 + }, + { + "epoch": 12.123471518043543, + "grad_norm": 0.009102045558393002, + "learning_rate": 2.8627952599347845e-05, + "loss": 0.0743, + "step": 40650 + }, + { + "epoch": 12.130927527587236, + "grad_norm": 0.04457883909344673, + "learning_rate": 2.861138357943851e-05, + "loss": 0.0082, + "step": 40675 + }, + { + "epoch": 12.138383537130927, + "grad_norm": 0.009046703577041626, + "learning_rate": 2.8594814559529176e-05, + "loss": 0.0008, + "step": 40700 + }, + { + "epoch": 12.145839546674619, + "grad_norm": 21.693180084228516, + "learning_rate": 2.8578245539619842e-05, + "loss": 0.0645, + "step": 40725 + }, + { + "epoch": 12.153295556218312, + "grad_norm": 0.010429131798446178, + "learning_rate": 2.8561676519710508e-05, + "loss": 0.0301, + "step": 40750 + }, + { + "epoch": 12.160751565762004, + "grad_norm": 0.11445409804582596, + "learning_rate": 2.854510749980117e-05, + "loss": 0.0078, + "step": 40775 + }, + { + "epoch": 12.168207575305697, + "grad_norm": 0.001903259544633329, + "learning_rate": 2.8528538479891836e-05, + "loss": 0.0154, + "step": 40800 + }, + { + "epoch": 12.175663584849389, + "grad_norm": 0.0068878475576639175, + "learning_rate": 2.851196945998251e-05, + "loss": 0.0041, + "step": 40825 + }, + { + "epoch": 12.18311959439308, + "grad_norm": 0.0009471174562349916, + "learning_rate": 2.849540044007317e-05, + "loss": 0.0211, + "step": 40850 + }, + { + "epoch": 12.190575603936773, + "grad_norm": 0.0013344428734853864, + "learning_rate": 2.8478831420163837e-05, + "loss": 0.0003, + "step": 40875 + }, + { + "epoch": 12.198031613480465, + "grad_norm": 4.818425178527832, + "learning_rate": 2.8462262400254502e-05, + "loss": 0.0184, + "step": 40900 + }, + { + "epoch": 12.205487623024158, + "grad_norm": 0.0003423032758291811, + "learning_rate": 2.8445693380345168e-05, + "loss": 0.0222, + "step": 40925 + }, + { + "epoch": 12.21294363256785, + "grad_norm": 0.01015832182019949, + "learning_rate": 2.8429124360435834e-05, + "loss": 0.0044, + "step": 40950 + }, + { + "epoch": 12.220399642111541, + "grad_norm": 0.00136271002702415, + "learning_rate": 2.8412555340526496e-05, + "loss": 0.0027, + "step": 40975 + }, + { + "epoch": 12.227855651655235, + "grad_norm": 0.033610399812459946, + "learning_rate": 2.8395986320617162e-05, + "loss": 0.0345, + "step": 41000 + }, + { + "epoch": 12.235311661198926, + "grad_norm": 0.0037056605797261, + "learning_rate": 2.8379417300707828e-05, + "loss": 0.0032, + "step": 41025 + }, + { + "epoch": 12.24276767074262, + "grad_norm": 0.02005760185420513, + "learning_rate": 2.8362848280798493e-05, + "loss": 0.0306, + "step": 41050 + }, + { + "epoch": 12.250223680286311, + "grad_norm": 0.9534538984298706, + "learning_rate": 2.8346279260889162e-05, + "loss": 0.0531, + "step": 41075 + }, + { + "epoch": 12.257679689830002, + "grad_norm": 26.52764892578125, + "learning_rate": 2.8329710240979828e-05, + "loss": 0.0089, + "step": 41100 + }, + { + "epoch": 12.265135699373696, + "grad_norm": 12.732146263122559, + "learning_rate": 2.8313141221070494e-05, + "loss": 0.0057, + "step": 41125 + }, + { + 
"epoch": 12.272591708917387, + "grad_norm": 0.009532719850540161, + "learning_rate": 2.829657220116116e-05, + "loss": 0.0027, + "step": 41150 + }, + { + "epoch": 12.280047718461079, + "grad_norm": 0.2986721694469452, + "learning_rate": 2.8280003181251825e-05, + "loss": 0.0284, + "step": 41175 + }, + { + "epoch": 12.287503728004772, + "grad_norm": 0.010956763289868832, + "learning_rate": 2.8263434161342488e-05, + "loss": 0.0321, + "step": 41200 + }, + { + "epoch": 12.294959737548464, + "grad_norm": 0.010674208402633667, + "learning_rate": 2.8246865141433153e-05, + "loss": 0.0036, + "step": 41225 + }, + { + "epoch": 12.302415747092157, + "grad_norm": 0.00191806023940444, + "learning_rate": 2.823029612152382e-05, + "loss": 0.0047, + "step": 41250 + }, + { + "epoch": 12.309871756635848, + "grad_norm": 0.03268317133188248, + "learning_rate": 2.8213727101614485e-05, + "loss": 0.0026, + "step": 41275 + }, + { + "epoch": 12.31732776617954, + "grad_norm": 0.018544087186455727, + "learning_rate": 2.8197158081705154e-05, + "loss": 0.0022, + "step": 41300 + }, + { + "epoch": 12.324783775723233, + "grad_norm": 0.04845629632472992, + "learning_rate": 2.818058906179582e-05, + "loss": 0.0147, + "step": 41325 + }, + { + "epoch": 12.332239785266925, + "grad_norm": 0.3336451053619385, + "learning_rate": 2.8164020041886485e-05, + "loss": 0.0272, + "step": 41350 + }, + { + "epoch": 12.339695794810618, + "grad_norm": 0.0033389104064553976, + "learning_rate": 2.814745102197715e-05, + "loss": 0.04, + "step": 41375 + }, + { + "epoch": 12.34715180435431, + "grad_norm": 0.0076507944613695145, + "learning_rate": 2.8130882002067817e-05, + "loss": 0.0039, + "step": 41400 + }, + { + "epoch": 12.354607813898001, + "grad_norm": 22.395612716674805, + "learning_rate": 2.811431298215848e-05, + "loss": 0.005, + "step": 41425 + }, + { + "epoch": 12.362063823441694, + "grad_norm": 0.0016582268290221691, + "learning_rate": 2.8097743962249145e-05, + "loss": 0.0293, + "step": 41450 + }, + { + "epoch": 12.369519832985386, + "grad_norm": 11.2319974899292, + "learning_rate": 2.808117494233981e-05, + "loss": 0.0066, + "step": 41475 + }, + { + "epoch": 12.37697584252908, + "grad_norm": 2.7760393619537354, + "learning_rate": 2.8064605922430476e-05, + "loss": 0.0015, + "step": 41500 + }, + { + "epoch": 12.38443185207277, + "grad_norm": 0.003564919577911496, + "learning_rate": 2.8048036902521142e-05, + "loss": 0.002, + "step": 41525 + }, + { + "epoch": 12.391887861616462, + "grad_norm": 0.002883300883695483, + "learning_rate": 2.803146788261181e-05, + "loss": 0.0022, + "step": 41550 + }, + { + "epoch": 12.399343871160156, + "grad_norm": 0.1490948647260666, + "learning_rate": 2.8014898862702477e-05, + "loss": 0.0006, + "step": 41575 + }, + { + "epoch": 12.406799880703847, + "grad_norm": 0.004211048129945993, + "learning_rate": 2.7998329842793143e-05, + "loss": 0.0007, + "step": 41600 + }, + { + "epoch": 12.414255890247539, + "grad_norm": 0.008610145188868046, + "learning_rate": 2.798176082288381e-05, + "loss": 0.0316, + "step": 41625 + }, + { + "epoch": 12.421711899791232, + "grad_norm": 0.023798583075404167, + "learning_rate": 2.796519180297447e-05, + "loss": 0.0136, + "step": 41650 + }, + { + "epoch": 12.429167909334923, + "grad_norm": 0.07987093180418015, + "learning_rate": 2.7948622783065137e-05, + "loss": 0.0008, + "step": 41675 + }, + { + "epoch": 12.436623918878617, + "grad_norm": 0.48281994462013245, + "learning_rate": 2.7932053763155802e-05, + "loss": 0.0188, + "step": 41700 + }, + { + "epoch": 12.444079928422308, + "grad_norm": 
0.00766774220392108, + "learning_rate": 2.7915484743246468e-05, + "loss": 0.0128, + "step": 41725 + }, + { + "epoch": 12.451535937966, + "grad_norm": 0.025086617097258568, + "learning_rate": 2.7898915723337134e-05, + "loss": 0.0431, + "step": 41750 + }, + { + "epoch": 12.458991947509693, + "grad_norm": 0.012511699460446835, + "learning_rate": 2.7882346703427796e-05, + "loss": 0.0017, + "step": 41775 + }, + { + "epoch": 12.466447957053385, + "grad_norm": 0.0011564865708351135, + "learning_rate": 2.786577768351847e-05, + "loss": 0.0011, + "step": 41800 + }, + { + "epoch": 12.473903966597078, + "grad_norm": 0.65472412109375, + "learning_rate": 2.7849208663609134e-05, + "loss": 0.019, + "step": 41825 + }, + { + "epoch": 12.48135997614077, + "grad_norm": 0.005623379722237587, + "learning_rate": 2.7832639643699797e-05, + "loss": 0.0303, + "step": 41850 + }, + { + "epoch": 12.488815985684461, + "grad_norm": 0.018389273434877396, + "learning_rate": 2.7816070623790462e-05, + "loss": 0.0032, + "step": 41875 + }, + { + "epoch": 12.496271995228154, + "grad_norm": 0.0019741118885576725, + "learning_rate": 2.7799501603881128e-05, + "loss": 0.0167, + "step": 41900 + }, + { + "epoch": 12.503728004771846, + "grad_norm": 0.1198592334985733, + "learning_rate": 2.7782932583971794e-05, + "loss": 0.0053, + "step": 41925 + }, + { + "epoch": 12.511184014315539, + "grad_norm": 0.03189831227064133, + "learning_rate": 2.776636356406246e-05, + "loss": 0.0043, + "step": 41950 + }, + { + "epoch": 12.51864002385923, + "grad_norm": 0.12729355692863464, + "learning_rate": 2.7749794544153125e-05, + "loss": 0.001, + "step": 41975 + }, + { + "epoch": 12.526096033402922, + "grad_norm": 0.00783438328653574, + "learning_rate": 2.7733225524243788e-05, + "loss": 0.012, + "step": 42000 + }, + { + "epoch": 12.533552042946615, + "grad_norm": 0.024893639609217644, + "learning_rate": 2.771665650433446e-05, + "loss": 0.0283, + "step": 42025 + }, + { + "epoch": 12.541008052490307, + "grad_norm": 0.016918212175369263, + "learning_rate": 2.7700087484425126e-05, + "loss": 0.0006, + "step": 42050 + }, + { + "epoch": 12.548464062034, + "grad_norm": 0.03593961149454117, + "learning_rate": 2.7683518464515788e-05, + "loss": 0.0196, + "step": 42075 + }, + { + "epoch": 12.555920071577692, + "grad_norm": 0.024990715086460114, + "learning_rate": 2.7666949444606454e-05, + "loss": 0.0039, + "step": 42100 + }, + { + "epoch": 12.563376081121383, + "grad_norm": 0.09220023453235626, + "learning_rate": 2.765038042469712e-05, + "loss": 0.0034, + "step": 42125 + }, + { + "epoch": 12.570832090665077, + "grad_norm": 0.007467248011380434, + "learning_rate": 2.7633811404787785e-05, + "loss": 0.0014, + "step": 42150 + }, + { + "epoch": 12.578288100208768, + "grad_norm": 0.0017490466125309467, + "learning_rate": 2.761724238487845e-05, + "loss": 0.001, + "step": 42175 + }, + { + "epoch": 12.585744109752461, + "grad_norm": 0.08711958676576614, + "learning_rate": 2.7600673364969113e-05, + "loss": 0.0216, + "step": 42200 + }, + { + "epoch": 12.593200119296153, + "grad_norm": 0.05300259217619896, + "learning_rate": 2.758410434505978e-05, + "loss": 0.0161, + "step": 42225 + }, + { + "epoch": 12.600656128839844, + "grad_norm": 0.009109907783567905, + "learning_rate": 2.7567535325150445e-05, + "loss": 0.0456, + "step": 42250 + }, + { + "epoch": 12.608112138383538, + "grad_norm": 1.0785945653915405, + "learning_rate": 2.7550966305241117e-05, + "loss": 0.0015, + "step": 42275 + }, + { + "epoch": 12.61556814792723, + "grad_norm": 0.0029738135635852814, + "learning_rate": 
2.753439728533178e-05, + "loss": 0.0102, + "step": 42300 + }, + { + "epoch": 12.62302415747092, + "grad_norm": 0.0014897419605404139, + "learning_rate": 2.7517828265422446e-05, + "loss": 0.0029, + "step": 42325 + }, + { + "epoch": 12.630480167014614, + "grad_norm": 0.0512629896402359, + "learning_rate": 2.750125924551311e-05, + "loss": 0.0007, + "step": 42350 + }, + { + "epoch": 12.637936176558306, + "grad_norm": 0.0043296511285007, + "learning_rate": 2.7484690225603777e-05, + "loss": 0.0004, + "step": 42375 + }, + { + "epoch": 12.645392186101999, + "grad_norm": 0.005691382568329573, + "learning_rate": 2.7468121205694443e-05, + "loss": 0.01, + "step": 42400 + }, + { + "epoch": 12.65284819564569, + "grad_norm": 0.004919271916151047, + "learning_rate": 2.7451552185785105e-05, + "loss": 0.0016, + "step": 42425 + }, + { + "epoch": 12.660304205189382, + "grad_norm": 2.6745660305023193, + "learning_rate": 2.743498316587577e-05, + "loss": 0.037, + "step": 42450 + }, + { + "epoch": 12.667760214733075, + "grad_norm": 0.005345212761312723, + "learning_rate": 2.7418414145966437e-05, + "loss": 0.0026, + "step": 42475 + }, + { + "epoch": 12.675216224276767, + "grad_norm": 0.04073727875947952, + "learning_rate": 2.740184512605711e-05, + "loss": 0.0007, + "step": 42500 + }, + { + "epoch": 12.68267223382046, + "grad_norm": 0.001461253734305501, + "learning_rate": 2.738527610614777e-05, + "loss": 0.0206, + "step": 42525 + }, + { + "epoch": 12.690128243364152, + "grad_norm": 0.010080489329993725, + "learning_rate": 2.7368707086238437e-05, + "loss": 0.0007, + "step": 42550 + }, + { + "epoch": 12.697584252907843, + "grad_norm": 0.0009966216748580337, + "learning_rate": 2.7352138066329103e-05, + "loss": 0.0004, + "step": 42575 + }, + { + "epoch": 12.705040262451536, + "grad_norm": 0.04050817713141441, + "learning_rate": 2.733556904641977e-05, + "loss": 0.0143, + "step": 42600 + }, + { + "epoch": 12.712496271995228, + "grad_norm": 0.00188263482414186, + "learning_rate": 2.7319000026510434e-05, + "loss": 0.0028, + "step": 42625 + }, + { + "epoch": 12.71995228153892, + "grad_norm": 0.020978985354304314, + "learning_rate": 2.7302431006601097e-05, + "loss": 0.0174, + "step": 42650 + }, + { + "epoch": 12.727408291082613, + "grad_norm": 0.00698693236336112, + "learning_rate": 2.7285861986691762e-05, + "loss": 0.0006, + "step": 42675 + }, + { + "epoch": 12.734864300626304, + "grad_norm": 0.019326740875840187, + "learning_rate": 2.7269292966782428e-05, + "loss": 0.0027, + "step": 42700 + }, + { + "epoch": 12.742320310169998, + "grad_norm": 0.03213045001029968, + "learning_rate": 2.7252723946873094e-05, + "loss": 0.0187, + "step": 42725 + }, + { + "epoch": 12.749776319713689, + "grad_norm": 0.0011091905180364847, + "learning_rate": 2.7236154926963763e-05, + "loss": 0.0002, + "step": 42750 + }, + { + "epoch": 12.75723232925738, + "grad_norm": 0.13166595995426178, + "learning_rate": 2.721958590705443e-05, + "loss": 0.0189, + "step": 42775 + }, + { + "epoch": 12.764688338801074, + "grad_norm": 0.015718284994363785, + "learning_rate": 2.7203016887145094e-05, + "loss": 0.0002, + "step": 42800 + }, + { + "epoch": 12.772144348344765, + "grad_norm": 0.01700720004737377, + "learning_rate": 2.718644786723576e-05, + "loss": 0.0119, + "step": 42825 + }, + { + "epoch": 12.779600357888459, + "grad_norm": 0.000691990542691201, + "learning_rate": 2.7169878847326426e-05, + "loss": 0.0585, + "step": 42850 + }, + { + "epoch": 12.78705636743215, + "grad_norm": 0.033096782863140106, + "learning_rate": 2.7153309827417088e-05, + "loss": 
0.0066, + "step": 42875 + }, + { + "epoch": 12.794512376975842, + "grad_norm": 0.0045728497207164764, + "learning_rate": 2.7136740807507754e-05, + "loss": 0.0022, + "step": 42900 + }, + { + "epoch": 12.801968386519535, + "grad_norm": 0.035219863057136536, + "learning_rate": 2.712017178759842e-05, + "loss": 0.0287, + "step": 42925 + }, + { + "epoch": 12.809424396063227, + "grad_norm": 0.04110410436987877, + "learning_rate": 2.7103602767689085e-05, + "loss": 0.0012, + "step": 42950 + }, + { + "epoch": 12.81688040560692, + "grad_norm": 0.0013123464304953814, + "learning_rate": 2.7087033747779755e-05, + "loss": 0.019, + "step": 42975 + }, + { + "epoch": 12.824336415150611, + "grad_norm": 0.005283961072564125, + "learning_rate": 2.707046472787042e-05, + "loss": 0.006, + "step": 43000 + }, + { + "epoch": 12.831792424694303, + "grad_norm": 0.003638888243585825, + "learning_rate": 2.7053895707961086e-05, + "loss": 0.0198, + "step": 43025 + }, + { + "epoch": 12.839248434237996, + "grad_norm": 0.10352325439453125, + "learning_rate": 2.7037326688051752e-05, + "loss": 0.0011, + "step": 43050 + }, + { + "epoch": 12.846704443781688, + "grad_norm": 8.774328231811523, + "learning_rate": 2.7020757668142414e-05, + "loss": 0.0223, + "step": 43075 + }, + { + "epoch": 12.854160453325381, + "grad_norm": 0.017438048496842384, + "learning_rate": 2.700418864823308e-05, + "loss": 0.0014, + "step": 43100 + }, + { + "epoch": 12.861616462869073, + "grad_norm": 0.0033278854098170996, + "learning_rate": 2.6987619628323746e-05, + "loss": 0.0031, + "step": 43125 + }, + { + "epoch": 12.869072472412764, + "grad_norm": 0.021386155858635902, + "learning_rate": 2.697105060841441e-05, + "loss": 0.0066, + "step": 43150 + }, + { + "epoch": 12.876528481956457, + "grad_norm": 0.005346431862562895, + "learning_rate": 2.6954481588505077e-05, + "loss": 0.0233, + "step": 43175 + }, + { + "epoch": 12.883984491500149, + "grad_norm": 0.18703578412532806, + "learning_rate": 2.6937912568595743e-05, + "loss": 0.0135, + "step": 43200 + }, + { + "epoch": 12.891440501043842, + "grad_norm": 0.0013108194107189775, + "learning_rate": 2.6921343548686412e-05, + "loss": 0.0136, + "step": 43225 + }, + { + "epoch": 12.898896510587534, + "grad_norm": 0.17484645545482635, + "learning_rate": 2.6904774528777078e-05, + "loss": 0.0017, + "step": 43250 + }, + { + "epoch": 12.906352520131225, + "grad_norm": 0.008680760860443115, + "learning_rate": 2.6888205508867743e-05, + "loss": 0.0115, + "step": 43275 + }, + { + "epoch": 12.913808529674919, + "grad_norm": 0.5126601457595825, + "learning_rate": 2.6871636488958406e-05, + "loss": 0.0152, + "step": 43300 + }, + { + "epoch": 12.92126453921861, + "grad_norm": 1.5607045888900757, + "learning_rate": 2.685506746904907e-05, + "loss": 0.025, + "step": 43325 + }, + { + "epoch": 12.928720548762302, + "grad_norm": 10.233118057250977, + "learning_rate": 2.6838498449139737e-05, + "loss": 0.0017, + "step": 43350 + }, + { + "epoch": 12.936176558305995, + "grad_norm": 0.0010243066353723407, + "learning_rate": 2.6821929429230403e-05, + "loss": 0.0085, + "step": 43375 + }, + { + "epoch": 12.943632567849686, + "grad_norm": 2.000396251678467, + "learning_rate": 2.680536040932107e-05, + "loss": 0.0107, + "step": 43400 + }, + { + "epoch": 12.95108857739338, + "grad_norm": 0.002072014380246401, + "learning_rate": 2.678879138941173e-05, + "loss": 0.0084, + "step": 43425 + }, + { + "epoch": 12.958544586937071, + "grad_norm": 0.01992828957736492, + "learning_rate": 2.6772222369502397e-05, + "loss": 0.027, + "step": 43450 + }, + { + 
"epoch": 12.966000596480763, + "grad_norm": 0.027412349358201027, + "learning_rate": 2.675565334959307e-05, + "loss": 0.001, + "step": 43475 + }, + { + "epoch": 12.973456606024456, + "grad_norm": 0.0024006732273846865, + "learning_rate": 2.6739084329683735e-05, + "loss": 0.0332, + "step": 43500 + }, + { + "epoch": 12.980912615568148, + "grad_norm": 0.004388559143990278, + "learning_rate": 2.6722515309774397e-05, + "loss": 0.005, + "step": 43525 + }, + { + "epoch": 12.98836862511184, + "grad_norm": 0.009956770576536655, + "learning_rate": 2.6705946289865063e-05, + "loss": 0.0473, + "step": 43550 + }, + { + "epoch": 12.995824634655532, + "grad_norm": 0.001230629743076861, + "learning_rate": 2.668937726995573e-05, + "loss": 0.027, + "step": 43575 + }, + { + "epoch": 13.0, + "eval_gen_len": 8.7484, + "eval_loss": 0.06554193049669266, + "eval_rouge1": 97.8785, + "eval_rouge2": 85.6962, + "eval_rougeL": 97.854, + "eval_rougeLsum": 97.8551, + "eval_runtime": 99.1128, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 4.238, + "step": 43589 + }, + { + "epoch": 13.003280644199224, + "grad_norm": 0.02172417752444744, + "learning_rate": 2.6672808250046394e-05, + "loss": 0.034, + "step": 43600 + }, + { + "epoch": 13.010736653742917, + "grad_norm": 0.01802818849682808, + "learning_rate": 2.665623923013706e-05, + "loss": 0.0239, + "step": 43625 + }, + { + "epoch": 13.018192663286609, + "grad_norm": 0.0032581316772848368, + "learning_rate": 2.6639670210227722e-05, + "loss": 0.0071, + "step": 43650 + }, + { + "epoch": 13.025648672830302, + "grad_norm": 36.717140197753906, + "learning_rate": 2.6623101190318388e-05, + "loss": 0.0701, + "step": 43675 + }, + { + "epoch": 13.033104682373994, + "grad_norm": 0.012505823746323586, + "learning_rate": 2.660653217040906e-05, + "loss": 0.013, + "step": 43700 + }, + { + "epoch": 13.040560691917685, + "grad_norm": 0.1828284114599228, + "learning_rate": 2.6589963150499726e-05, + "loss": 0.0014, + "step": 43725 + }, + { + "epoch": 13.048016701461378, + "grad_norm": 14.492470741271973, + "learning_rate": 2.657339413059039e-05, + "loss": 0.0455, + "step": 43750 + }, + { + "epoch": 13.05547271100507, + "grad_norm": 0.023192638531327248, + "learning_rate": 2.6556825110681055e-05, + "loss": 0.0421, + "step": 43775 + }, + { + "epoch": 13.062928720548761, + "grad_norm": 0.006542236544191837, + "learning_rate": 2.654025609077172e-05, + "loss": 0.0004, + "step": 43800 + }, + { + "epoch": 13.070384730092455, + "grad_norm": 0.001725667854771018, + "learning_rate": 2.6523687070862386e-05, + "loss": 0.0026, + "step": 43825 + }, + { + "epoch": 13.077840739636146, + "grad_norm": 0.01235037762671709, + "learning_rate": 2.6507118050953052e-05, + "loss": 0.003, + "step": 43850 + }, + { + "epoch": 13.08529674917984, + "grad_norm": 0.006246176082640886, + "learning_rate": 2.6490549031043714e-05, + "loss": 0.0019, + "step": 43875 + }, + { + "epoch": 13.092752758723531, + "grad_norm": 0.19377729296684265, + "learning_rate": 2.647398001113438e-05, + "loss": 0.0008, + "step": 43900 + }, + { + "epoch": 13.100208768267223, + "grad_norm": 0.010808723047375679, + "learning_rate": 2.6457410991225046e-05, + "loss": 0.0011, + "step": 43925 + }, + { + "epoch": 13.107664777810916, + "grad_norm": 0.10874485224485397, + "learning_rate": 2.6440841971315715e-05, + "loss": 0.0508, + "step": 43950 + }, + { + "epoch": 13.115120787354607, + "grad_norm": 4.295207977294922, + "learning_rate": 2.642427295140638e-05, + "loss": 0.0035, + "step": 43975 + }, + { + "epoch": 13.1225767968983, + "grad_norm": 
0.027464494109153748, + "learning_rate": 2.6407703931497046e-05, + "loss": 0.034, + "step": 44000 + }, + { + "epoch": 13.130032806441992, + "grad_norm": 0.007744842674583197, + "learning_rate": 2.6391134911587712e-05, + "loss": 0.0144, + "step": 44025 + }, + { + "epoch": 13.137488815985684, + "grad_norm": 0.026684366166591644, + "learning_rate": 2.6374565891678378e-05, + "loss": 0.0097, + "step": 44050 + }, + { + "epoch": 13.144944825529377, + "grad_norm": 0.00497918576002121, + "learning_rate": 2.6357996871769043e-05, + "loss": 0.024, + "step": 44075 + }, + { + "epoch": 13.152400835073069, + "grad_norm": 0.0011154951062053442, + "learning_rate": 2.6341427851859706e-05, + "loss": 0.0351, + "step": 44100 + }, + { + "epoch": 13.159856844616762, + "grad_norm": 0.07896313071250916, + "learning_rate": 2.632485883195037e-05, + "loss": 0.0005, + "step": 44125 + }, + { + "epoch": 13.167312854160453, + "grad_norm": 0.0008197020506486297, + "learning_rate": 2.6308289812041037e-05, + "loss": 0.0072, + "step": 44150 + }, + { + "epoch": 13.174768863704145, + "grad_norm": 0.005408796481788158, + "learning_rate": 2.6291720792131706e-05, + "loss": 0.0253, + "step": 44175 + }, + { + "epoch": 13.182224873247838, + "grad_norm": 0.04275033622980118, + "learning_rate": 2.6275151772222372e-05, + "loss": 0.0243, + "step": 44200 + }, + { + "epoch": 13.18968088279153, + "grad_norm": 0.009670349769294262, + "learning_rate": 2.6258582752313038e-05, + "loss": 0.0332, + "step": 44225 + }, + { + "epoch": 13.197136892335223, + "grad_norm": 0.000518015876878053, + "learning_rate": 2.6242013732403703e-05, + "loss": 0.0136, + "step": 44250 + }, + { + "epoch": 13.204592901878915, + "grad_norm": 0.03769877925515175, + "learning_rate": 2.622544471249437e-05, + "loss": 0.0135, + "step": 44275 + }, + { + "epoch": 13.212048911422606, + "grad_norm": 0.05490969493985176, + "learning_rate": 2.620887569258503e-05, + "loss": 0.0024, + "step": 44300 + }, + { + "epoch": 13.2195049209663, + "grad_norm": 0.86505126953125, + "learning_rate": 2.6192306672675697e-05, + "loss": 0.0024, + "step": 44325 + }, + { + "epoch": 13.22696093050999, + "grad_norm": 0.004475884605199099, + "learning_rate": 2.6175737652766363e-05, + "loss": 0.0296, + "step": 44350 + }, + { + "epoch": 13.234416940053682, + "grad_norm": 0.05450137332081795, + "learning_rate": 2.615916863285703e-05, + "loss": 0.0002, + "step": 44375 + }, + { + "epoch": 13.241872949597376, + "grad_norm": 0.139171302318573, + "learning_rate": 2.6142599612947694e-05, + "loss": 0.0006, + "step": 44400 + }, + { + "epoch": 13.249328959141067, + "grad_norm": 0.032572779804468155, + "learning_rate": 2.6126030593038364e-05, + "loss": 0.0002, + "step": 44425 + }, + { + "epoch": 13.25678496868476, + "grad_norm": 59.228363037109375, + "learning_rate": 2.610946157312903e-05, + "loss": 0.0387, + "step": 44450 + }, + { + "epoch": 13.264240978228452, + "grad_norm": 0.001431209035217762, + "learning_rate": 2.6092892553219695e-05, + "loss": 0.0001, + "step": 44475 + }, + { + "epoch": 13.271696987772144, + "grad_norm": 0.5849717855453491, + "learning_rate": 2.607632353331036e-05, + "loss": 0.005, + "step": 44500 + }, + { + "epoch": 13.279152997315837, + "grad_norm": 0.007736376952379942, + "learning_rate": 2.6059754513401023e-05, + "loss": 0.0264, + "step": 44525 + }, + { + "epoch": 13.286609006859528, + "grad_norm": 0.00018958588771056384, + "learning_rate": 2.604318549349169e-05, + "loss": 0.0002, + "step": 44550 + }, + { + "epoch": 13.294065016403222, + "grad_norm": 52.449249267578125, + "learning_rate": 
2.6026616473582355e-05, + "loss": 0.0724, + "step": 44575 + }, + { + "epoch": 13.301521025946913, + "grad_norm": 0.0037214909680187702, + "learning_rate": 2.601004745367302e-05, + "loss": 0.0003, + "step": 44600 + }, + { + "epoch": 13.308977035490605, + "grad_norm": 0.009356235153973103, + "learning_rate": 2.5993478433763686e-05, + "loss": 0.0005, + "step": 44625 + }, + { + "epoch": 13.316433045034298, + "grad_norm": 8.970348358154297, + "learning_rate": 2.5976909413854355e-05, + "loss": 0.0087, + "step": 44650 + }, + { + "epoch": 13.32388905457799, + "grad_norm": 0.0007844800129532814, + "learning_rate": 2.596034039394502e-05, + "loss": 0.0002, + "step": 44675 + }, + { + "epoch": 13.331345064121683, + "grad_norm": 0.0024908827617764473, + "learning_rate": 2.5943771374035687e-05, + "loss": 0.0006, + "step": 44700 + }, + { + "epoch": 13.338801073665374, + "grad_norm": 0.004964989144355059, + "learning_rate": 2.5927202354126352e-05, + "loss": 0.0028, + "step": 44725 + }, + { + "epoch": 13.346257083209066, + "grad_norm": 1.3698699474334717, + "learning_rate": 2.5910633334217015e-05, + "loss": 0.0058, + "step": 44750 + }, + { + "epoch": 13.35371309275276, + "grad_norm": 0.04636652022600174, + "learning_rate": 2.589406431430768e-05, + "loss": 0.0266, + "step": 44775 + }, + { + "epoch": 13.36116910229645, + "grad_norm": 0.04417644068598747, + "learning_rate": 2.5877495294398346e-05, + "loss": 0.0054, + "step": 44800 + }, + { + "epoch": 13.368625111840144, + "grad_norm": 0.0034272114280611277, + "learning_rate": 2.5860926274489012e-05, + "loss": 0.0002, + "step": 44825 + }, + { + "epoch": 13.376081121383836, + "grad_norm": 0.04019004851579666, + "learning_rate": 2.5845020015376052e-05, + "loss": 0.0306, + "step": 44850 + }, + { + "epoch": 13.383537130927527, + "grad_norm": 3.370781898498535, + "learning_rate": 2.5828450995466718e-05, + "loss": 0.0017, + "step": 44875 + }, + { + "epoch": 13.39099314047122, + "grad_norm": 0.15892678499221802, + "learning_rate": 2.5811881975557384e-05, + "loss": 0.0011, + "step": 44900 + }, + { + "epoch": 13.398449150014912, + "grad_norm": 0.0011944427387788892, + "learning_rate": 2.579531295564805e-05, + "loss": 0.0188, + "step": 44925 + }, + { + "epoch": 13.405905159558603, + "grad_norm": 0.023739833384752274, + "learning_rate": 2.5778743935738715e-05, + "loss": 0.0029, + "step": 44950 + }, + { + "epoch": 13.413361169102297, + "grad_norm": 0.002228564117103815, + "learning_rate": 2.5762174915829378e-05, + "loss": 0.0002, + "step": 44975 + }, + { + "epoch": 13.420817178645988, + "grad_norm": 0.007577619515359402, + "learning_rate": 2.5745605895920043e-05, + "loss": 0.0058, + "step": 45000 + }, + { + "epoch": 13.428273188189682, + "grad_norm": 0.00293240649625659, + "learning_rate": 2.572903687601071e-05, + "loss": 0.0185, + "step": 45025 + }, + { + "epoch": 13.435729197733373, + "grad_norm": 0.3604615330696106, + "learning_rate": 2.571246785610138e-05, + "loss": 0.0478, + "step": 45050 + }, + { + "epoch": 13.443185207277065, + "grad_norm": 0.005937446840107441, + "learning_rate": 2.5695898836192044e-05, + "loss": 0.0035, + "step": 45075 + }, + { + "epoch": 13.450641216820758, + "grad_norm": 0.003275347175076604, + "learning_rate": 2.567932981628271e-05, + "loss": 0.0061, + "step": 45100 + }, + { + "epoch": 13.45809722636445, + "grad_norm": 0.003746124915778637, + "learning_rate": 2.5662760796373375e-05, + "loss": 0.0029, + "step": 45125 + }, + { + "epoch": 13.465553235908143, + "grad_norm": 12.623387336730957, + "learning_rate": 2.564619177646404e-05, + "loss": 
0.0121, + "step": 45150 + }, + { + "epoch": 13.473009245451834, + "grad_norm": 0.2839594781398773, + "learning_rate": 2.5629622756554707e-05, + "loss": 0.0013, + "step": 45175 + }, + { + "epoch": 13.480465254995526, + "grad_norm": 6.323868751525879, + "learning_rate": 2.561305373664537e-05, + "loss": 0.01, + "step": 45200 + }, + { + "epoch": 13.487921264539219, + "grad_norm": 0.0072549269534647465, + "learning_rate": 2.5596484716736035e-05, + "loss": 0.002, + "step": 45225 + }, + { + "epoch": 13.49537727408291, + "grad_norm": 0.01581634394824505, + "learning_rate": 2.55799156968267e-05, + "loss": 0.0002, + "step": 45250 + }, + { + "epoch": 13.502833283626604, + "grad_norm": 0.013846264220774174, + "learning_rate": 2.5563346676917366e-05, + "loss": 0.0001, + "step": 45275 + }, + { + "epoch": 13.510289293170295, + "grad_norm": 0.004495398607105017, + "learning_rate": 2.5546777657008035e-05, + "loss": 0.001, + "step": 45300 + }, + { + "epoch": 13.517745302713987, + "grad_norm": 0.029844263568520546, + "learning_rate": 2.55302086370987e-05, + "loss": 0.0029, + "step": 45325 + }, + { + "epoch": 13.52520131225768, + "grad_norm": 0.002655371557921171, + "learning_rate": 2.5513639617189367e-05, + "loss": 0.0001, + "step": 45350 + }, + { + "epoch": 13.532657321801372, + "grad_norm": 0.0574614480137825, + "learning_rate": 2.5497070597280033e-05, + "loss": 0.0117, + "step": 45375 + }, + { + "epoch": 13.540113331345065, + "grad_norm": 0.0013383477926254272, + "learning_rate": 2.54805015773707e-05, + "loss": 0.0002, + "step": 45400 + }, + { + "epoch": 13.547569340888757, + "grad_norm": 0.061951614916324615, + "learning_rate": 2.546393255746136e-05, + "loss": 0.001, + "step": 45425 + }, + { + "epoch": 13.555025350432448, + "grad_norm": 19.441818237304688, + "learning_rate": 2.5447363537552026e-05, + "loss": 0.0544, + "step": 45450 + }, + { + "epoch": 13.562481359976141, + "grad_norm": 0.0009547994122840464, + "learning_rate": 2.5430794517642692e-05, + "loss": 0.0055, + "step": 45475 + }, + { + "epoch": 13.569937369519833, + "grad_norm": 8.116317749023438, + "learning_rate": 2.5414225497733358e-05, + "loss": 0.004, + "step": 45500 + }, + { + "epoch": 13.577393379063524, + "grad_norm": 0.016299117356538773, + "learning_rate": 2.5397656477824024e-05, + "loss": 0.0001, + "step": 45525 + }, + { + "epoch": 13.584849388607218, + "grad_norm": 0.10050185024738312, + "learning_rate": 2.5381087457914693e-05, + "loss": 0.0019, + "step": 45550 + }, + { + "epoch": 13.59230539815091, + "grad_norm": 0.0007255134987644851, + "learning_rate": 2.536451843800536e-05, + "loss": 0.0002, + "step": 45575 + }, + { + "epoch": 13.599761407694603, + "grad_norm": 0.017564982175827026, + "learning_rate": 2.5347949418096024e-05, + "loss": 0.0096, + "step": 45600 + }, + { + "epoch": 13.607217417238294, + "grad_norm": 0.002698665950447321, + "learning_rate": 2.5331380398186687e-05, + "loss": 0.0121, + "step": 45625 + }, + { + "epoch": 13.614673426781986, + "grad_norm": 0.03711654245853424, + "learning_rate": 2.5314811378277352e-05, + "loss": 0.0072, + "step": 45650 + }, + { + "epoch": 13.622129436325679, + "grad_norm": 0.015306944027543068, + "learning_rate": 2.5298242358368018e-05, + "loss": 0.0275, + "step": 45675 + }, + { + "epoch": 13.62958544586937, + "grad_norm": 0.04432107135653496, + "learning_rate": 2.5281673338458684e-05, + "loss": 0.0008, + "step": 45700 + }, + { + "epoch": 13.637041455413064, + "grad_norm": 0.013447861187160015, + "learning_rate": 2.526510431854935e-05, + "loss": 0.0009, + "step": 45725 + }, + { + "epoch": 
13.644497464956755, + "grad_norm": 0.012608986347913742, + "learning_rate": 2.5248535298640015e-05, + "loss": 0.0013, + "step": 45750 + }, + { + "epoch": 13.651953474500447, + "grad_norm": 0.0007090241997502744, + "learning_rate": 2.5231966278730684e-05, + "loss": 0.0058, + "step": 45775 + }, + { + "epoch": 13.65940948404414, + "grad_norm": 0.08077728748321533, + "learning_rate": 2.521539725882135e-05, + "loss": 0.0259, + "step": 45800 + }, + { + "epoch": 13.666865493587832, + "grad_norm": 1.085524082183838, + "learning_rate": 2.5198828238912016e-05, + "loss": 0.0181, + "step": 45825 + }, + { + "epoch": 13.674321503131525, + "grad_norm": 0.0005606280756182969, + "learning_rate": 2.5182259219002678e-05, + "loss": 0.0004, + "step": 45850 + }, + { + "epoch": 13.681777512675216, + "grad_norm": 0.004463412798941135, + "learning_rate": 2.5165690199093344e-05, + "loss": 0.0014, + "step": 45875 + }, + { + "epoch": 13.689233522218908, + "grad_norm": 0.15161412954330444, + "learning_rate": 2.514912117918401e-05, + "loss": 0.0128, + "step": 45900 + }, + { + "epoch": 13.696689531762601, + "grad_norm": 0.009529800154268742, + "learning_rate": 2.5132552159274675e-05, + "loss": 0.0178, + "step": 45925 + }, + { + "epoch": 13.704145541306293, + "grad_norm": 4.095335006713867, + "learning_rate": 2.511598313936534e-05, + "loss": 0.011, + "step": 45950 + }, + { + "epoch": 13.711601550849984, + "grad_norm": 0.05283172428607941, + "learning_rate": 2.5099414119456003e-05, + "loss": 0.0004, + "step": 45975 + }, + { + "epoch": 13.719057560393678, + "grad_norm": 0.012699637562036514, + "learning_rate": 2.508284509954667e-05, + "loss": 0.0002, + "step": 46000 + }, + { + "epoch": 13.726513569937369, + "grad_norm": 0.9599093794822693, + "learning_rate": 2.506627607963734e-05, + "loss": 0.0346, + "step": 46025 + }, + { + "epoch": 13.733969579481062, + "grad_norm": 0.0005534732481464744, + "learning_rate": 2.5049707059728007e-05, + "loss": 0.0164, + "step": 46050 + }, + { + "epoch": 13.741425589024754, + "grad_norm": 0.013052860274910927, + "learning_rate": 2.503313803981867e-05, + "loss": 0.005, + "step": 46075 + }, + { + "epoch": 13.748881598568445, + "grad_norm": 0.0030805133283138275, + "learning_rate": 2.5016569019909335e-05, + "loss": 0.0594, + "step": 46100 + }, + { + "epoch": 13.756337608112139, + "grad_norm": 0.010317280888557434, + "learning_rate": 2.5e-05, + "loss": 0.0076, + "step": 46125 + }, + { + "epoch": 13.76379361765583, + "grad_norm": 0.0013628338929265738, + "learning_rate": 2.4983430980090667e-05, + "loss": 0.0014, + "step": 46150 + }, + { + "epoch": 13.771249627199524, + "grad_norm": 0.0019057797035202384, + "learning_rate": 2.4966861960181333e-05, + "loss": 0.0003, + "step": 46175 + }, + { + "epoch": 13.778705636743215, + "grad_norm": 0.10203558206558228, + "learning_rate": 2.4950292940272e-05, + "loss": 0.0003, + "step": 46200 + }, + { + "epoch": 13.786161646286907, + "grad_norm": 0.007893604226410389, + "learning_rate": 2.4933723920362664e-05, + "loss": 0.0139, + "step": 46225 + }, + { + "epoch": 13.7936176558306, + "grad_norm": 0.04614598676562309, + "learning_rate": 2.491715490045333e-05, + "loss": 0.0022, + "step": 46250 + }, + { + "epoch": 13.801073665374291, + "grad_norm": 0.010470214299857616, + "learning_rate": 2.4900585880543996e-05, + "loss": 0.0021, + "step": 46275 + }, + { + "epoch": 13.808529674917985, + "grad_norm": 0.001019405317492783, + "learning_rate": 2.488401686063466e-05, + "loss": 0.0131, + "step": 46300 + }, + { + "epoch": 13.815985684461676, + "grad_norm": 
0.01035932544618845, + "learning_rate": 2.4867447840725327e-05, + "loss": 0.0069, + "step": 46325 + }, + { + "epoch": 13.823441694005368, + "grad_norm": 0.004152575973421335, + "learning_rate": 2.4850878820815993e-05, + "loss": 0.0263, + "step": 46350 + }, + { + "epoch": 13.830897703549061, + "grad_norm": 0.00521685928106308, + "learning_rate": 2.483430980090666e-05, + "loss": 0.0389, + "step": 46375 + }, + { + "epoch": 13.838353713092753, + "grad_norm": 0.008577450178563595, + "learning_rate": 2.4817740780997324e-05, + "loss": 0.0006, + "step": 46400 + }, + { + "epoch": 13.845809722636446, + "grad_norm": 0.004177641589194536, + "learning_rate": 2.480117176108799e-05, + "loss": 0.0046, + "step": 46425 + }, + { + "epoch": 13.853265732180137, + "grad_norm": 0.0356358103454113, + "learning_rate": 2.4784602741178656e-05, + "loss": 0.0005, + "step": 46450 + }, + { + "epoch": 13.860721741723829, + "grad_norm": 1.355373740196228, + "learning_rate": 2.476803372126932e-05, + "loss": 0.0133, + "step": 46475 + }, + { + "epoch": 13.868177751267522, + "grad_norm": 24.991962432861328, + "learning_rate": 2.4751464701359987e-05, + "loss": 0.0436, + "step": 46500 + }, + { + "epoch": 13.875633760811214, + "grad_norm": 0.007507434580475092, + "learning_rate": 2.473489568145065e-05, + "loss": 0.0004, + "step": 46525 + }, + { + "epoch": 13.883089770354907, + "grad_norm": 29.0565242767334, + "learning_rate": 2.471832666154132e-05, + "loss": 0.0673, + "step": 46550 + }, + { + "epoch": 13.890545779898599, + "grad_norm": 0.00047484287642873824, + "learning_rate": 2.4701757641631984e-05, + "loss": 0.0258, + "step": 46575 + }, + { + "epoch": 13.89800178944229, + "grad_norm": 0.0007500798092223704, + "learning_rate": 2.468518862172265e-05, + "loss": 0.0204, + "step": 46600 + }, + { + "epoch": 13.905457798985983, + "grad_norm": 0.009910466149449348, + "learning_rate": 2.4668619601813316e-05, + "loss": 0.004, + "step": 46625 + }, + { + "epoch": 13.912913808529675, + "grad_norm": 0.03574687987565994, + "learning_rate": 2.4652050581903978e-05, + "loss": 0.0359, + "step": 46650 + }, + { + "epoch": 13.920369818073366, + "grad_norm": 0.004774956498295069, + "learning_rate": 2.4635481561994647e-05, + "loss": 0.0013, + "step": 46675 + }, + { + "epoch": 13.92782582761706, + "grad_norm": 0.005982845090329647, + "learning_rate": 2.4618912542085313e-05, + "loss": 0.0005, + "step": 46700 + }, + { + "epoch": 13.935281837160751, + "grad_norm": 0.23766636848449707, + "learning_rate": 2.460234352217598e-05, + "loss": 0.0157, + "step": 46725 + }, + { + "epoch": 13.942737846704444, + "grad_norm": 0.026977479457855225, + "learning_rate": 2.458577450226664e-05, + "loss": 0.0192, + "step": 46750 + }, + { + "epoch": 13.950193856248136, + "grad_norm": 0.009664705954492092, + "learning_rate": 2.4569205482357307e-05, + "loss": 0.0485, + "step": 46775 + }, + { + "epoch": 13.957649865791828, + "grad_norm": 0.025634411722421646, + "learning_rate": 2.4552636462447976e-05, + "loss": 0.0291, + "step": 46800 + }, + { + "epoch": 13.96510587533552, + "grad_norm": 2.642550468444824, + "learning_rate": 2.453606744253864e-05, + "loss": 0.0156, + "step": 46825 + }, + { + "epoch": 13.972561884879212, + "grad_norm": 0.004317351151257753, + "learning_rate": 2.4519498422629304e-05, + "loss": 0.0001, + "step": 46850 + }, + { + "epoch": 13.980017894422906, + "grad_norm": 0.19009476900100708, + "learning_rate": 2.450292940271997e-05, + "loss": 0.0559, + "step": 46875 + }, + { + "epoch": 13.987473903966597, + "grad_norm": 0.9585381150245667, + "learning_rate": 
2.4486360382810635e-05, + "loss": 0.0051, + "step": 46900 + }, + { + "epoch": 13.994929913510289, + "grad_norm": 0.01820201426744461, + "learning_rate": 2.4469791362901305e-05, + "loss": 0.0011, + "step": 46925 + }, + { + "epoch": 14.0, + "eval_gen_len": 8.7346, + "eval_loss": 0.06252285093069077, + "eval_rouge1": 97.9816, + "eval_rouge2": 85.7167, + "eval_rougeL": 97.9566, + "eval_rougeLsum": 97.9606, + "eval_runtime": 97.0515, + "eval_samples_per_second": 17.279, + "eval_steps_per_second": 4.328, + "step": 46942 + } + ], + "logging_steps": 25, + "max_steps": 83825, + "num_input_tokens_seen": 0, + "num_train_epochs": 25, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 232890506477568.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}