{ "best_metric": 0.06252285093069077, "best_model_checkpoint": "autotrain-yz7wm-laa5q/checkpoint-46942", "epoch": 14.0, "eval_steps": 500, "global_step": 46942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007456009543692216, "grad_norm": 45.01611328125, "learning_rate": 1.252534892043421e-07, "loss": 4.1595, "step": 25 }, { "epoch": 0.014912019087384432, "grad_norm": 71.49336242675781, "learning_rate": 2.74364785876178e-07, "loss": 2.8452, "step": 50 }, { "epoch": 0.022368028631076647, "grad_norm": 73.92076110839844, "learning_rate": 4.234760825480138e-07, "loss": 3.1876, "step": 75 }, { "epoch": 0.029824038174768863, "grad_norm": 35.54814529418945, "learning_rate": 5.725873792198497e-07, "loss": 3.1838, "step": 100 }, { "epoch": 0.03728004771846108, "grad_norm": 45.263160705566406, "learning_rate": 7.157342240248122e-07, "loss": 3.8779, "step": 125 }, { "epoch": 0.044736057262153295, "grad_norm": 57.34968566894531, "learning_rate": 8.64845520696648e-07, "loss": 3.3267, "step": 150 }, { "epoch": 0.05219206680584551, "grad_norm": 54.16720199584961, "learning_rate": 1.013956817368484e-06, "loss": 2.8099, "step": 175 }, { "epoch": 0.059648076349537726, "grad_norm": 66.92790222167969, "learning_rate": 1.1571036621734464e-06, "loss": 3.0295, "step": 200 }, { "epoch": 0.06710408589322994, "grad_norm": 49.223506927490234, "learning_rate": 1.3062149588452823e-06, "loss": 3.5417, "step": 225 }, { "epoch": 0.07456009543692216, "grad_norm": 28.979656219482422, "learning_rate": 1.455326255517118e-06, "loss": 2.8779, "step": 250 }, { "epoch": 0.08201610498061437, "grad_norm": 58.497398376464844, "learning_rate": 1.6044375521889539e-06, "loss": 3.8402, "step": 275 }, { "epoch": 0.08947211452430659, "grad_norm": 39.97136306762695, "learning_rate": 1.7535488488607896e-06, "loss": 2.8882, "step": 300 }, { "epoch": 0.0969281240679988, "grad_norm": 59.521690368652344, "learning_rate": 
1.8966956936657524e-06, "loss": 3.3457, "step": 325 }, { "epoch": 0.10438413361169102, "grad_norm": 46.65399932861328, "learning_rate": 2.045806990337588e-06, "loss": 3.2512, "step": 350 }, { "epoch": 0.11184014315538324, "grad_norm": 72.7174072265625, "learning_rate": 2.194918287009424e-06, "loss": 3.4201, "step": 375 }, { "epoch": 0.11929615269907545, "grad_norm": 42.24202346801758, "learning_rate": 2.34402958368126e-06, "loss": 2.9563, "step": 400 }, { "epoch": 0.12675216224276767, "grad_norm": 41.53296661376953, "learning_rate": 2.493140880353096e-06, "loss": 3.2227, "step": 425 }, { "epoch": 0.13420817178645988, "grad_norm": 39.92971420288086, "learning_rate": 2.6422521770249313e-06, "loss": 2.9322, "step": 450 }, { "epoch": 0.1416641813301521, "grad_norm": 56.5375862121582, "learning_rate": 2.7913634736967676e-06, "loss": 3.1309, "step": 475 }, { "epoch": 0.14912019087384432, "grad_norm": 50.100341796875, "learning_rate": 2.940474770368603e-06, "loss": 3.28, "step": 500 }, { "epoch": 0.15657620041753653, "grad_norm": 45.758445739746094, "learning_rate": 3.089586067040439e-06, "loss": 2.9504, "step": 525 }, { "epoch": 0.16403220996122875, "grad_norm": 57.31959915161133, "learning_rate": 3.238697363712275e-06, "loss": 3.2988, "step": 550 }, { "epoch": 0.17148821950492096, "grad_norm": 79.53318786621094, "learning_rate": 3.3878086603841104e-06, "loss": 2.6523, "step": 575 }, { "epoch": 0.17894422904861318, "grad_norm": 60.8985481262207, "learning_rate": 3.5369199570559468e-06, "loss": 2.7978, "step": 600 }, { "epoch": 0.1864002385923054, "grad_norm": 40.064605712890625, "learning_rate": 3.6860312537277827e-06, "loss": 2.6045, "step": 625 }, { "epoch": 0.1938562481359976, "grad_norm": 40.542083740234375, "learning_rate": 3.835142550399618e-06, "loss": 2.7585, "step": 650 }, { "epoch": 0.20131225767968983, "grad_norm": 18.070505142211914, "learning_rate": 3.984253847071454e-06, "loss": 2.274, "step": 675 }, { "epoch": 0.20876826722338204, "grad_norm": 
41.26124572753906, "learning_rate": 4.1333651437432904e-06, "loss": 2.6105, "step": 700 }, { "epoch": 0.21622427676707426, "grad_norm": 49.613983154296875, "learning_rate": 4.282476440415126e-06, "loss": 2.2197, "step": 725 }, { "epoch": 0.22368028631076647, "grad_norm": 31.069345474243164, "learning_rate": 4.431587737086961e-06, "loss": 2.4944, "step": 750 }, { "epoch": 0.2311362958544587, "grad_norm": 47.66035461425781, "learning_rate": 4.580699033758798e-06, "loss": 2.5574, "step": 775 }, { "epoch": 0.2385923053981509, "grad_norm": 46.78519058227539, "learning_rate": 4.729810330430633e-06, "loss": 2.6472, "step": 800 }, { "epoch": 0.24604831494184312, "grad_norm": 45.38592529296875, "learning_rate": 4.87892162710247e-06, "loss": 2.8814, "step": 825 }, { "epoch": 0.25350432448553534, "grad_norm": 40.204872131347656, "learning_rate": 5.028032923774306e-06, "loss": 2.4151, "step": 850 }, { "epoch": 0.2609603340292276, "grad_norm": 18.755455017089844, "learning_rate": 5.1771442204461406e-06, "loss": 2.8743, "step": 875 }, { "epoch": 0.26841634357291977, "grad_norm": 45.9006233215332, "learning_rate": 5.326255517117977e-06, "loss": 2.646, "step": 900 }, { "epoch": 0.275872353116612, "grad_norm": 41.10917282104492, "learning_rate": 5.475366813789813e-06, "loss": 2.3558, "step": 925 }, { "epoch": 0.2833283626603042, "grad_norm": 17.992292404174805, "learning_rate": 5.624478110461649e-06, "loss": 2.4183, "step": 950 }, { "epoch": 0.29078437220399644, "grad_norm": 40.263668060302734, "learning_rate": 5.773589407133484e-06, "loss": 2.4558, "step": 975 }, { "epoch": 0.29824038174768863, "grad_norm": 45.932186126708984, "learning_rate": 5.9227007038053206e-06, "loss": 2.1838, "step": 1000 }, { "epoch": 0.3056963912913809, "grad_norm": 80.81851196289062, "learning_rate": 6.071812000477156e-06, "loss": 2.2045, "step": 1025 }, { "epoch": 0.31315240083507306, "grad_norm": 5.3733954429626465, "learning_rate": 6.220923297148992e-06, "loss": 2.0902, "step": 1050 }, { "epoch": 
0.3206084103787653, "grad_norm": 44.87431716918945, "learning_rate": 6.370034593820829e-06, "loss": 2.373, "step": 1075 }, { "epoch": 0.3280644199224575, "grad_norm": 42.271202087402344, "learning_rate": 6.519145890492664e-06, "loss": 2.5756, "step": 1100 }, { "epoch": 0.33552042946614974, "grad_norm": 67.8478775024414, "learning_rate": 6.6682571871645006e-06, "loss": 2.4065, "step": 1125 }, { "epoch": 0.3429764390098419, "grad_norm": 49.30979919433594, "learning_rate": 6.817368483836335e-06, "loss": 2.4142, "step": 1150 }, { "epoch": 0.35043244855353417, "grad_norm": 43.35612869262695, "learning_rate": 6.9664797805081715e-06, "loss": 2.4022, "step": 1175 }, { "epoch": 0.35788845809722636, "grad_norm": 61.253299713134766, "learning_rate": 7.115591077180007e-06, "loss": 2.6125, "step": 1200 }, { "epoch": 0.3653444676409186, "grad_norm": 23.769813537597656, "learning_rate": 7.264702373851843e-06, "loss": 2.5133, "step": 1225 }, { "epoch": 0.3728004771846108, "grad_norm": 27.436845779418945, "learning_rate": 7.41381367052368e-06, "loss": 1.711, "step": 1250 }, { "epoch": 0.38025648672830303, "grad_norm": 12.745654106140137, "learning_rate": 7.562924967195516e-06, "loss": 2.2353, "step": 1275 }, { "epoch": 0.3877124962719952, "grad_norm": 53.59686279296875, "learning_rate": 7.71203626386735e-06, "loss": 2.3294, "step": 1300 }, { "epoch": 0.39516850581568747, "grad_norm": 29.58496856689453, "learning_rate": 7.861147560539186e-06, "loss": 2.5862, "step": 1325 }, { "epoch": 0.40262451535937965, "grad_norm": 50.43803405761719, "learning_rate": 8.010258857211023e-06, "loss": 2.1057, "step": 1350 }, { "epoch": 0.4100805249030719, "grad_norm": 41.37960433959961, "learning_rate": 8.159370153882859e-06, "loss": 2.0668, "step": 1375 }, { "epoch": 0.4175365344467641, "grad_norm": 40.834102630615234, "learning_rate": 8.308481450554695e-06, "loss": 2.4631, "step": 1400 }, { "epoch": 0.42499254399045633, "grad_norm": 21.868328094482422, "learning_rate": 8.457592747226532e-06, 
"loss": 2.3086, "step": 1425 }, { "epoch": 0.4324485535341485, "grad_norm": 27.166046142578125, "learning_rate": 8.606704043898366e-06, "loss": 2.2048, "step": 1450 }, { "epoch": 0.43990456307784076, "grad_norm": 35.7780876159668, "learning_rate": 8.755815340570203e-06, "loss": 2.2996, "step": 1475 }, { "epoch": 0.44736057262153295, "grad_norm": 23.72397804260254, "learning_rate": 8.904926637242037e-06, "loss": 2.2948, "step": 1500 }, { "epoch": 0.4548165821652252, "grad_norm": 31.295326232910156, "learning_rate": 9.054037933913873e-06, "loss": 1.6803, "step": 1525 }, { "epoch": 0.4622725917089174, "grad_norm": 32.05657196044922, "learning_rate": 9.20314923058571e-06, "loss": 2.1551, "step": 1550 }, { "epoch": 0.4697286012526096, "grad_norm": 12.753507614135742, "learning_rate": 9.352260527257546e-06, "loss": 2.177, "step": 1575 }, { "epoch": 0.4771846107963018, "grad_norm": 18.606224060058594, "learning_rate": 9.50137182392938e-06, "loss": 1.781, "step": 1600 }, { "epoch": 0.48464062033999405, "grad_norm": 14.352407455444336, "learning_rate": 9.650483120601217e-06, "loss": 1.8743, "step": 1625 }, { "epoch": 0.49209662988368624, "grad_norm": 41.37252426147461, "learning_rate": 9.799594417273053e-06, "loss": 1.8291, "step": 1650 }, { "epoch": 0.4995526394273785, "grad_norm": 34.19483184814453, "learning_rate": 9.94870571394489e-06, "loss": 1.7963, "step": 1675 }, { "epoch": 0.5070086489710707, "grad_norm": 32.18720626831055, "learning_rate": 1.0097817010616724e-05, "loss": 1.4178, "step": 1700 }, { "epoch": 0.5144646585147629, "grad_norm": 17.12651824951172, "learning_rate": 1.024692830728856e-05, "loss": 1.6896, "step": 1725 }, { "epoch": 0.5219206680584552, "grad_norm": 32.80767059326172, "learning_rate": 1.0396039603960395e-05, "loss": 1.6759, "step": 1750 }, { "epoch": 0.5293766776021473, "grad_norm": 62.32932662963867, "learning_rate": 1.0545150900632232e-05, "loss": 1.7309, "step": 1775 }, { "epoch": 0.5368326871458395, "grad_norm": 55.90302276611328, 
"learning_rate": 1.0694262197304068e-05, "loss": 1.8298, "step": 1800 }, { "epoch": 0.5442886966895317, "grad_norm": 22.71988868713379, "learning_rate": 1.0843373493975904e-05, "loss": 1.6003, "step": 1825 }, { "epoch": 0.551744706233224, "grad_norm": 18.055763244628906, "learning_rate": 1.099248479064774e-05, "loss": 1.5038, "step": 1850 }, { "epoch": 0.5592007157769162, "grad_norm": 36.303192138671875, "learning_rate": 1.1141596087319577e-05, "loss": 1.5591, "step": 1875 }, { "epoch": 0.5666567253206084, "grad_norm": 4.943870544433594, "learning_rate": 1.1290707383991412e-05, "loss": 1.7191, "step": 1900 }, { "epoch": 0.5741127348643006, "grad_norm": 34.8875846862793, "learning_rate": 1.1439818680663248e-05, "loss": 1.4872, "step": 1925 }, { "epoch": 0.5815687444079929, "grad_norm": 104.60694885253906, "learning_rate": 1.1588929977335083e-05, "loss": 2.0672, "step": 1950 }, { "epoch": 0.5890247539516851, "grad_norm": 8.631030082702637, "learning_rate": 1.1738041274006919e-05, "loss": 1.5613, "step": 1975 }, { "epoch": 0.5964807634953773, "grad_norm": 25.058610916137695, "learning_rate": 1.1887152570678755e-05, "loss": 1.2022, "step": 2000 }, { "epoch": 0.6039367730390695, "grad_norm": 22.961284637451172, "learning_rate": 1.2036263867350592e-05, "loss": 1.6524, "step": 2025 }, { "epoch": 0.6113927825827618, "grad_norm": 21.290424346923828, "learning_rate": 1.2185375164022426e-05, "loss": 1.6892, "step": 2050 }, { "epoch": 0.6188487921264539, "grad_norm": 4.9671406745910645, "learning_rate": 1.2334486460694263e-05, "loss": 1.2334, "step": 2075 }, { "epoch": 0.6263048016701461, "grad_norm": 13.194502830505371, "learning_rate": 1.2483597757366099e-05, "loss": 1.3793, "step": 2100 }, { "epoch": 0.6337608112138383, "grad_norm": 51.21866989135742, "learning_rate": 1.2632709054037934e-05, "loss": 1.6602, "step": 2125 }, { "epoch": 0.6412168207575306, "grad_norm": 38.84183883666992, "learning_rate": 1.278182035070977e-05, "loss": 1.5312, "step": 2150 }, { "epoch": 
0.6486728303012228, "grad_norm": 9.87134838104248, "learning_rate": 1.2930931647381605e-05, "loss": 1.6768, "step": 2175 }, { "epoch": 0.656128839844915, "grad_norm": 30.30939292907715, "learning_rate": 1.3080042944053441e-05, "loss": 1.2343, "step": 2200 }, { "epoch": 0.6635848493886072, "grad_norm": 64.63691711425781, "learning_rate": 1.3229154240725277e-05, "loss": 1.4027, "step": 2225 }, { "epoch": 0.6710408589322995, "grad_norm": 29.965591430664062, "learning_rate": 1.3378265537397114e-05, "loss": 1.1293, "step": 2250 }, { "epoch": 0.6784968684759917, "grad_norm": 44.84248733520508, "learning_rate": 1.352737683406895e-05, "loss": 1.6333, "step": 2275 }, { "epoch": 0.6859528780196839, "grad_norm": 13.158123016357422, "learning_rate": 1.3676488130740786e-05, "loss": 1.5718, "step": 2300 }, { "epoch": 0.693408887563376, "grad_norm": 50.25960922241211, "learning_rate": 1.3825599427412623e-05, "loss": 1.5637, "step": 2325 }, { "epoch": 0.7008648971070683, "grad_norm": 33.488555908203125, "learning_rate": 1.3974710724084459e-05, "loss": 1.3898, "step": 2350 }, { "epoch": 0.7083209066507605, "grad_norm": 43.9734001159668, "learning_rate": 1.4123822020756294e-05, "loss": 1.4057, "step": 2375 }, { "epoch": 0.7157769161944527, "grad_norm": 50.97043228149414, "learning_rate": 1.4272933317428128e-05, "loss": 1.6272, "step": 2400 }, { "epoch": 0.7232329257381449, "grad_norm": 25.401338577270508, "learning_rate": 1.4422044614099963e-05, "loss": 0.9952, "step": 2425 }, { "epoch": 0.7306889352818372, "grad_norm": 6.340377330780029, "learning_rate": 1.45711559107718e-05, "loss": 1.0306, "step": 2450 }, { "epoch": 0.7381449448255294, "grad_norm": 23.072500228881836, "learning_rate": 1.4720267207443636e-05, "loss": 0.7644, "step": 2475 }, { "epoch": 0.7456009543692216, "grad_norm": 58.61784362792969, "learning_rate": 1.4869378504115472e-05, "loss": 1.4553, "step": 2500 }, { "epoch": 0.7530569639129138, "grad_norm": 29.832626342773438, "learning_rate": 1.5018489800787308e-05, 
"loss": 1.1454, "step": 2525 }, { "epoch": 0.7605129734566061, "grad_norm": 43.455284118652344, "learning_rate": 1.5167601097459145e-05, "loss": 1.6609, "step": 2550 }, { "epoch": 0.7679689830002983, "grad_norm": 38.37151336669922, "learning_rate": 1.531671239413098e-05, "loss": 1.2618, "step": 2575 }, { "epoch": 0.7754249925439904, "grad_norm": 13.673453330993652, "learning_rate": 1.5465823690802816e-05, "loss": 1.2044, "step": 2600 }, { "epoch": 0.7828810020876826, "grad_norm": 38.352325439453125, "learning_rate": 1.5614934987474654e-05, "loss": 1.2281, "step": 2625 }, { "epoch": 0.7903370116313749, "grad_norm": 53.28618621826172, "learning_rate": 1.576404628414649e-05, "loss": 1.6322, "step": 2650 }, { "epoch": 0.7977930211750671, "grad_norm": 7.921282768249512, "learning_rate": 1.5913157580818326e-05, "loss": 1.06, "step": 2675 }, { "epoch": 0.8052490307187593, "grad_norm": 8.301175117492676, "learning_rate": 1.6062268877490158e-05, "loss": 1.0309, "step": 2700 }, { "epoch": 0.8127050402624515, "grad_norm": 62.71298599243164, "learning_rate": 1.6211380174161996e-05, "loss": 0.9688, "step": 2725 }, { "epoch": 0.8201610498061438, "grad_norm": 27.857236862182617, "learning_rate": 1.636049147083383e-05, "loss": 1.2604, "step": 2750 }, { "epoch": 0.827617059349836, "grad_norm": 38.996822357177734, "learning_rate": 1.6503638315638794e-05, "loss": 1.4185, "step": 2775 }, { "epoch": 0.8350730688935282, "grad_norm": 29.560543060302734, "learning_rate": 1.6652749612310632e-05, "loss": 1.6107, "step": 2800 }, { "epoch": 0.8425290784372204, "grad_norm": 17.703155517578125, "learning_rate": 1.6801860908982463e-05, "loss": 1.1194, "step": 2825 }, { "epoch": 0.8499850879809127, "grad_norm": 23.545133590698242, "learning_rate": 1.69509722056543e-05, "loss": 1.1722, "step": 2850 }, { "epoch": 0.8574410975246048, "grad_norm": 37.812320709228516, "learning_rate": 1.7100083502326136e-05, "loss": 1.6829, "step": 2875 }, { "epoch": 0.864897107068297, "grad_norm": 3.4065306186676025, 
"learning_rate": 1.724919479899797e-05, "loss": 1.3637, "step": 2900 }, { "epoch": 0.8723531166119892, "grad_norm": 52.31818771362305, "learning_rate": 1.739830609566981e-05, "loss": 1.4565, "step": 2925 }, { "epoch": 0.8798091261556815, "grad_norm": 3.736452341079712, "learning_rate": 1.7547417392341643e-05, "loss": 0.9151, "step": 2950 }, { "epoch": 0.8872651356993737, "grad_norm": 40.5432014465332, "learning_rate": 1.769652868901348e-05, "loss": 1.0446, "step": 2975 }, { "epoch": 0.8947211452430659, "grad_norm": 29.45325469970703, "learning_rate": 1.7845639985685316e-05, "loss": 1.0087, "step": 3000 }, { "epoch": 0.9021771547867581, "grad_norm": 22.604948043823242, "learning_rate": 1.7994751282357154e-05, "loss": 1.0438, "step": 3025 }, { "epoch": 0.9096331643304504, "grad_norm": 34.655269622802734, "learning_rate": 1.814386257902899e-05, "loss": 1.1603, "step": 3050 }, { "epoch": 0.9170891738741426, "grad_norm": 33.17934799194336, "learning_rate": 1.8292973875700823e-05, "loss": 0.9943, "step": 3075 }, { "epoch": 0.9245451834178348, "grad_norm": 11.942368507385254, "learning_rate": 1.844208517237266e-05, "loss": 1.0218, "step": 3100 }, { "epoch": 0.932001192961527, "grad_norm": 39.37548065185547, "learning_rate": 1.8591196469044496e-05, "loss": 1.4509, "step": 3125 }, { "epoch": 0.9394572025052192, "grad_norm": 16.941686630249023, "learning_rate": 1.874030776571633e-05, "loss": 0.9837, "step": 3150 }, { "epoch": 0.9469132120489114, "grad_norm": 8.383081436157227, "learning_rate": 1.8889419062388165e-05, "loss": 0.8842, "step": 3175 }, { "epoch": 0.9543692215926036, "grad_norm": 49.28553009033203, "learning_rate": 1.9038530359060003e-05, "loss": 1.2376, "step": 3200 }, { "epoch": 0.9618252311362958, "grad_norm": 12.15333366394043, "learning_rate": 1.9187641655731838e-05, "loss": 0.8409, "step": 3225 }, { "epoch": 0.9692812406799881, "grad_norm": 9.068802833557129, "learning_rate": 1.9336752952403676e-05, "loss": 1.17, "step": 3250 }, { "epoch": 
0.9767372502236803, "grad_norm": 15.055663108825684, "learning_rate": 1.948586424907551e-05, "loss": 0.9645, "step": 3275 }, { "epoch": 0.9841932597673725, "grad_norm": 25.818368911743164, "learning_rate": 1.963497554574735e-05, "loss": 0.9591, "step": 3300 }, { "epoch": 0.9916492693110647, "grad_norm": 22.713308334350586, "learning_rate": 1.9784086842419183e-05, "loss": 1.187, "step": 3325 }, { "epoch": 0.999105278854757, "grad_norm": 16.34677505493164, "learning_rate": 1.9933198139091018e-05, "loss": 0.8725, "step": 3350 }, { "epoch": 1.0, "eval_gen_len": 8.8074, "eval_loss": 0.8921580910682678, "eval_rouge1": 70.8609, "eval_rouge2": 52.0236, "eval_rougeL": 70.7329, "eval_rougeLsum": 70.7231, "eval_runtime": 95.4163, "eval_samples_per_second": 17.576, "eval_steps_per_second": 4.402, "step": 3353 }, { "epoch": 1.0065612883984492, "grad_norm": 18.03757667541504, "learning_rate": 2.0082309435762856e-05, "loss": 1.0941, "step": 3375 }, { "epoch": 1.0140172979421413, "grad_norm": 16.70610809326172, "learning_rate": 2.023142073243469e-05, "loss": 0.9132, "step": 3400 }, { "epoch": 1.0214733074858335, "grad_norm": 6.246175289154053, "learning_rate": 2.0380532029106525e-05, "loss": 0.892, "step": 3425 }, { "epoch": 1.0289293170295257, "grad_norm": 24.790555953979492, "learning_rate": 2.052964332577836e-05, "loss": 0.7534, "step": 3450 }, { "epoch": 1.036385326573218, "grad_norm": 31.111759185791016, "learning_rate": 2.0678754622450198e-05, "loss": 0.6501, "step": 3475 }, { "epoch": 1.0438413361169103, "grad_norm": 29.69679069519043, "learning_rate": 2.0827865919122033e-05, "loss": 0.6393, "step": 3500 }, { "epoch": 1.0512973456606025, "grad_norm": 31.25632095336914, "learning_rate": 2.097697721579387e-05, "loss": 1.1405, "step": 3525 }, { "epoch": 1.0587533552042947, "grad_norm": 15.574368476867676, "learning_rate": 2.1126088512465705e-05, "loss": 0.9955, "step": 3550 }, { "epoch": 1.0662093647479869, "grad_norm": 4.4662370681762695, "learning_rate": 
2.127519980913754e-05, "loss": 1.0895, "step": 3575 }, { "epoch": 1.073665374291679, "grad_norm": 36.54032516479492, "learning_rate": 2.1424311105809378e-05, "loss": 0.6859, "step": 3600 }, { "epoch": 1.0811213838353713, "grad_norm": 17.539880752563477, "learning_rate": 2.1573422402481213e-05, "loss": 0.7532, "step": 3625 }, { "epoch": 1.0885773933790635, "grad_norm": 9.007013320922852, "learning_rate": 2.172253369915305e-05, "loss": 0.6377, "step": 3650 }, { "epoch": 1.0960334029227556, "grad_norm": 32.22686004638672, "learning_rate": 2.1871644995824885e-05, "loss": 0.8879, "step": 3675 }, { "epoch": 1.103489412466448, "grad_norm": 18.83819580078125, "learning_rate": 2.202075629249672e-05, "loss": 0.8378, "step": 3700 }, { "epoch": 1.1109454220101402, "grad_norm": 27.475950241088867, "learning_rate": 2.2169867589168555e-05, "loss": 1.0073, "step": 3725 }, { "epoch": 1.1184014315538324, "grad_norm": 36.23466110229492, "learning_rate": 2.2318978885840393e-05, "loss": 0.6546, "step": 3750 }, { "epoch": 1.1258574410975246, "grad_norm": 23.86202049255371, "learning_rate": 2.2468090182512227e-05, "loss": 0.9016, "step": 3775 }, { "epoch": 1.1333134506412168, "grad_norm": 1.7889130115509033, "learning_rate": 2.2617201479184062e-05, "loss": 0.9408, "step": 3800 }, { "epoch": 1.140769460184909, "grad_norm": 11.396666526794434, "learning_rate": 2.27663127758559e-05, "loss": 0.843, "step": 3825 }, { "epoch": 1.1482254697286012, "grad_norm": 5.5733113288879395, "learning_rate": 2.2915424072527735e-05, "loss": 0.8923, "step": 3850 }, { "epoch": 1.1556814792722934, "grad_norm": 23.367023468017578, "learning_rate": 2.3064535369199573e-05, "loss": 0.5774, "step": 3875 }, { "epoch": 1.1631374888159858, "grad_norm": 33.5429573059082, "learning_rate": 2.3213646665871407e-05, "loss": 0.9683, "step": 3900 }, { "epoch": 1.170593498359678, "grad_norm": 37.91108703613281, "learning_rate": 2.3362757962543245e-05, "loss": 0.8493, "step": 3925 }, { "epoch": 1.1780495079033702, "grad_norm": 
22.969072341918945, "learning_rate": 2.351186925921508e-05, "loss": 1.0987, "step": 3950 }, { "epoch": 1.1855055174470623, "grad_norm": 29.6101131439209, "learning_rate": 2.3660980555886915e-05, "loss": 0.8729, "step": 3975 }, { "epoch": 1.1929615269907545, "grad_norm": 9.252951622009277, "learning_rate": 2.381009185255875e-05, "loss": 0.6307, "step": 4000 }, { "epoch": 1.2004175365344467, "grad_norm": 20.931673049926758, "learning_rate": 2.3959203149230587e-05, "loss": 0.6223, "step": 4025 }, { "epoch": 1.207873546078139, "grad_norm": 14.8717679977417, "learning_rate": 2.4108314445902422e-05, "loss": 0.9005, "step": 4050 }, { "epoch": 1.2153295556218313, "grad_norm": 45.239173889160156, "learning_rate": 2.4257425742574257e-05, "loss": 0.9411, "step": 4075 }, { "epoch": 1.2227855651655235, "grad_norm": 2.455791473388672, "learning_rate": 2.4406537039246095e-05, "loss": 0.6699, "step": 4100 }, { "epoch": 1.2302415747092157, "grad_norm": 9.25529956817627, "learning_rate": 2.455564833591793e-05, "loss": 0.9491, "step": 4125 }, { "epoch": 1.2376975842529079, "grad_norm": 13.52274227142334, "learning_rate": 2.4704759632589767e-05, "loss": 1.1475, "step": 4150 }, { "epoch": 1.2451535937966, "grad_norm": 1.3682821989059448, "learning_rate": 2.4853870929261602e-05, "loss": 0.6599, "step": 4175 }, { "epoch": 1.2526096033402923, "grad_norm": 41.495574951171875, "learning_rate": 2.5002982225933437e-05, "loss": 0.8708, "step": 4200 }, { "epoch": 1.2600656128839844, "grad_norm": 10.83969497680664, "learning_rate": 2.5152093522605275e-05, "loss": 0.6794, "step": 4225 }, { "epoch": 1.2675216224276766, "grad_norm": 24.65216827392578, "learning_rate": 2.530120481927711e-05, "loss": 0.5716, "step": 4250 }, { "epoch": 1.2749776319713688, "grad_norm": 25.4478759765625, "learning_rate": 2.5450316115948947e-05, "loss": 0.8103, "step": 4275 }, { "epoch": 1.2824336415150612, "grad_norm": 35.4855842590332, "learning_rate": 2.559942741262078e-05, "loss": 0.9094, "step": 4300 }, { "epoch": 
1.2898896510587534, "grad_norm": 24.363826751708984, "learning_rate": 2.574853870929262e-05, "loss": 0.9492, "step": 4325 }, { "epoch": 1.2973456606024456, "grad_norm": 0.5661748051643372, "learning_rate": 2.589765000596445e-05, "loss": 0.895, "step": 4350 }, { "epoch": 1.3048016701461378, "grad_norm": 38.63605499267578, "learning_rate": 2.6046761302636293e-05, "loss": 0.7079, "step": 4375 }, { "epoch": 1.31225767968983, "grad_norm": 25.456178665161133, "learning_rate": 2.6195872599308124e-05, "loss": 0.7659, "step": 4400 }, { "epoch": 1.3197136892335222, "grad_norm": 16.090669631958008, "learning_rate": 2.6344983895979962e-05, "loss": 0.6873, "step": 4425 }, { "epoch": 1.3271696987772144, "grad_norm": 3.9270734786987305, "learning_rate": 2.6494095192651797e-05, "loss": 0.7105, "step": 4450 }, { "epoch": 1.3346257083209068, "grad_norm": 22.171485900878906, "learning_rate": 2.664320648932363e-05, "loss": 0.6842, "step": 4475 }, { "epoch": 1.3420817178645987, "grad_norm": 9.457509994506836, "learning_rate": 2.679231778599547e-05, "loss": 0.8012, "step": 4500 }, { "epoch": 1.3495377274082911, "grad_norm": 33.985633850097656, "learning_rate": 2.69414290826673e-05, "loss": 0.8246, "step": 4525 }, { "epoch": 1.3569937369519833, "grad_norm": 21.28675079345703, "learning_rate": 2.7090540379339142e-05, "loss": 0.7623, "step": 4550 }, { "epoch": 1.3644497464956755, "grad_norm": 47.42694091796875, "learning_rate": 2.7239651676010973e-05, "loss": 1.1292, "step": 4575 }, { "epoch": 1.3719057560393677, "grad_norm": 25.696435928344727, "learning_rate": 2.7388762972682815e-05, "loss": 0.7367, "step": 4600 }, { "epoch": 1.37936176558306, "grad_norm": 43.43827819824219, "learning_rate": 2.7537874269354646e-05, "loss": 0.8869, "step": 4625 }, { "epoch": 1.386817775126752, "grad_norm": 26.207239151000977, "learning_rate": 2.7686985566026484e-05, "loss": 1.0223, "step": 4650 }, { "epoch": 1.3942737846704443, "grad_norm": 0.6299751996994019, "learning_rate": 2.783609686269832e-05, 
"loss": 0.4644, "step": 4675 }, { "epoch": 1.4017297942141367, "grad_norm": 4.853407382965088, "learning_rate": 2.7985208159370157e-05, "loss": 0.7567, "step": 4700 }, { "epoch": 1.4091858037578289, "grad_norm": 2.0797219276428223, "learning_rate": 2.813431945604199e-05, "loss": 0.8935, "step": 4725 }, { "epoch": 1.416641813301521, "grad_norm": 4.396637916564941, "learning_rate": 2.8283430752713826e-05, "loss": 0.6973, "step": 4750 }, { "epoch": 1.4240978228452132, "grad_norm": 27.564001083374023, "learning_rate": 2.8432542049385664e-05, "loss": 0.7045, "step": 4775 }, { "epoch": 1.4315538323889054, "grad_norm": 11.872142791748047, "learning_rate": 2.8581653346057495e-05, "loss": 0.7692, "step": 4800 }, { "epoch": 1.4390098419325976, "grad_norm": 14.561450958251953, "learning_rate": 2.8730764642729337e-05, "loss": 0.533, "step": 4825 }, { "epoch": 1.4464658514762898, "grad_norm": 5.9031147956848145, "learning_rate": 2.8879875939401168e-05, "loss": 0.6997, "step": 4850 }, { "epoch": 1.4539218610199822, "grad_norm": 17.50626564025879, "learning_rate": 2.9028987236073006e-05, "loss": 0.5615, "step": 4875 }, { "epoch": 1.4613778705636742, "grad_norm": 1.092649221420288, "learning_rate": 2.917809853274484e-05, "loss": 0.834, "step": 4900 }, { "epoch": 1.4688338801073666, "grad_norm": 1.7949668169021606, "learning_rate": 2.932720982941668e-05, "loss": 0.7914, "step": 4925 }, { "epoch": 1.4762898896510588, "grad_norm": 45.338478088378906, "learning_rate": 2.9476321126088513e-05, "loss": 0.838, "step": 4950 }, { "epoch": 1.483745899194751, "grad_norm": 42.90887451171875, "learning_rate": 2.962543242276035e-05, "loss": 0.5953, "step": 4975 }, { "epoch": 1.4912019087384432, "grad_norm": 27.481857299804688, "learning_rate": 2.9774543719432186e-05, "loss": 0.644, "step": 5000 }, { "epoch": 1.4986579182821353, "grad_norm": 12.581745147705078, "learning_rate": 2.9923655016104024e-05, "loss": 0.7452, "step": 5025 }, { "epoch": 1.5061139278258278, "grad_norm": 6.364358425140381, 
"learning_rate": 3.007276631277586e-05, "loss": 0.5567, "step": 5050 }, { "epoch": 1.5135699373695197, "grad_norm": 16.981321334838867, "learning_rate": 3.022187760944769e-05, "loss": 0.3983, "step": 5075 }, { "epoch": 1.5210259469132121, "grad_norm": 41.80778503417969, "learning_rate": 3.037098890611953e-05, "loss": 0.6953, "step": 5100 }, { "epoch": 1.528481956456904, "grad_norm": 12.307695388793945, "learning_rate": 3.052010020279136e-05, "loss": 0.8288, "step": 5125 }, { "epoch": 1.5359379660005965, "grad_norm": 48.170352935791016, "learning_rate": 3.06692114994632e-05, "loss": 0.7873, "step": 5150 }, { "epoch": 1.5433939755442887, "grad_norm": 11.568841934204102, "learning_rate": 3.081832279613504e-05, "loss": 0.8033, "step": 5175 }, { "epoch": 1.5508499850879809, "grad_norm": 37.305301666259766, "learning_rate": 3.0967434092806877e-05, "loss": 0.7461, "step": 5200 }, { "epoch": 1.558305994631673, "grad_norm": 47.25547790527344, "learning_rate": 3.111654538947871e-05, "loss": 0.7598, "step": 5225 }, { "epoch": 1.5657620041753653, "grad_norm": 45.243309020996094, "learning_rate": 3.1265656686150546e-05, "loss": 0.5867, "step": 5250 }, { "epoch": 1.5732180137190577, "grad_norm": 10.84839916229248, "learning_rate": 3.141476798282238e-05, "loss": 0.7368, "step": 5275 }, { "epoch": 1.5806740232627496, "grad_norm": 31.785808563232422, "learning_rate": 3.1563879279494215e-05, "loss": 0.8298, "step": 5300 }, { "epoch": 1.588130032806442, "grad_norm": 28.766090393066406, "learning_rate": 3.171299057616605e-05, "loss": 0.7021, "step": 5325 }, { "epoch": 1.5955860423501342, "grad_norm": 7.013548374176025, "learning_rate": 3.1862101872837884e-05, "loss": 0.6686, "step": 5350 }, { "epoch": 1.6030420518938264, "grad_norm": 19.23529052734375, "learning_rate": 3.201121316950972e-05, "loss": 0.9054, "step": 5375 }, { "epoch": 1.6104980614375186, "grad_norm": 19.3592529296875, "learning_rate": 3.216032446618156e-05, "loss": 0.7062, "step": 5400 }, { "epoch": 1.6179540709812108, 
"grad_norm": 20.657779693603516, "learning_rate": 3.23094357628534e-05, "loss": 0.6802, "step": 5425 }, { "epoch": 1.6254100805249032, "grad_norm": 27.61529541015625, "learning_rate": 3.245854705952523e-05, "loss": 0.7271, "step": 5450 }, { "epoch": 1.6328660900685952, "grad_norm": 15.712939262390137, "learning_rate": 3.260765835619707e-05, "loss": 0.5594, "step": 5475 }, { "epoch": 1.6403220996122876, "grad_norm": 45.1962776184082, "learning_rate": 3.27567696528689e-05, "loss": 0.8737, "step": 5500 }, { "epoch": 1.6477781091559796, "grad_norm": 27.638429641723633, "learning_rate": 3.2905880949540744e-05, "loss": 0.5651, "step": 5525 }, { "epoch": 1.655234118699672, "grad_norm": 9.217710494995117, "learning_rate": 3.3054992246212575e-05, "loss": 0.4875, "step": 5550 }, { "epoch": 1.6626901282433642, "grad_norm": 34.59236145019531, "learning_rate": 3.320410354288441e-05, "loss": 0.5334, "step": 5575 }, { "epoch": 1.6701461377870563, "grad_norm": 26.130611419677734, "learning_rate": 3.3353214839556244e-05, "loss": 0.5423, "step": 5600 }, { "epoch": 1.6776021473307487, "grad_norm": 28.84840965270996, "learning_rate": 3.350232613622808e-05, "loss": 0.8482, "step": 5625 }, { "epoch": 1.6850581568744407, "grad_norm": 27.17482566833496, "learning_rate": 3.365143743289992e-05, "loss": 0.7935, "step": 5650 }, { "epoch": 1.6925141664181331, "grad_norm": 8.833431243896484, "learning_rate": 3.380054872957175e-05, "loss": 0.7983, "step": 5675 }, { "epoch": 1.699970175961825, "grad_norm": 15.625532150268555, "learning_rate": 3.394966002624359e-05, "loss": 0.5971, "step": 5700 }, { "epoch": 1.7074261855055175, "grad_norm": 24.900188446044922, "learning_rate": 3.409877132291542e-05, "loss": 0.6406, "step": 5725 }, { "epoch": 1.7148821950492097, "grad_norm": 36.51063919067383, "learning_rate": 3.4247882619587266e-05, "loss": 0.5764, "step": 5750 }, { "epoch": 1.7223382045929019, "grad_norm": 33.81806564331055, "learning_rate": 3.43969939162591e-05, "loss": 0.6808, "step": 5775 }, { 
"epoch": 1.729794214136594, "grad_norm": 21.35527801513672, "learning_rate": 3.4546105212930935e-05, "loss": 0.6032, "step": 5800 }, { "epoch": 1.7372502236802863, "grad_norm": 0.8462117314338684, "learning_rate": 3.4695216509602766e-05, "loss": 0.7005, "step": 5825 }, { "epoch": 1.7447062332239787, "grad_norm": 13.562004089355469, "learning_rate": 3.4844327806274604e-05, "loss": 0.4941, "step": 5850 }, { "epoch": 1.7521622427676706, "grad_norm": 34.314453125, "learning_rate": 3.499343910294644e-05, "loss": 0.5255, "step": 5875 }, { "epoch": 1.759618252311363, "grad_norm": 53.929893493652344, "learning_rate": 3.514255039961828e-05, "loss": 0.5339, "step": 5900 }, { "epoch": 1.7670742618550552, "grad_norm": 0.5989261269569397, "learning_rate": 3.529166169629011e-05, "loss": 0.7101, "step": 5925 }, { "epoch": 1.7745302713987474, "grad_norm": 12.922759056091309, "learning_rate": 3.544077299296194e-05, "loss": 0.6036, "step": 5950 }, { "epoch": 1.7819862809424396, "grad_norm": 38.75687026977539, "learning_rate": 3.558988428963379e-05, "loss": 0.6559, "step": 5975 }, { "epoch": 1.7894422904861318, "grad_norm": 48.64384078979492, "learning_rate": 3.573899558630562e-05, "loss": 0.5848, "step": 6000 }, { "epoch": 1.7968983000298242, "grad_norm": 24.728282928466797, "learning_rate": 3.588810688297746e-05, "loss": 0.4153, "step": 6025 }, { "epoch": 1.8043543095735162, "grad_norm": 34.48488998413086, "learning_rate": 3.603721817964929e-05, "loss": 0.478, "step": 6050 }, { "epoch": 1.8118103191172086, "grad_norm": 25.55767250061035, "learning_rate": 3.6186329476321126e-05, "loss": 0.6322, "step": 6075 }, { "epoch": 1.8192663286609005, "grad_norm": 30.601152420043945, "learning_rate": 3.6335440772992964e-05, "loss": 0.6907, "step": 6100 }, { "epoch": 1.826722338204593, "grad_norm": 9.281543731689453, "learning_rate": 3.64845520696648e-05, "loss": 0.7105, "step": 6125 }, { "epoch": 1.8341783477482851, "grad_norm": 0.9574500918388367, "learning_rate": 3.6633663366336634e-05, 
"loss": 0.7844, "step": 6150 }, { "epoch": 1.8416343572919773, "grad_norm": 6.887031078338623, "learning_rate": 3.678277466300847e-05, "loss": 0.7604, "step": 6175 }, { "epoch": 1.8490903668356695, "grad_norm": 15.275524139404297, "learning_rate": 3.693188595968031e-05, "loss": 0.453, "step": 6200 }, { "epoch": 1.8565463763793617, "grad_norm": 33.609596252441406, "learning_rate": 3.708099725635215e-05, "loss": 0.9226, "step": 6225 }, { "epoch": 1.8640023859230541, "grad_norm": 12.9977388381958, "learning_rate": 3.723010855302398e-05, "loss": 0.6253, "step": 6250 }, { "epoch": 1.871458395466746, "grad_norm": 1.9839307069778442, "learning_rate": 3.737921984969581e-05, "loss": 0.5391, "step": 6275 }, { "epoch": 1.8789144050104385, "grad_norm": 48.89108657836914, "learning_rate": 3.752833114636765e-05, "loss": 0.5714, "step": 6300 }, { "epoch": 1.8863704145541307, "grad_norm": 45.01472091674805, "learning_rate": 3.7677442443039486e-05, "loss": 0.7613, "step": 6325 }, { "epoch": 1.8938264240978229, "grad_norm": 16.330175399780273, "learning_rate": 3.7826553739711324e-05, "loss": 0.4172, "step": 6350 }, { "epoch": 1.901282433641515, "grad_norm": 26.01683235168457, "learning_rate": 3.7975665036383156e-05, "loss": 0.4404, "step": 6375 }, { "epoch": 1.9087384431852072, "grad_norm": 2.129399061203003, "learning_rate": 3.8124776333054994e-05, "loss": 0.4257, "step": 6400 }, { "epoch": 1.9161944527288997, "grad_norm": 15.925755500793457, "learning_rate": 3.827388762972683e-05, "loss": 0.4837, "step": 6425 }, { "epoch": 1.9236504622725916, "grad_norm": 57.3784065246582, "learning_rate": 3.842299892639867e-05, "loss": 0.5822, "step": 6450 }, { "epoch": 1.931106471816284, "grad_norm": 20.885623931884766, "learning_rate": 3.85721102230705e-05, "loss": 0.5244, "step": 6475 }, { "epoch": 1.938562481359976, "grad_norm": 14.874778747558594, "learning_rate": 3.872122151974234e-05, "loss": 0.7411, "step": 6500 }, { "epoch": 1.9460184909036684, "grad_norm": 35.995628356933594, 
"learning_rate": 3.887033281641417e-05, "loss": 0.6805, "step": 6525 }, { "epoch": 1.9534745004473606, "grad_norm": 18.882844924926758, "learning_rate": 3.901944411308601e-05, "loss": 0.2871, "step": 6550 }, { "epoch": 1.9609305099910528, "grad_norm": 18.62187385559082, "learning_rate": 3.9168555409757846e-05, "loss": 0.4653, "step": 6575 }, { "epoch": 1.968386519534745, "grad_norm": 6.905665397644043, "learning_rate": 3.931766670642968e-05, "loss": 0.6991, "step": 6600 }, { "epoch": 1.9758425290784372, "grad_norm": 14.318357467651367, "learning_rate": 3.9466778003101516e-05, "loss": 0.6759, "step": 6625 }, { "epoch": 1.9832985386221296, "grad_norm": 16.711145401000977, "learning_rate": 3.9615889299773354e-05, "loss": 0.6427, "step": 6650 }, { "epoch": 1.9907545481658215, "grad_norm": 16.86481475830078, "learning_rate": 3.976500059644519e-05, "loss": 0.3138, "step": 6675 }, { "epoch": 1.998210557709514, "grad_norm": 37.8138313293457, "learning_rate": 3.991411189311702e-05, "loss": 0.6059, "step": 6700 }, { "epoch": 2.0, "eval_gen_len": 8.746, "eval_loss": 0.5014411211013794, "eval_rouge1": 77.2572, "eval_rouge2": 60.0488, "eval_rougeL": 77.0876, "eval_rougeLsum": 77.0478, "eval_runtime": 96.4359, "eval_samples_per_second": 17.39, "eval_steps_per_second": 4.355, "step": 6706 }, { "epoch": 2.005666567253206, "grad_norm": 7.6668219566345215, "learning_rate": 4.006322318978886e-05, "loss": 0.3362, "step": 6725 }, { "epoch": 2.0131225767968983, "grad_norm": 13.261967658996582, "learning_rate": 4.021233448646069e-05, "loss": 0.4647, "step": 6750 }, { "epoch": 2.0205785863405907, "grad_norm": 36.11294937133789, "learning_rate": 4.036144578313254e-05, "loss": 0.3214, "step": 6775 }, { "epoch": 2.0280345958842827, "grad_norm": 19.81526756286621, "learning_rate": 4.051055707980437e-05, "loss": 0.311, "step": 6800 }, { "epoch": 2.035490605427975, "grad_norm": 9.729710578918457, "learning_rate": 4.0659668376476206e-05, "loss": 0.4271, "step": 6825 }, { "epoch": 
2.042946614971667, "grad_norm": 20.228961944580078, "learning_rate": 4.080877967314804e-05, "loss": 0.419, "step": 6850 }, { "epoch": 2.0504026245153595, "grad_norm": 1.6113264560699463, "learning_rate": 4.0957890969819876e-05, "loss": 0.4115, "step": 6875 }, { "epoch": 2.0578586340590515, "grad_norm": 11.554544448852539, "learning_rate": 4.1107002266491714e-05, "loss": 0.492, "step": 6900 }, { "epoch": 2.065314643602744, "grad_norm": 46.82448959350586, "learning_rate": 4.1256113563163545e-05, "loss": 0.3516, "step": 6925 }, { "epoch": 2.072770653146436, "grad_norm": 9.364810943603516, "learning_rate": 4.140522485983538e-05, "loss": 0.3212, "step": 6950 }, { "epoch": 2.0802266626901282, "grad_norm": 21.646303176879883, "learning_rate": 4.1554336156507214e-05, "loss": 0.5081, "step": 6975 }, { "epoch": 2.0876826722338206, "grad_norm": 9.770012855529785, "learning_rate": 4.170344745317906e-05, "loss": 0.2004, "step": 7000 }, { "epoch": 2.0951386817775126, "grad_norm": 0.1513725370168686, "learning_rate": 4.185255874985089e-05, "loss": 0.4032, "step": 7025 }, { "epoch": 2.102594691321205, "grad_norm": 26.403615951538086, "learning_rate": 4.200167004652273e-05, "loss": 0.2091, "step": 7050 }, { "epoch": 2.110050700864897, "grad_norm": 11.628287315368652, "learning_rate": 4.215078134319456e-05, "loss": 0.406, "step": 7075 }, { "epoch": 2.1175067104085894, "grad_norm": 24.525611877441406, "learning_rate": 4.22998926398664e-05, "loss": 0.4365, "step": 7100 }, { "epoch": 2.1249627199522814, "grad_norm": 31.789121627807617, "learning_rate": 4.2449003936538236e-05, "loss": 0.3611, "step": 7125 }, { "epoch": 2.1324187294959738, "grad_norm": 21.67193603515625, "learning_rate": 4.259811523321007e-05, "loss": 0.3525, "step": 7150 }, { "epoch": 2.1398747390396657, "grad_norm": 2.3932220935821533, "learning_rate": 4.2747226529881905e-05, "loss": 0.3076, "step": 7175 }, { "epoch": 2.147330748583358, "grad_norm": 24.538375854492188, "learning_rate": 4.289633782655374e-05, "loss": 
0.4386, "step": 7200 }, { "epoch": 2.1547867581270506, "grad_norm": 4.2655229568481445, "learning_rate": 4.304544912322558e-05, "loss": 0.2893, "step": 7225 }, { "epoch": 2.1622427676707425, "grad_norm": 19.003803253173828, "learning_rate": 4.319456041989741e-05, "loss": 0.4218, "step": 7250 }, { "epoch": 2.169698777214435, "grad_norm": 0.4998781681060791, "learning_rate": 4.334367171656925e-05, "loss": 0.3635, "step": 7275 }, { "epoch": 2.177154786758127, "grad_norm": 10.384448051452637, "learning_rate": 4.349278301324108e-05, "loss": 0.4281, "step": 7300 }, { "epoch": 2.1846107963018193, "grad_norm": 14.134237289428711, "learning_rate": 4.3641894309912926e-05, "loss": 0.4452, "step": 7325 }, { "epoch": 2.1920668058455113, "grad_norm": 11.797569274902344, "learning_rate": 4.379100560658476e-05, "loss": 0.4506, "step": 7350 }, { "epoch": 2.1995228153892037, "grad_norm": 27.334152221679688, "learning_rate": 4.3940116903256596e-05, "loss": 0.3131, "step": 7375 }, { "epoch": 2.206978824932896, "grad_norm": 0.558601438999176, "learning_rate": 4.408922819992843e-05, "loss": 0.2482, "step": 7400 }, { "epoch": 2.214434834476588, "grad_norm": 22.792238235473633, "learning_rate": 4.4238339496600265e-05, "loss": 0.4553, "step": 7425 }, { "epoch": 2.2218908440202805, "grad_norm": 11.284687042236328, "learning_rate": 4.43874507932721e-05, "loss": 0.4019, "step": 7450 }, { "epoch": 2.2293468535639724, "grad_norm": 31.599943161010742, "learning_rate": 4.4536562089943934e-05, "loss": 0.4052, "step": 7475 }, { "epoch": 2.236802863107665, "grad_norm": 22.682697296142578, "learning_rate": 4.468567338661577e-05, "loss": 0.3055, "step": 7500 }, { "epoch": 2.244258872651357, "grad_norm": 9.347318649291992, "learning_rate": 4.4834784683287603e-05, "loss": 0.3473, "step": 7525 }, { "epoch": 2.2517148821950492, "grad_norm": 7.00664758682251, "learning_rate": 4.498389597995945e-05, "loss": 0.481, "step": 7550 }, { "epoch": 2.2591708917387416, "grad_norm": 12.196782112121582, 
"learning_rate": 4.513300727663128e-05, "loss": 0.2791, "step": 7575 }, { "epoch": 2.2666269012824336, "grad_norm": 1.9481580257415771, "learning_rate": 4.528211857330312e-05, "loss": 0.5509, "step": 7600 }, { "epoch": 2.274082910826126, "grad_norm": 5.687070846557617, "learning_rate": 4.543122986997495e-05, "loss": 0.438, "step": 7625 }, { "epoch": 2.281538920369818, "grad_norm": 33.834632873535156, "learning_rate": 4.558034116664679e-05, "loss": 0.3629, "step": 7650 }, { "epoch": 2.2889949299135104, "grad_norm": 13.697815895080566, "learning_rate": 4.5729452463318625e-05, "loss": 0.3274, "step": 7675 }, { "epoch": 2.2964509394572024, "grad_norm": 13.013066291809082, "learning_rate": 4.587856375999046e-05, "loss": 0.3808, "step": 7700 }, { "epoch": 2.3039069490008948, "grad_norm": 18.45660972595215, "learning_rate": 4.6027675056662294e-05, "loss": 0.4417, "step": 7725 }, { "epoch": 2.3113629585445867, "grad_norm": 16.66852569580078, "learning_rate": 4.6176786353334125e-05, "loss": 0.4158, "step": 7750 }, { "epoch": 2.318818968088279, "grad_norm": 19.178466796875, "learning_rate": 4.632589765000597e-05, "loss": 0.5195, "step": 7775 }, { "epoch": 2.3262749776319716, "grad_norm": 66.8802490234375, "learning_rate": 4.64750089466778e-05, "loss": 0.4877, "step": 7800 }, { "epoch": 2.3337309871756635, "grad_norm": 2.8636233806610107, "learning_rate": 4.662412024334964e-05, "loss": 0.3721, "step": 7825 }, { "epoch": 2.341186996719356, "grad_norm": 11.779047012329102, "learning_rate": 4.677323154002147e-05, "loss": 0.406, "step": 7850 }, { "epoch": 2.348643006263048, "grad_norm": 8.003138542175293, "learning_rate": 4.692234283669331e-05, "loss": 0.404, "step": 7875 }, { "epoch": 2.3560990158067403, "grad_norm": 15.239642143249512, "learning_rate": 4.707145413336515e-05, "loss": 0.3539, "step": 7900 }, { "epoch": 2.3635550253504323, "grad_norm": 6.205272197723389, "learning_rate": 4.7220565430036985e-05, "loss": 0.4542, "step": 7925 }, { "epoch": 2.3710110348941247, 
"grad_norm": 15.588403701782227, "learning_rate": 4.7369676726708816e-05, "loss": 0.5109, "step": 7950 }, { "epoch": 2.3784670444378166, "grad_norm": 10.954691886901855, "learning_rate": 4.7518788023380654e-05, "loss": 0.3682, "step": 7975 }, { "epoch": 2.385923053981509, "grad_norm": 2.190718412399292, "learning_rate": 4.766789932005249e-05, "loss": 0.4104, "step": 8000 }, { "epoch": 2.3933790635252015, "grad_norm": 2.808455467224121, "learning_rate": 4.7817010616724323e-05, "loss": 0.8046, "step": 8025 }, { "epoch": 2.4008350730688934, "grad_norm": 17.908613204956055, "learning_rate": 4.796612191339616e-05, "loss": 0.6078, "step": 8050 }, { "epoch": 2.408291082612586, "grad_norm": 28.843978881835938, "learning_rate": 4.811523321006799e-05, "loss": 0.3912, "step": 8075 }, { "epoch": 2.415747092156278, "grad_norm": 31.94698715209961, "learning_rate": 4.826434450673983e-05, "loss": 0.452, "step": 8100 }, { "epoch": 2.42320310169997, "grad_norm": 26.91057586669922, "learning_rate": 4.841345580341167e-05, "loss": 0.4344, "step": 8125 }, { "epoch": 2.4306591112436626, "grad_norm": 10.383529663085938, "learning_rate": 4.856256710008351e-05, "loss": 0.4195, "step": 8150 }, { "epoch": 2.4381151207873546, "grad_norm": 24.33693504333496, "learning_rate": 4.871167839675534e-05, "loss": 0.4018, "step": 8175 }, { "epoch": 2.445571130331047, "grad_norm": 11.268268585205078, "learning_rate": 4.8860789693427176e-05, "loss": 0.4334, "step": 8200 }, { "epoch": 2.453027139874739, "grad_norm": 41.07155990600586, "learning_rate": 4.9009900990099014e-05, "loss": 0.3055, "step": 8225 }, { "epoch": 2.4604831494184314, "grad_norm": 10.522299766540527, "learning_rate": 4.915901228677085e-05, "loss": 0.4488, "step": 8250 }, { "epoch": 2.4679391589621233, "grad_norm": 26.76223373413086, "learning_rate": 4.9308123583442683e-05, "loss": 0.3703, "step": 8275 }, { "epoch": 2.4753951685058158, "grad_norm": 16.908048629760742, "learning_rate": 4.945723488011452e-05, "loss": 0.4244, "step": 8300 }, 
{ "epoch": 2.4828511780495077, "grad_norm": 20.12076759338379, "learning_rate": 4.960634617678635e-05, "loss": 0.4508, "step": 8325 }, { "epoch": 2.4903071875932, "grad_norm": 60.24528503417969, "learning_rate": 4.975545747345819e-05, "loss": 0.4754, "step": 8350 }, { "epoch": 2.4977631971368925, "grad_norm": 11.230194091796875, "learning_rate": 4.990456877013003e-05, "loss": 0.2251, "step": 8375 }, { "epoch": 2.5052192066805845, "grad_norm": 7.123372554779053, "learning_rate": 4.999403515283264e-05, "loss": 0.528, "step": 8400 }, { "epoch": 2.512675216224277, "grad_norm": 0.412019282579422, "learning_rate": 4.997746613292331e-05, "loss": 0.5279, "step": 8425 }, { "epoch": 2.520131225767969, "grad_norm": 40.20630645751953, "learning_rate": 4.996089711301397e-05, "loss": 0.4045, "step": 8450 }, { "epoch": 2.5275872353116613, "grad_norm": 71.36479949951172, "learning_rate": 4.9944328093104634e-05, "loss": 0.5081, "step": 8475 }, { "epoch": 2.5350432448553533, "grad_norm": 30.50684356689453, "learning_rate": 4.992775907319531e-05, "loss": 0.322, "step": 8500 }, { "epoch": 2.5424992543990457, "grad_norm": 2.659013271331787, "learning_rate": 4.991119005328597e-05, "loss": 0.3447, "step": 8525 }, { "epoch": 2.5499552639427376, "grad_norm": 0.8650076985359192, "learning_rate": 4.9894621033376634e-05, "loss": 0.4414, "step": 8550 }, { "epoch": 2.55741127348643, "grad_norm": 18.277727127075195, "learning_rate": 4.9878052013467304e-05, "loss": 0.5041, "step": 8575 }, { "epoch": 2.5648672830301225, "grad_norm": 13.272942543029785, "learning_rate": 4.9861482993557966e-05, "loss": 0.4364, "step": 8600 }, { "epoch": 2.5723232925738144, "grad_norm": 3.7234790325164795, "learning_rate": 4.9844913973648635e-05, "loss": 0.3505, "step": 8625 }, { "epoch": 2.579779302117507, "grad_norm": 1.593865156173706, "learning_rate": 4.98283449537393e-05, "loss": 0.4215, "step": 8650 }, { "epoch": 2.587235311661199, "grad_norm": 32.194034576416016, "learning_rate": 4.981177593382996e-05, "loss": 
0.4747, "step": 8675 }, { "epoch": 2.594691321204891, "grad_norm": 15.71013355255127, "learning_rate": 4.979520691392063e-05, "loss": 0.2939, "step": 8700 }, { "epoch": 2.6021473307485836, "grad_norm": 13.419342041015625, "learning_rate": 4.97786378940113e-05, "loss": 0.2265, "step": 8725 }, { "epoch": 2.6096033402922756, "grad_norm": 12.071022033691406, "learning_rate": 4.976206887410196e-05, "loss": 0.3058, "step": 8750 }, { "epoch": 2.6170593498359676, "grad_norm": 27.79282569885254, "learning_rate": 4.974549985419263e-05, "loss": 0.457, "step": 8775 }, { "epoch": 2.62451535937966, "grad_norm": 36.58905792236328, "learning_rate": 4.972893083428329e-05, "loss": 0.464, "step": 8800 }, { "epoch": 2.6319713689233524, "grad_norm": 32.43028259277344, "learning_rate": 4.971236181437396e-05, "loss": 0.437, "step": 8825 }, { "epoch": 2.6394273784670443, "grad_norm": 3.9236373901367188, "learning_rate": 4.969579279446462e-05, "loss": 0.3969, "step": 8850 }, { "epoch": 2.6468833880107367, "grad_norm": 20.850723266601562, "learning_rate": 4.967922377455529e-05, "loss": 0.4677, "step": 8875 }, { "epoch": 2.6543393975544287, "grad_norm": 37.82166290283203, "learning_rate": 4.9662654754645955e-05, "loss": 0.442, "step": 8900 }, { "epoch": 2.661795407098121, "grad_norm": 65.03778076171875, "learning_rate": 4.964608573473662e-05, "loss": 0.39, "step": 8925 }, { "epoch": 2.6692514166418135, "grad_norm": 3.3093178272247314, "learning_rate": 4.9629516714827286e-05, "loss": 0.3141, "step": 8950 }, { "epoch": 2.6767074261855055, "grad_norm": 0.09177004545927048, "learning_rate": 4.9612947694917955e-05, "loss": 0.2913, "step": 8975 }, { "epoch": 2.6841634357291975, "grad_norm": 0.9577639698982239, "learning_rate": 4.959637867500862e-05, "loss": 0.2738, "step": 9000 }, { "epoch": 2.69161944527289, "grad_norm": 26.145336151123047, "learning_rate": 4.957980965509929e-05, "loss": 0.1834, "step": 9025 }, { "epoch": 2.6990754548165823, "grad_norm": 0.26785144209861755, "learning_rate": 
4.956324063518995e-05, "loss": 0.4066, "step": 9050 }, { "epoch": 2.7065314643602743, "grad_norm": 10.481317520141602, "learning_rate": 4.954667161528062e-05, "loss": 0.2305, "step": 9075 }, { "epoch": 2.7139874739039667, "grad_norm": 13.711938858032227, "learning_rate": 4.953010259537128e-05, "loss": 0.3086, "step": 9100 }, { "epoch": 2.7214434834476586, "grad_norm": 28.684236526489258, "learning_rate": 4.951353357546194e-05, "loss": 0.4683, "step": 9125 }, { "epoch": 2.728899492991351, "grad_norm": 21.328371047973633, "learning_rate": 4.949696455555261e-05, "loss": 0.5316, "step": 9150 }, { "epoch": 2.7363555025350434, "grad_norm": 0.19380249083042145, "learning_rate": 4.9480395535643274e-05, "loss": 0.3556, "step": 9175 }, { "epoch": 2.7438115120787354, "grad_norm": 1.5455032587051392, "learning_rate": 4.9463826515733943e-05, "loss": 0.3856, "step": 9200 }, { "epoch": 2.751267521622428, "grad_norm": 11.467935562133789, "learning_rate": 4.944725749582461e-05, "loss": 0.2703, "step": 9225 }, { "epoch": 2.75872353116612, "grad_norm": 49.632301330566406, "learning_rate": 4.9430688475915275e-05, "loss": 0.4603, "step": 9250 }, { "epoch": 2.766179540709812, "grad_norm": 1.1420247554779053, "learning_rate": 4.9414119456005944e-05, "loss": 0.4527, "step": 9275 }, { "epoch": 2.773635550253504, "grad_norm": 29.728267669677734, "learning_rate": 4.9397550436096606e-05, "loss": 0.3515, "step": 9300 }, { "epoch": 2.7810915597971966, "grad_norm": 18.269168853759766, "learning_rate": 4.938098141618727e-05, "loss": 0.5242, "step": 9325 }, { "epoch": 2.7885475693408885, "grad_norm": 18.85061264038086, "learning_rate": 4.936441239627794e-05, "loss": 0.4382, "step": 9350 }, { "epoch": 2.796003578884581, "grad_norm": 1.695752501487732, "learning_rate": 4.93478433763686e-05, "loss": 0.3397, "step": 9375 }, { "epoch": 2.8034595884282734, "grad_norm": 12.2448148727417, "learning_rate": 4.933127435645927e-05, "loss": 0.2823, "step": 9400 }, { "epoch": 2.8109155979719653, "grad_norm": 
10.061016082763672, "learning_rate": 4.931470533654993e-05, "loss": 0.2442, "step": 9425 }, { "epoch": 2.8183716075156577, "grad_norm": 7.403264999389648, "learning_rate": 4.92981363166406e-05, "loss": 0.2298, "step": 9450 }, { "epoch": 2.8258276170593497, "grad_norm": 25.536867141723633, "learning_rate": 4.928156729673127e-05, "loss": 0.3031, "step": 9475 }, { "epoch": 2.833283626603042, "grad_norm": 32.47145462036133, "learning_rate": 4.926499827682193e-05, "loss": 0.3897, "step": 9500 }, { "epoch": 2.8407396361467345, "grad_norm": 17.720930099487305, "learning_rate": 4.92484292569126e-05, "loss": 0.4836, "step": 9525 }, { "epoch": 2.8481956456904265, "grad_norm": 21.164533615112305, "learning_rate": 4.9231860237003264e-05, "loss": 0.3634, "step": 9550 }, { "epoch": 2.8556516552341185, "grad_norm": 4.224794864654541, "learning_rate": 4.9215291217093926e-05, "loss": 0.4028, "step": 9575 }, { "epoch": 2.863107664777811, "grad_norm": 17.30121612548828, "learning_rate": 4.9198722197184595e-05, "loss": 0.363, "step": 9600 }, { "epoch": 2.8705636743215033, "grad_norm": 0.28851985931396484, "learning_rate": 4.918215317727526e-05, "loss": 0.3385, "step": 9625 }, { "epoch": 2.8780196838651952, "grad_norm": 0.16066333651542664, "learning_rate": 4.9165584157365927e-05, "loss": 0.2903, "step": 9650 }, { "epoch": 2.8854756934088877, "grad_norm": 0.3555677533149719, "learning_rate": 4.914901513745659e-05, "loss": 0.3744, "step": 9675 }, { "epoch": 2.8929317029525796, "grad_norm": 10.346195220947266, "learning_rate": 4.913244611754726e-05, "loss": 0.1898, "step": 9700 }, { "epoch": 2.900387712496272, "grad_norm": 2.9600460529327393, "learning_rate": 4.911587709763793e-05, "loss": 0.5744, "step": 9725 }, { "epoch": 2.9078437220399644, "grad_norm": 27.990514755249023, "learning_rate": 4.909930807772859e-05, "loss": 0.2074, "step": 9750 }, { "epoch": 2.9152997315836564, "grad_norm": 21.985830307006836, "learning_rate": 4.908273905781925e-05, "loss": 0.2648, "step": 9775 }, { 
"epoch": 2.9227557411273484, "grad_norm": 9.374412536621094, "learning_rate": 4.906617003790992e-05, "loss": 0.3699, "step": 9800 }, { "epoch": 2.930211750671041, "grad_norm": 42.18376541137695, "learning_rate": 4.904960101800058e-05, "loss": 0.4782, "step": 9825 }, { "epoch": 2.937667760214733, "grad_norm": 20.689416885375977, "learning_rate": 4.903303199809125e-05, "loss": 0.3298, "step": 9850 }, { "epoch": 2.945123769758425, "grad_norm": 3.0027916431427, "learning_rate": 4.9016462978181915e-05, "loss": 0.2212, "step": 9875 }, { "epoch": 2.9525797793021176, "grad_norm": 25.889209747314453, "learning_rate": 4.899989395827258e-05, "loss": 0.4582, "step": 9900 }, { "epoch": 2.9600357888458095, "grad_norm": 7.678776264190674, "learning_rate": 4.898332493836325e-05, "loss": 0.3316, "step": 9925 }, { "epoch": 2.967491798389502, "grad_norm": 20.867313385009766, "learning_rate": 4.8966755918453915e-05, "loss": 0.1974, "step": 9950 }, { "epoch": 2.9749478079331944, "grad_norm": 0.7414972186088562, "learning_rate": 4.895018689854458e-05, "loss": 0.2738, "step": 9975 }, { "epoch": 2.9824038174768863, "grad_norm": 20.320951461791992, "learning_rate": 4.893361787863525e-05, "loss": 0.4032, "step": 10000 }, { "epoch": 2.9898598270205787, "grad_norm": 16.26959800720215, "learning_rate": 4.891704885872591e-05, "loss": 0.3909, "step": 10025 }, { "epoch": 2.9973158365642707, "grad_norm": 6.262996673583984, "learning_rate": 4.890047983881658e-05, "loss": 0.2618, "step": 10050 }, { "epoch": 3.0, "eval_gen_len": 8.7299, "eval_loss": 0.32486870884895325, "eval_rouge1": 84.5703, "eval_rouge2": 68.4496, "eval_rougeL": 84.3448, "eval_rougeLsum": 84.3457, "eval_runtime": 97.2972, "eval_samples_per_second": 17.236, "eval_steps_per_second": 4.317, "step": 10059 }, { "epoch": 3.004771846107963, "grad_norm": 15.485138893127441, "learning_rate": 4.888391081890724e-05, "loss": 0.1829, "step": 10075 }, { "epoch": 3.012227855651655, "grad_norm": 27.907909393310547, "learning_rate": 
4.886734179899791e-05, "loss": 0.2219, "step": 10100 }, { "epoch": 3.0196838651953475, "grad_norm": 21.688621520996094, "learning_rate": 4.885077277908857e-05, "loss": 0.3546, "step": 10125 }, { "epoch": 3.0271398747390394, "grad_norm": 13.243431091308594, "learning_rate": 4.8834203759179234e-05, "loss": 0.2267, "step": 10150 }, { "epoch": 3.034595884282732, "grad_norm": 3.4912607669830322, "learning_rate": 4.881763473926991e-05, "loss": 0.2021, "step": 10175 }, { "epoch": 3.0420518938264243, "grad_norm": 24.04950714111328, "learning_rate": 4.880106571936057e-05, "loss": 0.2818, "step": 10200 }, { "epoch": 3.0495079033701162, "grad_norm": 7.100139141082764, "learning_rate": 4.8784496699451235e-05, "loss": 0.3174, "step": 10225 }, { "epoch": 3.0569639129138086, "grad_norm": 4.30832576751709, "learning_rate": 4.8767927679541904e-05, "loss": 0.2278, "step": 10250 }, { "epoch": 3.0644199224575006, "grad_norm": 18.782655715942383, "learning_rate": 4.8751358659632566e-05, "loss": 0.2695, "step": 10275 }, { "epoch": 3.071875932001193, "grad_norm": 33.604618072509766, "learning_rate": 4.8734789639723236e-05, "loss": 0.2521, "step": 10300 }, { "epoch": 3.079331941544885, "grad_norm": 3.4777698516845703, "learning_rate": 4.87182206198139e-05, "loss": 0.1609, "step": 10325 }, { "epoch": 3.0867879510885774, "grad_norm": 0.6923168301582336, "learning_rate": 4.870165159990456e-05, "loss": 0.2375, "step": 10350 }, { "epoch": 3.09424396063227, "grad_norm": 2.32352352142334, "learning_rate": 4.868508257999523e-05, "loss": 0.1933, "step": 10375 }, { "epoch": 3.1016999701759618, "grad_norm": 0.3760017156600952, "learning_rate": 4.86685135600859e-05, "loss": 0.159, "step": 10400 }, { "epoch": 3.109155979719654, "grad_norm": 18.588369369506836, "learning_rate": 4.865194454017656e-05, "loss": 0.276, "step": 10425 }, { "epoch": 3.116611989263346, "grad_norm": 22.050600051879883, "learning_rate": 4.863537552026723e-05, "loss": 0.2279, "step": 10450 }, { "epoch": 3.1240679988070386, 
"grad_norm": 63.09767150878906, "learning_rate": 4.861880650035789e-05, "loss": 0.2141, "step": 10475 }, { "epoch": 3.1315240083507305, "grad_norm": 11.172486305236816, "learning_rate": 4.860223748044856e-05, "loss": 0.1474, "step": 10500 }, { "epoch": 3.138980017894423, "grad_norm": 13.7357816696167, "learning_rate": 4.8585668460539224e-05, "loss": 0.2779, "step": 10525 }, { "epoch": 3.1464360274381153, "grad_norm": 13.144248962402344, "learning_rate": 4.8569099440629886e-05, "loss": 0.2409, "step": 10550 }, { "epoch": 3.1538920369818073, "grad_norm": 0.10206674039363861, "learning_rate": 4.8552530420720555e-05, "loss": 0.3604, "step": 10575 }, { "epoch": 3.1613480465254997, "grad_norm": 13.896751403808594, "learning_rate": 4.853596140081122e-05, "loss": 0.3053, "step": 10600 }, { "epoch": 3.1688040560691917, "grad_norm": 4.51378870010376, "learning_rate": 4.851939238090189e-05, "loss": 0.195, "step": 10625 }, { "epoch": 3.176260065612884, "grad_norm": 41.94772720336914, "learning_rate": 4.8502823360992556e-05, "loss": 0.2235, "step": 10650 }, { "epoch": 3.183716075156576, "grad_norm": 0.20179374516010284, "learning_rate": 4.848625434108322e-05, "loss": 0.1965, "step": 10675 }, { "epoch": 3.1911720847002685, "grad_norm": 0.6293454766273499, "learning_rate": 4.846968532117389e-05, "loss": 0.1513, "step": 10700 }, { "epoch": 3.1986280942439604, "grad_norm": 3.8151888847351074, "learning_rate": 4.845311630126455e-05, "loss": 0.2603, "step": 10725 }, { "epoch": 3.206084103787653, "grad_norm": 53.60226058959961, "learning_rate": 4.843654728135522e-05, "loss": 0.212, "step": 10750 }, { "epoch": 3.2135401133313453, "grad_norm": 39.32175827026367, "learning_rate": 4.841997826144588e-05, "loss": 0.2588, "step": 10775 }, { "epoch": 3.2209961228750372, "grad_norm": 4.778318881988525, "learning_rate": 4.8403409241536543e-05, "loss": 0.2521, "step": 10800 }, { "epoch": 3.2284521324187296, "grad_norm": 16.796152114868164, "learning_rate": 4.838684022162721e-05, "loss": 0.1909, 
"step": 10825 }, { "epoch": 3.2359081419624216, "grad_norm": 0.5204095840454102, "learning_rate": 4.8370271201717875e-05, "loss": 0.1942, "step": 10850 }, { "epoch": 3.243364151506114, "grad_norm": 15.695696830749512, "learning_rate": 4.8353702181808544e-05, "loss": 0.2888, "step": 10875 }, { "epoch": 3.250820161049806, "grad_norm": 24.48253059387207, "learning_rate": 4.833713316189921e-05, "loss": 0.3286, "step": 10900 }, { "epoch": 3.2582761705934984, "grad_norm": 0.22145067155361176, "learning_rate": 4.8320564141989875e-05, "loss": 0.2459, "step": 10925 }, { "epoch": 3.2657321801371904, "grad_norm": 23.265213012695312, "learning_rate": 4.8303995122080545e-05, "loss": 0.273, "step": 10950 }, { "epoch": 3.2731881896808828, "grad_norm": 14.237789154052734, "learning_rate": 4.828742610217121e-05, "loss": 0.1217, "step": 10975 }, { "epoch": 3.280644199224575, "grad_norm": 11.19775676727295, "learning_rate": 4.827085708226187e-05, "loss": 0.2509, "step": 11000 }, { "epoch": 3.288100208768267, "grad_norm": 12.824684143066406, "learning_rate": 4.825428806235254e-05, "loss": 0.2533, "step": 11025 }, { "epoch": 3.2955562183119596, "grad_norm": 0.6048849821090698, "learning_rate": 4.82377190424432e-05, "loss": 0.2427, "step": 11050 }, { "epoch": 3.3030122278556515, "grad_norm": 18.854202270507812, "learning_rate": 4.822115002253387e-05, "loss": 0.2239, "step": 11075 }, { "epoch": 3.310468237399344, "grad_norm": 1.0037951469421387, "learning_rate": 4.820458100262453e-05, "loss": 0.2323, "step": 11100 }, { "epoch": 3.317924246943036, "grad_norm": 12.175690650939941, "learning_rate": 4.81880119827152e-05, "loss": 0.1916, "step": 11125 }, { "epoch": 3.3253802564867283, "grad_norm": 3.9193851947784424, "learning_rate": 4.817144296280587e-05, "loss": 0.1596, "step": 11150 }, { "epoch": 3.3328362660304203, "grad_norm": 1.2663558721542358, "learning_rate": 4.815487394289653e-05, "loss": 0.2768, "step": 11175 }, { "epoch": 3.3402922755741127, "grad_norm": 1.7071452140808105, 
"learning_rate": 4.81383049229872e-05, "loss": 0.33, "step": 11200 }, { "epoch": 3.347748285117805, "grad_norm": 27.484474182128906, "learning_rate": 4.8121735903077864e-05, "loss": 0.4431, "step": 11225 }, { "epoch": 3.355204294661497, "grad_norm": 30.908485412597656, "learning_rate": 4.8105166883168527e-05, "loss": 0.1375, "step": 11250 }, { "epoch": 3.3626603042051895, "grad_norm": 15.244987487792969, "learning_rate": 4.8088597863259196e-05, "loss": 0.1436, "step": 11275 }, { "epoch": 3.3701163137488814, "grad_norm": 5.141242504119873, "learning_rate": 4.807202884334986e-05, "loss": 0.2911, "step": 11300 }, { "epoch": 3.377572323292574, "grad_norm": 5.202120304107666, "learning_rate": 4.805545982344053e-05, "loss": 0.1735, "step": 11325 }, { "epoch": 3.3850283328362663, "grad_norm": 0.155669167637825, "learning_rate": 4.803889080353119e-05, "loss": 0.1395, "step": 11350 }, { "epoch": 3.392484342379958, "grad_norm": 37.31657791137695, "learning_rate": 4.802232178362186e-05, "loss": 0.2422, "step": 11375 }, { "epoch": 3.3999403519236506, "grad_norm": 1.6722973585128784, "learning_rate": 4.800575276371253e-05, "loss": 0.1709, "step": 11400 }, { "epoch": 3.4073963614673426, "grad_norm": 41.64527130126953, "learning_rate": 4.798918374380319e-05, "loss": 0.2258, "step": 11425 }, { "epoch": 3.414852371011035, "grad_norm": 12.723953247070312, "learning_rate": 4.797261472389385e-05, "loss": 0.1889, "step": 11450 }, { "epoch": 3.422308380554727, "grad_norm": 35.9575080871582, "learning_rate": 4.795604570398452e-05, "loss": 0.18, "step": 11475 }, { "epoch": 3.4297643900984194, "grad_norm": 1.8803658485412598, "learning_rate": 4.7939476684075184e-05, "loss": 0.3035, "step": 11500 }, { "epoch": 3.4372203996421113, "grad_norm": 0.22066275775432587, "learning_rate": 4.792290766416585e-05, "loss": 0.1552, "step": 11525 }, { "epoch": 3.4446764091858038, "grad_norm": 2.5760536193847656, "learning_rate": 4.7906338644256515e-05, "loss": 0.1945, "step": 11550 }, { "epoch": 
3.452132418729496, "grad_norm": 19.55931282043457, "learning_rate": 4.788976962434718e-05, "loss": 0.3879, "step": 11575 }, { "epoch": 3.459588428273188, "grad_norm": 0.1631491780281067, "learning_rate": 4.7873200604437854e-05, "loss": 0.184, "step": 11600 }, { "epoch": 3.4670444378168805, "grad_norm": 16.460819244384766, "learning_rate": 4.7856631584528516e-05, "loss": 0.2637, "step": 11625 }, { "epoch": 3.4745004473605725, "grad_norm": 14.154874801635742, "learning_rate": 4.784006256461918e-05, "loss": 0.2471, "step": 11650 }, { "epoch": 3.481956456904265, "grad_norm": 28.12816047668457, "learning_rate": 4.782349354470985e-05, "loss": 0.2432, "step": 11675 }, { "epoch": 3.489412466447957, "grad_norm": 4.544245719909668, "learning_rate": 4.780692452480051e-05, "loss": 0.1726, "step": 11700 }, { "epoch": 3.4968684759916493, "grad_norm": 37.361698150634766, "learning_rate": 4.779035550489118e-05, "loss": 0.2676, "step": 11725 }, { "epoch": 3.5043244855353413, "grad_norm": 0.2849110960960388, "learning_rate": 4.777378648498184e-05, "loss": 0.1267, "step": 11750 }, { "epoch": 3.5117804950790337, "grad_norm": 13.191654205322266, "learning_rate": 4.7757217465072504e-05, "loss": 0.261, "step": 11775 }, { "epoch": 3.519236504622726, "grad_norm": 17.304588317871094, "learning_rate": 4.774064844516317e-05, "loss": 0.1624, "step": 11800 }, { "epoch": 3.526692514166418, "grad_norm": 6.902221202850342, "learning_rate": 4.7724079425253835e-05, "loss": 0.2299, "step": 11825 }, { "epoch": 3.5341485237101105, "grad_norm": 29.891098022460938, "learning_rate": 4.770751040534451e-05, "loss": 0.2008, "step": 11850 }, { "epoch": 3.5416045332538024, "grad_norm": 1.6863594055175781, "learning_rate": 4.769094138543517e-05, "loss": 0.1565, "step": 11875 }, { "epoch": 3.549060542797495, "grad_norm": 16.029647827148438, "learning_rate": 4.7674372365525836e-05, "loss": 0.2082, "step": 11900 }, { "epoch": 3.5565165523411872, "grad_norm": 5.520860195159912, "learning_rate": 
4.7657803345616505e-05, "loss": 0.2113, "step": 11925 }, { "epoch": 3.563972561884879, "grad_norm": 10.906929016113281, "learning_rate": 4.764123432570717e-05, "loss": 0.2652, "step": 11950 }, { "epoch": 3.571428571428571, "grad_norm": 17.15738868713379, "learning_rate": 4.7624665305797836e-05, "loss": 0.1654, "step": 11975 }, { "epoch": 3.5788845809722636, "grad_norm": 3.8852152824401855, "learning_rate": 4.76080962858885e-05, "loss": 0.3021, "step": 12000 }, { "epoch": 3.586340590515956, "grad_norm": 3.2705817222595215, "learning_rate": 4.759152726597916e-05, "loss": 0.2207, "step": 12025 }, { "epoch": 3.593796600059648, "grad_norm": 2.1786513328552246, "learning_rate": 4.757495824606983e-05, "loss": 0.2425, "step": 12050 }, { "epoch": 3.6012526096033404, "grad_norm": 1.1399081945419312, "learning_rate": 4.75583892261605e-05, "loss": 0.1534, "step": 12075 }, { "epoch": 3.6087086191470323, "grad_norm": 56.90084457397461, "learning_rate": 4.754182020625116e-05, "loss": 0.2596, "step": 12100 }, { "epoch": 3.6161646286907247, "grad_norm": 14.946979522705078, "learning_rate": 4.752525118634183e-05, "loss": 0.3018, "step": 12125 }, { "epoch": 3.623620638234417, "grad_norm": 4.24704122543335, "learning_rate": 4.750868216643249e-05, "loss": 0.2398, "step": 12150 }, { "epoch": 3.631076647778109, "grad_norm": 1.877937912940979, "learning_rate": 4.749211314652316e-05, "loss": 0.2393, "step": 12175 }, { "epoch": 3.6385326573218015, "grad_norm": 13.878131866455078, "learning_rate": 4.7475544126613824e-05, "loss": 0.269, "step": 12200 }, { "epoch": 3.6459886668654935, "grad_norm": 54.240055084228516, "learning_rate": 4.745897510670449e-05, "loss": 0.1789, "step": 12225 }, { "epoch": 3.653444676409186, "grad_norm": 50.91316604614258, "learning_rate": 4.7442406086795156e-05, "loss": 0.2266, "step": 12250 }, { "epoch": 3.660900685952878, "grad_norm": null, "learning_rate": 4.7426499827682196e-05, "loss": 0.2616, "step": 12275 }, { "epoch": 3.6683566954965703, "grad_norm": 
18.687328338623047, "learning_rate": 4.7409930807772865e-05, "loss": 0.2016, "step": 12300 }, { "epoch": 3.6758127050402623, "grad_norm": 0.21909315884113312, "learning_rate": 4.739336178786353e-05, "loss": 0.1519, "step": 12325 }, { "epoch": 3.6832687145839547, "grad_norm": 31.093547821044922, "learning_rate": 4.737679276795419e-05, "loss": 0.1486, "step": 12350 }, { "epoch": 3.690724724127647, "grad_norm": 19.91496467590332, "learning_rate": 4.736022374804486e-05, "loss": 0.1421, "step": 12375 }, { "epoch": 3.698180733671339, "grad_norm": 0.031140485778450966, "learning_rate": 4.734365472813552e-05, "loss": 0.174, "step": 12400 }, { "epoch": 3.7056367432150314, "grad_norm": 0.5245652198791504, "learning_rate": 4.732708570822619e-05, "loss": 0.1967, "step": 12425 }, { "epoch": 3.7130927527587234, "grad_norm": 9.509659767150879, "learning_rate": 4.731051668831685e-05, "loss": 0.3401, "step": 12450 }, { "epoch": 3.720548762302416, "grad_norm": 0.24189208447933197, "learning_rate": 4.729394766840752e-05, "loss": 0.1994, "step": 12475 }, { "epoch": 3.7280047718461082, "grad_norm": 7.9083733558654785, "learning_rate": 4.727737864849819e-05, "loss": 0.165, "step": 12500 }, { "epoch": 3.7354607813898, "grad_norm": 13.54222297668457, "learning_rate": 4.7260809628588854e-05, "loss": 0.2339, "step": 12525 }, { "epoch": 3.742916790933492, "grad_norm": 0.5903582572937012, "learning_rate": 4.7244240608679516e-05, "loss": 0.2427, "step": 12550 }, { "epoch": 3.7503728004771846, "grad_norm": 2.730581521987915, "learning_rate": 4.7227671588770185e-05, "loss": 0.1408, "step": 12575 }, { "epoch": 3.757828810020877, "grad_norm": 0.4194844365119934, "learning_rate": 4.721110256886085e-05, "loss": 0.2342, "step": 12600 }, { "epoch": 3.765284819564569, "grad_norm": 20.00274658203125, "learning_rate": 4.7194533548951517e-05, "loss": 0.0921, "step": 12625 }, { "epoch": 3.7727408291082614, "grad_norm": 0.035322315990924835, "learning_rate": 4.717796452904218e-05, "loss": 0.1742, "step": 
12650 }, { "epoch": 3.7801968386519533, "grad_norm": 15.161859512329102, "learning_rate": 4.716139550913284e-05, "loss": 0.185, "step": 12675 }, { "epoch": 3.7876528481956457, "grad_norm": 13.298684120178223, "learning_rate": 4.714482648922351e-05, "loss": 0.1626, "step": 12700 }, { "epoch": 3.795108857739338, "grad_norm": 14.108867645263672, "learning_rate": 4.712825746931418e-05, "loss": 0.2064, "step": 12725 }, { "epoch": 3.80256486728303, "grad_norm": 6.5551886558532715, "learning_rate": 4.711168844940484e-05, "loss": 0.1743, "step": 12750 }, { "epoch": 3.810020876826722, "grad_norm": 1.9325826168060303, "learning_rate": 4.709511942949551e-05, "loss": 0.2071, "step": 12775 }, { "epoch": 3.8174768863704145, "grad_norm": 9.065774917602539, "learning_rate": 4.707855040958617e-05, "loss": 0.2417, "step": 12800 }, { "epoch": 3.824932895914107, "grad_norm": 16.085844039916992, "learning_rate": 4.706198138967684e-05, "loss": 0.1279, "step": 12825 }, { "epoch": 3.832388905457799, "grad_norm": 19.756853103637695, "learning_rate": 4.7045412369767505e-05, "loss": 0.2925, "step": 12850 }, { "epoch": 3.8398449150014913, "grad_norm": 0.015781521797180176, "learning_rate": 4.702884334985817e-05, "loss": 0.2662, "step": 12875 }, { "epoch": 3.8473009245451832, "grad_norm": 0.3410816192626953, "learning_rate": 4.7012274329948836e-05, "loss": 0.2415, "step": 12900 }, { "epoch": 3.8547569340888757, "grad_norm": 10.625577926635742, "learning_rate": 4.69957053100395e-05, "loss": 0.1441, "step": 12925 }, { "epoch": 3.862212943632568, "grad_norm": 28.55473518371582, "learning_rate": 4.697913629013017e-05, "loss": 0.2246, "step": 12950 }, { "epoch": 3.86966895317626, "grad_norm": 18.04960060119629, "learning_rate": 4.696256727022084e-05, "loss": 0.1946, "step": 12975 }, { "epoch": 3.8771249627199524, "grad_norm": 12.809388160705566, "learning_rate": 4.69459982503115e-05, "loss": 0.1689, "step": 13000 }, { "epoch": 3.8845809722636444, "grad_norm": 12.735445022583008, "learning_rate": 
4.692942923040217e-05, "loss": 0.3456, "step": 13025 }, { "epoch": 3.892036981807337, "grad_norm": 13.631649017333984, "learning_rate": 4.691286021049283e-05, "loss": 0.1507, "step": 13050 }, { "epoch": 3.899492991351029, "grad_norm": 6.858339786529541, "learning_rate": 4.68962911905835e-05, "loss": 0.1571, "step": 13075 }, { "epoch": 3.906949000894721, "grad_norm": 0.8234581351280212, "learning_rate": 4.687972217067416e-05, "loss": 0.0485, "step": 13100 }, { "epoch": 3.914405010438413, "grad_norm": 54.14894104003906, "learning_rate": 4.6863153150764824e-05, "loss": 0.1594, "step": 13125 }, { "epoch": 3.9218610199821056, "grad_norm": 0.19911998510360718, "learning_rate": 4.6846584130855493e-05, "loss": 0.2208, "step": 13150 }, { "epoch": 3.929317029525798, "grad_norm": 12.671158790588379, "learning_rate": 4.6830015110946156e-05, "loss": 0.1298, "step": 13175 }, { "epoch": 3.93677303906949, "grad_norm": 5.566137313842773, "learning_rate": 4.6813446091036825e-05, "loss": 0.2675, "step": 13200 }, { "epoch": 3.9442290486131824, "grad_norm": 3.0645034313201904, "learning_rate": 4.6796877071127494e-05, "loss": 0.1215, "step": 13225 }, { "epoch": 3.9516850581568743, "grad_norm": 0.5267123579978943, "learning_rate": 4.6780308051218156e-05, "loss": 0.1125, "step": 13250 }, { "epoch": 3.9591410677005667, "grad_norm": 6.624309062957764, "learning_rate": 4.6763739031308826e-05, "loss": 0.197, "step": 13275 }, { "epoch": 3.966597077244259, "grad_norm": 15.308833122253418, "learning_rate": 4.674717001139949e-05, "loss": 0.0796, "step": 13300 }, { "epoch": 3.974053086787951, "grad_norm": 1.9672225713729858, "learning_rate": 4.673060099149015e-05, "loss": 0.3087, "step": 13325 }, { "epoch": 3.981509096331643, "grad_norm": 9.752317428588867, "learning_rate": 4.671403197158082e-05, "loss": 0.1106, "step": 13350 }, { "epoch": 3.9889651058753355, "grad_norm": 69.8141860961914, "learning_rate": 4.669746295167148e-05, "loss": 0.1566, "step": 13375 }, { "epoch": 3.996421115419028, 
"grad_norm": 18.129182815551758, "learning_rate": 4.668089393176215e-05, "loss": 0.1422, "step": 13400 }, { "epoch": 4.0, "eval_gen_len": 8.8205, "eval_loss": 0.22952935099601746, "eval_rouge1": 89.3626, "eval_rouge2": 74.5509, "eval_rougeL": 89.1898, "eval_rougeLsum": 89.2039, "eval_runtime": 105.2444, "eval_samples_per_second": 15.934, "eval_steps_per_second": 3.991, "step": 13412 }, { "epoch": 4.00387712496272, "grad_norm": 4.1664204597473145, "learning_rate": 4.666432491185281e-05, "loss": 0.0988, "step": 13425 }, { "epoch": 4.011333134506412, "grad_norm": 1.5676534175872803, "learning_rate": 4.664775589194348e-05, "loss": 0.0775, "step": 13450 }, { "epoch": 4.018789144050104, "grad_norm": 1.9181686639785767, "learning_rate": 4.663118687203415e-05, "loss": 0.1013, "step": 13475 }, { "epoch": 4.026245153593797, "grad_norm": 37.71769332885742, "learning_rate": 4.6614617852124814e-05, "loss": 0.1236, "step": 13500 }, { "epoch": 4.033701163137489, "grad_norm": 12.9027099609375, "learning_rate": 4.659804883221548e-05, "loss": 0.2319, "step": 13525 }, { "epoch": 4.0411571726811815, "grad_norm": 0.3448346257209778, "learning_rate": 4.6581479812306145e-05, "loss": 0.1614, "step": 13550 }, { "epoch": 4.048613182224873, "grad_norm": 0.2613756060600281, "learning_rate": 4.656491079239681e-05, "loss": 0.0753, "step": 13575 }, { "epoch": 4.056069191768565, "grad_norm": 12.620153427124023, "learning_rate": 4.6549004533283855e-05, "loss": 0.1857, "step": 13600 }, { "epoch": 4.063525201312258, "grad_norm": 49.89236068725586, "learning_rate": 4.653243551337452e-05, "loss": 0.2494, "step": 13625 }, { "epoch": 4.07098121085595, "grad_norm": 0.03416460007429123, "learning_rate": 4.651586649346518e-05, "loss": 0.1864, "step": 13650 }, { "epoch": 4.078437220399642, "grad_norm": 23.923843383789062, "learning_rate": 4.649929747355585e-05, "loss": 0.1432, "step": 13675 }, { "epoch": 4.085893229943334, "grad_norm": 26.50322723388672, "learning_rate": 4.648272845364651e-05, "loss": 
0.0544, "step": 13700 }, { "epoch": 4.093349239487027, "grad_norm": 2.477444887161255, "learning_rate": 4.646615943373718e-05, "loss": 0.0846, "step": 13725 }, { "epoch": 4.100805249030719, "grad_norm": 4.058847904205322, "learning_rate": 4.644959041382784e-05, "loss": 0.1216, "step": 13750 }, { "epoch": 4.108261258574411, "grad_norm": 22.271705627441406, "learning_rate": 4.6433021393918505e-05, "loss": 0.1599, "step": 13775 }, { "epoch": 4.115717268118103, "grad_norm": 14.830306053161621, "learning_rate": 4.6416452374009174e-05, "loss": 0.1047, "step": 13800 }, { "epoch": 4.123173277661795, "grad_norm": 20.900365829467773, "learning_rate": 4.6399883354099836e-05, "loss": 0.2165, "step": 13825 }, { "epoch": 4.130629287205488, "grad_norm": 0.04809415712952614, "learning_rate": 4.6383314334190505e-05, "loss": 0.0797, "step": 13850 }, { "epoch": 4.13808529674918, "grad_norm": 2.7082746028900146, "learning_rate": 4.6366745314281174e-05, "loss": 0.2539, "step": 13875 }, { "epoch": 4.145541306292872, "grad_norm": 5.5213823318481445, "learning_rate": 4.635017629437184e-05, "loss": 0.0967, "step": 13900 }, { "epoch": 4.152997315836564, "grad_norm": 0.5523321628570557, "learning_rate": 4.6333607274462506e-05, "loss": 0.115, "step": 13925 }, { "epoch": 4.1604533253802565, "grad_norm": 2.0624325275421143, "learning_rate": 4.631703825455317e-05, "loss": 0.1811, "step": 13950 }, { "epoch": 4.167909334923949, "grad_norm": 0.551288366317749, "learning_rate": 4.630046923464384e-05, "loss": 0.1378, "step": 13975 }, { "epoch": 4.175365344467641, "grad_norm": 0.5739544034004211, "learning_rate": 4.62839002147345e-05, "loss": 0.0912, "step": 14000 }, { "epoch": 4.182821354011333, "grad_norm": 23.730623245239258, "learning_rate": 4.626733119482516e-05, "loss": 0.2853, "step": 14025 }, { "epoch": 4.190277363555025, "grad_norm": 10.519808769226074, "learning_rate": 4.625076217491583e-05, "loss": 0.1026, "step": 14050 }, { "epoch": 4.197733373098718, "grad_norm": 0.21397335827350616, 
"learning_rate": 4.62341931550065e-05, "loss": 0.0714, "step": 14075 }, { "epoch": 4.20518938264241, "grad_norm": 1.646802306175232, "learning_rate": 4.621762413509716e-05, "loss": 0.1204, "step": 14100 }, { "epoch": 4.2126453921861025, "grad_norm": 11.778765678405762, "learning_rate": 4.620105511518783e-05, "loss": 0.092, "step": 14125 }, { "epoch": 4.220101401729794, "grad_norm": 22.594240188598633, "learning_rate": 4.6184486095278494e-05, "loss": 0.1312, "step": 14150 }, { "epoch": 4.227557411273486, "grad_norm": 0.8660998940467834, "learning_rate": 4.616791707536916e-05, "loss": 0.1609, "step": 14175 }, { "epoch": 4.235013420817179, "grad_norm": 31.39341926574707, "learning_rate": 4.6151348055459826e-05, "loss": 0.2667, "step": 14200 }, { "epoch": 4.242469430360871, "grad_norm": 0.04638830944895744, "learning_rate": 4.613477903555049e-05, "loss": 0.1241, "step": 14225 }, { "epoch": 4.249925439904563, "grad_norm": 1.1062257289886475, "learning_rate": 4.611821001564116e-05, "loss": 0.1065, "step": 14250 }, { "epoch": 4.257381449448255, "grad_norm": 1.4014554023742676, "learning_rate": 4.610164099573182e-05, "loss": 0.117, "step": 14275 }, { "epoch": 4.2648374589919475, "grad_norm": 10.264939308166504, "learning_rate": 4.608507197582249e-05, "loss": 0.1761, "step": 14300 }, { "epoch": 4.27229346853564, "grad_norm": 5.444455623626709, "learning_rate": 4.606850295591316e-05, "loss": 0.1822, "step": 14325 }, { "epoch": 4.2797494780793315, "grad_norm": 0.25484806299209595, "learning_rate": 4.605193393600382e-05, "loss": 0.0569, "step": 14350 }, { "epoch": 4.287205487623024, "grad_norm": 13.141998291015625, "learning_rate": 4.603536491609449e-05, "loss": 0.1328, "step": 14375 }, { "epoch": 4.294661497166716, "grad_norm": 0.047426436096429825, "learning_rate": 4.601879589618515e-05, "loss": 0.1656, "step": 14400 }, { "epoch": 4.302117506710409, "grad_norm": 14.1134614944458, "learning_rate": 4.6002226876275814e-05, "loss": 0.1843, "step": 14425 }, { "epoch": 
4.309573516254101, "grad_norm": 0.0767190232872963, "learning_rate": 4.598565785636648e-05, "loss": 0.1311, "step": 14450 }, { "epoch": 4.317029525797793, "grad_norm": 1.5797250270843506, "learning_rate": 4.5969088836457145e-05, "loss": 0.1161, "step": 14475 }, { "epoch": 4.324485535341485, "grad_norm": 0.14433561265468597, "learning_rate": 4.5952519816547814e-05, "loss": 0.1597, "step": 14500 }, { "epoch": 4.3319415448851775, "grad_norm": 18.34807014465332, "learning_rate": 4.593595079663848e-05, "loss": 0.1026, "step": 14525 }, { "epoch": 4.33939755442887, "grad_norm": 0.033856164664030075, "learning_rate": 4.5919381776729146e-05, "loss": 0.1124, "step": 14550 }, { "epoch": 4.346853563972562, "grad_norm": 3.9369986057281494, "learning_rate": 4.5902812756819815e-05, "loss": 0.1223, "step": 14575 }, { "epoch": 4.354309573516254, "grad_norm": 1.2985237836837769, "learning_rate": 4.588624373691048e-05, "loss": 0.0947, "step": 14600 }, { "epoch": 4.361765583059946, "grad_norm": 0.030779710039496422, "learning_rate": 4.5869674717001146e-05, "loss": 0.1627, "step": 14625 }, { "epoch": 4.369221592603639, "grad_norm": 0.5389582514762878, "learning_rate": 4.585310569709181e-05, "loss": 0.1019, "step": 14650 }, { "epoch": 4.376677602147331, "grad_norm": 47.4750862121582, "learning_rate": 4.583653667718247e-05, "loss": 0.2532, "step": 14675 }, { "epoch": 4.384133611691023, "grad_norm": 0.20324255526065826, "learning_rate": 4.581996765727314e-05, "loss": 0.1099, "step": 14700 }, { "epoch": 4.391589621234715, "grad_norm": 8.618117332458496, "learning_rate": 4.58033986373638e-05, "loss": 0.1625, "step": 14725 }, { "epoch": 4.399045630778407, "grad_norm": 14.791823387145996, "learning_rate": 4.578682961745447e-05, "loss": 0.1611, "step": 14750 }, { "epoch": 4.4065016403221, "grad_norm": 1.8301466703414917, "learning_rate": 4.5770260597545134e-05, "loss": 0.1291, "step": 14775 }, { "epoch": 4.413957649865792, "grad_norm": 0.02634044922888279, "learning_rate": 
4.57536915776358e-05, "loss": 0.0426, "step": 14800 }, { "epoch": 4.421413659409484, "grad_norm": 8.729630470275879, "learning_rate": 4.573712255772647e-05, "loss": 0.1576, "step": 14825 }, { "epoch": 4.428869668953176, "grad_norm": 8.50086498260498, "learning_rate": 4.5720553537817135e-05, "loss": 0.2184, "step": 14850 }, { "epoch": 4.4363256784968685, "grad_norm": 2.6643874645233154, "learning_rate": 4.57039845179078e-05, "loss": 0.1499, "step": 14875 }, { "epoch": 4.443781688040561, "grad_norm": 0.253903329372406, "learning_rate": 4.5687415497998466e-05, "loss": 0.1432, "step": 14900 }, { "epoch": 4.4512376975842525, "grad_norm": 6.4706573486328125, "learning_rate": 4.567084647808913e-05, "loss": 0.1689, "step": 14925 }, { "epoch": 4.458693707127945, "grad_norm": 3.9063432216644287, "learning_rate": 4.56542774581798e-05, "loss": 0.1393, "step": 14950 }, { "epoch": 4.466149716671637, "grad_norm": 5.083062648773193, "learning_rate": 4.563770843827046e-05, "loss": 0.0821, "step": 14975 }, { "epoch": 4.47360572621533, "grad_norm": 31.4875545501709, "learning_rate": 4.562113941836112e-05, "loss": 0.2829, "step": 15000 }, { "epoch": 4.481061735759022, "grad_norm": 0.055298592895269394, "learning_rate": 4.560457039845179e-05, "loss": 0.0949, "step": 15025 }, { "epoch": 4.488517745302714, "grad_norm": 9.65718936920166, "learning_rate": 4.558800137854246e-05, "loss": 0.159, "step": 15050 }, { "epoch": 4.495973754846406, "grad_norm": 9.367506980895996, "learning_rate": 4.557143235863312e-05, "loss": 0.1376, "step": 15075 }, { "epoch": 4.5034297643900985, "grad_norm": 0.32751259207725525, "learning_rate": 4.555486333872379e-05, "loss": 0.1999, "step": 15100 }, { "epoch": 4.510885773933791, "grad_norm": 12.641383171081543, "learning_rate": 4.5538294318814454e-05, "loss": 0.1897, "step": 15125 }, { "epoch": 4.518341783477483, "grad_norm": 0.09187845885753632, "learning_rate": 4.552172529890512e-05, "loss": 0.111, "step": 15150 }, { "epoch": 4.525797793021175, "grad_norm": 
10.361546516418457, "learning_rate": 4.5505156278995786e-05, "loss": 0.1307, "step": 15175 }, { "epoch": 4.533253802564867, "grad_norm": 6.859467506408691, "learning_rate": 4.5488587259086455e-05, "loss": 0.0639, "step": 15200 }, { "epoch": 4.54070981210856, "grad_norm": 9.48729419708252, "learning_rate": 4.547201823917712e-05, "loss": 0.1368, "step": 15225 }, { "epoch": 4.548165821652252, "grad_norm": 0.25108686089515686, "learning_rate": 4.545544921926778e-05, "loss": 0.1616, "step": 15250 }, { "epoch": 4.5556218311959435, "grad_norm": 33.95174026489258, "learning_rate": 4.5438880199358455e-05, "loss": 0.1568, "step": 15275 }, { "epoch": 4.563077840739636, "grad_norm": 5.364670753479004, "learning_rate": 4.542231117944912e-05, "loss": 0.1048, "step": 15300 }, { "epoch": 4.570533850283328, "grad_norm": 1.4906331300735474, "learning_rate": 4.540574215953978e-05, "loss": 0.1542, "step": 15325 }, { "epoch": 4.577989859827021, "grad_norm": 21.04407501220703, "learning_rate": 4.538917313963045e-05, "loss": 0.0685, "step": 15350 }, { "epoch": 4.585445869370712, "grad_norm": 11.777868270874023, "learning_rate": 4.537260411972111e-05, "loss": 0.1628, "step": 15375 }, { "epoch": 4.592901878914405, "grad_norm": 13.60123348236084, "learning_rate": 4.535603509981178e-05, "loss": 0.19, "step": 15400 }, { "epoch": 4.600357888458097, "grad_norm": 12.014949798583984, "learning_rate": 4.533946607990244e-05, "loss": 0.1144, "step": 15425 }, { "epoch": 4.6078138980017895, "grad_norm": 21.47185516357422, "learning_rate": 4.5322897059993105e-05, "loss": 0.1211, "step": 15450 }, { "epoch": 4.615269907545482, "grad_norm": 0.010081280022859573, "learning_rate": 4.5306328040083774e-05, "loss": 0.1062, "step": 15475 }, { "epoch": 4.6227259170891735, "grad_norm": 0.2724536657333374, "learning_rate": 4.528975902017444e-05, "loss": 0.1029, "step": 15500 }, { "epoch": 4.630181926632866, "grad_norm": 0.1282346248626709, "learning_rate": 4.5273190000265106e-05, "loss": 0.1125, "step": 15525 }, { 
"epoch": 4.637637936176558, "grad_norm": 8.942920684814453, "learning_rate": 4.5256620980355775e-05, "loss": 0.1662, "step": 15550 }, { "epoch": 4.645093945720251, "grad_norm": 9.61270809173584, "learning_rate": 4.524005196044644e-05, "loss": 0.0608, "step": 15575 }, { "epoch": 4.652549955263943, "grad_norm": 0.3739064931869507, "learning_rate": 4.5223482940537106e-05, "loss": 0.1371, "step": 15600 }, { "epoch": 4.660005964807635, "grad_norm": 18.457700729370117, "learning_rate": 4.520691392062777e-05, "loss": 0.1166, "step": 15625 }, { "epoch": 4.667461974351327, "grad_norm": 10.917533874511719, "learning_rate": 4.519034490071843e-05, "loss": 0.1959, "step": 15650 }, { "epoch": 4.6749179838950194, "grad_norm": 3.7748892307281494, "learning_rate": 4.51737758808091e-05, "loss": 0.076, "step": 15675 }, { "epoch": 4.682373993438712, "grad_norm": 10.774211883544922, "learning_rate": 4.515720686089976e-05, "loss": 0.1744, "step": 15700 }, { "epoch": 4.689830002982404, "grad_norm": 4.676548480987549, "learning_rate": 4.514063784099043e-05, "loss": 0.0786, "step": 15725 }, { "epoch": 4.697286012526096, "grad_norm": 0.1515558362007141, "learning_rate": 4.51240688210811e-05, "loss": 0.0936, "step": 15750 }, { "epoch": 4.704742022069788, "grad_norm": 0.661237895488739, "learning_rate": 4.510749980117176e-05, "loss": 0.1791, "step": 15775 }, { "epoch": 4.712198031613481, "grad_norm": 64.2503890991211, "learning_rate": 4.509093078126243e-05, "loss": 0.1088, "step": 15800 }, { "epoch": 4.719654041157173, "grad_norm": 6.597042560577393, "learning_rate": 4.5074361761353095e-05, "loss": 0.0908, "step": 15825 }, { "epoch": 4.7271100507008645, "grad_norm": 11.98141860961914, "learning_rate": 4.5057792741443764e-05, "loss": 0.1698, "step": 15850 }, { "epoch": 4.734566060244557, "grad_norm": 102.98745727539062, "learning_rate": 4.5041223721534426e-05, "loss": 0.1326, "step": 15875 }, { "epoch": 4.742022069788249, "grad_norm": 21.18263816833496, "learning_rate": 4.502465470162509e-05, 
"loss": 0.1476, "step": 15900 }, { "epoch": 4.749478079331942, "grad_norm": 5.757909774780273, "learning_rate": 4.500808568171576e-05, "loss": 0.0975, "step": 15925 }, { "epoch": 4.756934088875633, "grad_norm": 0.33464935421943665, "learning_rate": 4.499151666180642e-05, "loss": 0.1271, "step": 15950 }, { "epoch": 4.764390098419326, "grad_norm": 30.5048885345459, "learning_rate": 4.497494764189709e-05, "loss": 0.1143, "step": 15975 }, { "epoch": 4.771846107963018, "grad_norm": 0.5966852307319641, "learning_rate": 4.495837862198776e-05, "loss": 0.0551, "step": 16000 }, { "epoch": 4.7793021175067105, "grad_norm": 19.64974594116211, "learning_rate": 4.494180960207842e-05, "loss": 0.1292, "step": 16025 }, { "epoch": 4.786758127050403, "grad_norm": 18.990375518798828, "learning_rate": 4.492524058216909e-05, "loss": 0.0927, "step": 16050 }, { "epoch": 4.7942141365940945, "grad_norm": 3.3394312858581543, "learning_rate": 4.490867156225975e-05, "loss": 0.1558, "step": 16075 }, { "epoch": 4.801670146137787, "grad_norm": 1.1377098560333252, "learning_rate": 4.4892102542350414e-05, "loss": 0.1145, "step": 16100 }, { "epoch": 4.809126155681479, "grad_norm": 35.98493957519531, "learning_rate": 4.4875533522441083e-05, "loss": 0.1942, "step": 16125 }, { "epoch": 4.816582165225172, "grad_norm": 12.814781188964844, "learning_rate": 4.4858964502531746e-05, "loss": 0.2208, "step": 16150 }, { "epoch": 4.824038174768864, "grad_norm": 20.98240852355957, "learning_rate": 4.4842395482622415e-05, "loss": 0.0748, "step": 16175 }, { "epoch": 4.831494184312556, "grad_norm": 23.819093704223633, "learning_rate": 4.482582646271308e-05, "loss": 0.1131, "step": 16200 }, { "epoch": 4.838950193856248, "grad_norm": 3.9502758979797363, "learning_rate": 4.4809257442803746e-05, "loss": 0.0948, "step": 16225 }, { "epoch": 4.84640620339994, "grad_norm": 56.03023147583008, "learning_rate": 4.4792688422894415e-05, "loss": 0.1912, "step": 16250 }, { "epoch": 4.853862212943633, "grad_norm": 16.82088851928711, 
"learning_rate": 4.477611940298508e-05, "loss": 0.084, "step": 16275 }, { "epoch": 4.861318222487325, "grad_norm": 4.70719575881958, "learning_rate": 4.475955038307574e-05, "loss": 0.1498, "step": 16300 }, { "epoch": 4.868774232031017, "grad_norm": 0.018220912665128708, "learning_rate": 4.474298136316641e-05, "loss": 0.166, "step": 16325 }, { "epoch": 4.876230241574709, "grad_norm": 2.55483078956604, "learning_rate": 4.472641234325707e-05, "loss": 0.0708, "step": 16350 }, { "epoch": 4.883686251118402, "grad_norm": 6.328634738922119, "learning_rate": 4.470984332334774e-05, "loss": 0.0798, "step": 16375 }, { "epoch": 4.891142260662094, "grad_norm": 4.580233573913574, "learning_rate": 4.46932743034384e-05, "loss": 0.0574, "step": 16400 }, { "epoch": 4.8985982702057855, "grad_norm": 0.2286739945411682, "learning_rate": 4.467670528352907e-05, "loss": 0.1437, "step": 16425 }, { "epoch": 4.906054279749478, "grad_norm": 0.08337617665529251, "learning_rate": 4.4660136263619735e-05, "loss": 0.109, "step": 16450 }, { "epoch": 4.91351028929317, "grad_norm": 4.643275260925293, "learning_rate": 4.4643567243710404e-05, "loss": 0.0927, "step": 16475 }, { "epoch": 4.920966298836863, "grad_norm": 50.29481506347656, "learning_rate": 4.462699822380107e-05, "loss": 0.1549, "step": 16500 }, { "epoch": 4.928422308380554, "grad_norm": 9.458456993103027, "learning_rate": 4.4610429203891735e-05, "loss": 0.0536, "step": 16525 }, { "epoch": 4.935878317924247, "grad_norm": 0.6447744369506836, "learning_rate": 4.45938601839824e-05, "loss": 0.0894, "step": 16550 }, { "epoch": 4.943334327467939, "grad_norm": 3.911870241165161, "learning_rate": 4.4577291164073067e-05, "loss": 0.1486, "step": 16575 }, { "epoch": 4.9507903370116315, "grad_norm": 0.10835100710391998, "learning_rate": 4.456072214416373e-05, "loss": 0.0847, "step": 16600 }, { "epoch": 4.958246346555324, "grad_norm": 9.483917236328125, "learning_rate": 4.45441531242544e-05, "loss": 0.1105, "step": 16625 }, { "epoch": 4.9657023560990154, 
"grad_norm": 10.36311149597168, "learning_rate": 4.452758410434506e-05, "loss": 0.0985, "step": 16650 }, { "epoch": 4.973158365642708, "grad_norm": 0.43916112184524536, "learning_rate": 4.451101508443572e-05, "loss": 0.0887, "step": 16675 }, { "epoch": 4.9806143751864, "grad_norm": 0.1739131361246109, "learning_rate": 4.449444606452639e-05, "loss": 0.0717, "step": 16700 }, { "epoch": 4.988070384730093, "grad_norm": 30.31475257873535, "learning_rate": 4.447787704461706e-05, "loss": 0.0884, "step": 16725 }, { "epoch": 4.995526394273785, "grad_norm": 14.318930625915527, "learning_rate": 4.4461970785504095e-05, "loss": 0.1252, "step": 16750 }, { "epoch": 5.0, "eval_gen_len": 8.7686, "eval_loss": 0.15973380208015442, "eval_rouge1": 92.6945, "eval_rouge2": 78.5817, "eval_rougeL": 92.5764, "eval_rougeLsum": 92.5683, "eval_runtime": 99.8794, "eval_samples_per_second": 16.79, "eval_steps_per_second": 4.205, "step": 16765 }, { "epoch": 5.002982403817477, "grad_norm": 0.05757031589746475, "learning_rate": 4.4445401765594764e-05, "loss": 0.1751, "step": 16775 }, { "epoch": 5.010438413361169, "grad_norm": 0.6733642220497131, "learning_rate": 4.4428832745685426e-05, "loss": 0.1266, "step": 16800 }, { "epoch": 5.017894422904861, "grad_norm": 0.2373235821723938, "learning_rate": 4.4412263725776095e-05, "loss": 0.0473, "step": 16825 }, { "epoch": 5.025350432448554, "grad_norm": 0.2211294174194336, "learning_rate": 4.439569470586676e-05, "loss": 0.1415, "step": 16850 }, { "epoch": 5.032806441992245, "grad_norm": 1.2345913648605347, "learning_rate": 4.437912568595743e-05, "loss": 0.0372, "step": 16875 }, { "epoch": 5.040262451535938, "grad_norm": 0.27145493030548096, "learning_rate": 4.4362556666048096e-05, "loss": 0.0315, "step": 16900 }, { "epoch": 5.04771846107963, "grad_norm": 7.357295513153076, "learning_rate": 4.434598764613876e-05, "loss": 0.0723, "step": 16925 }, { "epoch": 5.055174470623323, "grad_norm": 5.996601581573486, "learning_rate": 4.432941862622943e-05, "loss": 
0.0668, "step": 16950 }, { "epoch": 5.062630480167015, "grad_norm": 0.10849784314632416, "learning_rate": 4.431284960632009e-05, "loss": 0.0573, "step": 16975 }, { "epoch": 5.0700864897107065, "grad_norm": 1.5720083713531494, "learning_rate": 4.429628058641075e-05, "loss": 0.0592, "step": 17000 }, { "epoch": 5.077542499254399, "grad_norm": 0.05373723804950714, "learning_rate": 4.427971156650142e-05, "loss": 0.1364, "step": 17025 }, { "epoch": 5.084998508798091, "grad_norm": 0.47185277938842773, "learning_rate": 4.4263142546592083e-05, "loss": 0.118, "step": 17050 }, { "epoch": 5.092454518341784, "grad_norm": 23.991382598876953, "learning_rate": 4.424657352668275e-05, "loss": 0.0927, "step": 17075 }, { "epoch": 5.099910527885475, "grad_norm": 0.018344825133681297, "learning_rate": 4.4230004506773415e-05, "loss": 0.0498, "step": 17100 }, { "epoch": 5.107366537429168, "grad_norm": 30.450063705444336, "learning_rate": 4.4213435486864084e-05, "loss": 0.0683, "step": 17125 }, { "epoch": 5.11482254697286, "grad_norm": 0.7160254120826721, "learning_rate": 4.419686646695475e-05, "loss": 0.1258, "step": 17150 }, { "epoch": 5.1222785565165525, "grad_norm": 5.697810649871826, "learning_rate": 4.4180297447045415e-05, "loss": 0.0782, "step": 17175 }, { "epoch": 5.129734566060245, "grad_norm": 24.201627731323242, "learning_rate": 4.416372842713608e-05, "loss": 0.0378, "step": 17200 }, { "epoch": 5.137190575603936, "grad_norm": 16.44838523864746, "learning_rate": 4.414715940722675e-05, "loss": 0.1738, "step": 17225 }, { "epoch": 5.144646585147629, "grad_norm": 2.063767910003662, "learning_rate": 4.413059038731741e-05, "loss": 0.0564, "step": 17250 }, { "epoch": 5.152102594691321, "grad_norm": 1.0960617065429688, "learning_rate": 4.411402136740808e-05, "loss": 0.106, "step": 17275 }, { "epoch": 5.159558604235014, "grad_norm": 0.9119987487792969, "learning_rate": 4.409745234749874e-05, "loss": 0.1341, "step": 17300 }, { "epoch": 5.167014613778706, "grad_norm": 25.07423210144043, 
"learning_rate": 4.40808833275894e-05, "loss": 0.0704, "step": 17325 }, { "epoch": 5.174470623322398, "grad_norm": 3.1737473011016846, "learning_rate": 4.406431430768008e-05, "loss": 0.1202, "step": 17350 }, { "epoch": 5.18192663286609, "grad_norm": 13.149360656738281, "learning_rate": 4.404774528777074e-05, "loss": 0.0952, "step": 17375 }, { "epoch": 5.189382642409782, "grad_norm": 0.8912478089332581, "learning_rate": 4.403117626786141e-05, "loss": 0.0528, "step": 17400 }, { "epoch": 5.196838651953475, "grad_norm": 0.5555346012115479, "learning_rate": 4.401460724795207e-05, "loss": 0.0441, "step": 17425 }, { "epoch": 5.204294661497166, "grad_norm": 0.02102746069431305, "learning_rate": 4.3998038228042735e-05, "loss": 0.0984, "step": 17450 }, { "epoch": 5.211750671040859, "grad_norm": 0.06021396070718765, "learning_rate": 4.3981469208133404e-05, "loss": 0.0662, "step": 17475 }, { "epoch": 5.219206680584551, "grad_norm": 0.17239326238632202, "learning_rate": 4.3964900188224067e-05, "loss": 0.0938, "step": 17500 }, { "epoch": 5.226662690128244, "grad_norm": 8.74494457244873, "learning_rate": 4.3948331168314736e-05, "loss": 0.1012, "step": 17525 }, { "epoch": 5.234118699671936, "grad_norm": 0.18227869272232056, "learning_rate": 4.39317621484054e-05, "loss": 0.0969, "step": 17550 }, { "epoch": 5.2415747092156275, "grad_norm": 0.35041165351867676, "learning_rate": 4.391519312849606e-05, "loss": 0.1368, "step": 17575 }, { "epoch": 5.24903071875932, "grad_norm": 36.02559280395508, "learning_rate": 4.3898624108586736e-05, "loss": 0.1031, "step": 17600 }, { "epoch": 5.256486728303012, "grad_norm": 28.382909774780273, "learning_rate": 4.38820550886774e-05, "loss": 0.168, "step": 17625 }, { "epoch": 5.263942737846705, "grad_norm": 0.79286128282547, "learning_rate": 4.386548606876806e-05, "loss": 0.0353, "step": 17650 }, { "epoch": 5.271398747390396, "grad_norm": 0.4146468937397003, "learning_rate": 4.384891704885873e-05, "loss": 0.0466, "step": 17675 }, { "epoch": 
5.278854756934089, "grad_norm": 9.44299030303955, "learning_rate": 4.383234802894939e-05, "loss": 0.1078, "step": 17700 }, { "epoch": 5.286310766477781, "grad_norm": 1.2989798784255981, "learning_rate": 4.381577900904006e-05, "loss": 0.0732, "step": 17725 }, { "epoch": 5.2937667760214735, "grad_norm": 19.005168914794922, "learning_rate": 4.3799209989130724e-05, "loss": 0.0778, "step": 17750 }, { "epoch": 5.301222785565166, "grad_norm": 15.506356239318848, "learning_rate": 4.3782640969221386e-05, "loss": 0.0421, "step": 17775 }, { "epoch": 5.308678795108857, "grad_norm": 6.192285537719727, "learning_rate": 4.3766071949312055e-05, "loss": 0.0594, "step": 17800 }, { "epoch": 5.31613480465255, "grad_norm": 0.025958608835935593, "learning_rate": 4.3749502929402724e-05, "loss": 0.1048, "step": 17825 }, { "epoch": 5.323590814196242, "grad_norm": 0.09452486038208008, "learning_rate": 4.373293390949339e-05, "loss": 0.1083, "step": 17850 }, { "epoch": 5.331046823739935, "grad_norm": 5.524946212768555, "learning_rate": 4.3716364889584056e-05, "loss": 0.0898, "step": 17875 }, { "epoch": 5.338502833283627, "grad_norm": 14.375529289245605, "learning_rate": 4.369979586967472e-05, "loss": 0.0287, "step": 17900 }, { "epoch": 5.345958842827319, "grad_norm": 13.109046936035156, "learning_rate": 4.368322684976539e-05, "loss": 0.047, "step": 17925 }, { "epoch": 5.353414852371011, "grad_norm": 0.10136231780052185, "learning_rate": 4.366665782985605e-05, "loss": 0.1218, "step": 17950 }, { "epoch": 5.360870861914703, "grad_norm": 0.01339312270283699, "learning_rate": 4.365008880994671e-05, "loss": 0.1445, "step": 17975 }, { "epoch": 5.368326871458396, "grad_norm": 0.42180967330932617, "learning_rate": 4.363351979003738e-05, "loss": 0.0626, "step": 18000 }, { "epoch": 5.375782881002087, "grad_norm": 0.004577248357236385, "learning_rate": 4.3616950770128044e-05, "loss": 0.0161, "step": 18025 }, { "epoch": 5.38323889054578, "grad_norm": 0.25045108795166016, "learning_rate": 
4.360038175021871e-05, "loss": 0.0347, "step": 18050 }, { "epoch": 5.390694900089472, "grad_norm": 0.6512510180473328, "learning_rate": 4.358381273030938e-05, "loss": 0.0538, "step": 18075 }, { "epoch": 5.398150909633165, "grad_norm": 0.0632275938987732, "learning_rate": 4.3567243710400044e-05, "loss": 0.0475, "step": 18100 }, { "epoch": 5.405606919176856, "grad_norm": 3.314922332763672, "learning_rate": 4.355067469049071e-05, "loss": 0.0581, "step": 18125 }, { "epoch": 5.4130629287205485, "grad_norm": 3.7075135707855225, "learning_rate": 4.3534105670581376e-05, "loss": 0.1561, "step": 18150 }, { "epoch": 5.420518938264241, "grad_norm": 1.4350308179855347, "learning_rate": 4.3517536650672045e-05, "loss": 0.0651, "step": 18175 }, { "epoch": 5.427974947807933, "grad_norm": 15.598840713500977, "learning_rate": 4.350096763076271e-05, "loss": 0.0744, "step": 18200 }, { "epoch": 5.435430957351626, "grad_norm": 19.05609130859375, "learning_rate": 4.348439861085337e-05, "loss": 0.051, "step": 18225 }, { "epoch": 5.442886966895317, "grad_norm": 13.1383695602417, "learning_rate": 4.346782959094404e-05, "loss": 0.2101, "step": 18250 }, { "epoch": 5.45034297643901, "grad_norm": 2.7254810333251953, "learning_rate": 4.34512605710347e-05, "loss": 0.1575, "step": 18275 }, { "epoch": 5.457798985982702, "grad_norm": 0.05944235250353813, "learning_rate": 4.343469155112537e-05, "loss": 0.1236, "step": 18300 }, { "epoch": 5.4652549955263945, "grad_norm": 0.77425616979599, "learning_rate": 4.341812253121604e-05, "loss": 0.0598, "step": 18325 }, { "epoch": 5.472711005070087, "grad_norm": 13.926398277282715, "learning_rate": 4.34015535113067e-05, "loss": 0.0586, "step": 18350 }, { "epoch": 5.480167014613778, "grad_norm": 0.29317107796669006, "learning_rate": 4.338498449139737e-05, "loss": 0.1206, "step": 18375 }, { "epoch": 5.487623024157471, "grad_norm": 9.615321159362793, "learning_rate": 4.336841547148803e-05, "loss": 0.112, "step": 18400 }, { "epoch": 5.495079033701163, "grad_norm": 
3.3480887413024902, "learning_rate": 4.3351846451578695e-05, "loss": 0.092, "step": 18425 }, { "epoch": 5.502535043244856, "grad_norm": 5.509705543518066, "learning_rate": 4.3335277431669364e-05, "loss": 0.0675, "step": 18450 }, { "epoch": 5.509991052788548, "grad_norm": 14.98397159576416, "learning_rate": 4.331870841176003e-05, "loss": 0.1012, "step": 18475 }, { "epoch": 5.51744706233224, "grad_norm": 14.701458930969238, "learning_rate": 4.3302139391850696e-05, "loss": 0.0987, "step": 18500 }, { "epoch": 5.524903071875932, "grad_norm": 0.031948402523994446, "learning_rate": 4.328557037194136e-05, "loss": 0.1196, "step": 18525 }, { "epoch": 5.532359081419624, "grad_norm": 0.2000201940536499, "learning_rate": 4.326900135203203e-05, "loss": 0.1658, "step": 18550 }, { "epoch": 5.539815090963317, "grad_norm": 1.2646598815917969, "learning_rate": 4.3252432332122696e-05, "loss": 0.0475, "step": 18575 }, { "epoch": 5.547271100507008, "grad_norm": 0.10240360349416733, "learning_rate": 4.323586331221336e-05, "loss": 0.0751, "step": 18600 }, { "epoch": 5.554727110050701, "grad_norm": 1.4732797145843506, "learning_rate": 4.321929429230403e-05, "loss": 0.1033, "step": 18625 }, { "epoch": 5.562183119594393, "grad_norm": 0.16163279116153717, "learning_rate": 4.320272527239469e-05, "loss": 0.0994, "step": 18650 }, { "epoch": 5.569639129138086, "grad_norm": 36.78316116333008, "learning_rate": 4.318615625248535e-05, "loss": 0.133, "step": 18675 }, { "epoch": 5.577095138681777, "grad_norm": 8.2725830078125, "learning_rate": 4.316958723257602e-05, "loss": 0.0815, "step": 18700 }, { "epoch": 5.5845511482254695, "grad_norm": 0.0331939198076725, "learning_rate": 4.3153018212666684e-05, "loss": 0.0573, "step": 18725 }, { "epoch": 5.592007157769162, "grad_norm": 0.17003242671489716, "learning_rate": 4.313644919275735e-05, "loss": 0.095, "step": 18750 }, { "epoch": 5.599463167312854, "grad_norm": 0.3271353542804718, "learning_rate": 4.3119880172848015e-05, "loss": 0.1235, "step": 18775 }, 
{ "epoch": 5.606919176856547, "grad_norm": 1.6304597854614258, "learning_rate": 4.3103311152938685e-05, "loss": 0.1232, "step": 18800 }, { "epoch": 5.614375186400238, "grad_norm": 2.366298198699951, "learning_rate": 4.3086742133029354e-05, "loss": 0.0471, "step": 18825 }, { "epoch": 5.621831195943931, "grad_norm": 0.019990181550383568, "learning_rate": 4.3070173113120016e-05, "loss": 0.1426, "step": 18850 }, { "epoch": 5.629287205487623, "grad_norm": 0.05211897939443588, "learning_rate": 4.305360409321068e-05, "loss": 0.0902, "step": 18875 }, { "epoch": 5.6367432150313155, "grad_norm": 71.19368743896484, "learning_rate": 4.303703507330135e-05, "loss": 0.0776, "step": 18900 }, { "epoch": 5.644199224575008, "grad_norm": 2.616161823272705, "learning_rate": 4.302046605339201e-05, "loss": 0.1408, "step": 18925 }, { "epoch": 5.651655234118699, "grad_norm": 24.18864631652832, "learning_rate": 4.300389703348268e-05, "loss": 0.0636, "step": 18950 }, { "epoch": 5.659111243662392, "grad_norm": 0.19227628409862518, "learning_rate": 4.298732801357334e-05, "loss": 0.049, "step": 18975 }, { "epoch": 5.666567253206084, "grad_norm": 32.69465255737305, "learning_rate": 4.2970758993664004e-05, "loss": 0.0648, "step": 19000 }, { "epoch": 5.674023262749777, "grad_norm": 0.5582588315010071, "learning_rate": 4.295418997375468e-05, "loss": 0.0635, "step": 19025 }, { "epoch": 5.681479272293468, "grad_norm": 1.391935110092163, "learning_rate": 4.293762095384534e-05, "loss": 0.0883, "step": 19050 }, { "epoch": 5.688935281837161, "grad_norm": 10.445085525512695, "learning_rate": 4.2921051933936004e-05, "loss": 0.05, "step": 19075 }, { "epoch": 5.696391291380853, "grad_norm": 0.07640068978071213, "learning_rate": 4.290448291402667e-05, "loss": 0.0986, "step": 19100 }, { "epoch": 5.703847300924545, "grad_norm": 0.01860329695045948, "learning_rate": 4.2887913894117336e-05, "loss": 0.106, "step": 19125 }, { "epoch": 5.711303310468237, "grad_norm": 0.2838321626186371, "learning_rate": 
4.2871344874208005e-05, "loss": 0.0648, "step": 19150 }, { "epoch": 5.718759320011929, "grad_norm": 1.3784078359603882, "learning_rate": 4.285477585429867e-05, "loss": 0.0581, "step": 19175 }, { "epoch": 5.726215329555622, "grad_norm": 0.13906244933605194, "learning_rate": 4.283820683438933e-05, "loss": 0.0994, "step": 19200 }, { "epoch": 5.733671339099314, "grad_norm": 0.06269329786300659, "learning_rate": 4.282163781448e-05, "loss": 0.0369, "step": 19225 }, { "epoch": 5.741127348643007, "grad_norm": 23.20893669128418, "learning_rate": 4.280506879457066e-05, "loss": 0.0794, "step": 19250 }, { "epoch": 5.748583358186698, "grad_norm": 0.008146079257130623, "learning_rate": 4.278849977466134e-05, "loss": 0.122, "step": 19275 }, { "epoch": 5.7560393677303905, "grad_norm": 3.9998602867126465, "learning_rate": 4.2771930754752e-05, "loss": 0.0627, "step": 19300 }, { "epoch": 5.763495377274083, "grad_norm": 9.522217750549316, "learning_rate": 4.275536173484266e-05, "loss": 0.1013, "step": 19325 }, { "epoch": 5.770951386817775, "grad_norm": 17.462121963500977, "learning_rate": 4.273879271493333e-05, "loss": 0.0862, "step": 19350 }, { "epoch": 5.778407396361468, "grad_norm": 0.30210548639297485, "learning_rate": 4.272222369502399e-05, "loss": 0.049, "step": 19375 }, { "epoch": 5.785863405905159, "grad_norm": 15.636837005615234, "learning_rate": 4.270565467511466e-05, "loss": 0.0787, "step": 19400 }, { "epoch": 5.793319415448852, "grad_norm": 0.0845949798822403, "learning_rate": 4.2689085655205324e-05, "loss": 0.0861, "step": 19425 }, { "epoch": 5.800775424992544, "grad_norm": 0.0145711749792099, "learning_rate": 4.267251663529599e-05, "loss": 0.1099, "step": 19450 }, { "epoch": 5.8082314345362365, "grad_norm": 0.0563751682639122, "learning_rate": 4.2655947615386656e-05, "loss": 0.0463, "step": 19475 }, { "epoch": 5.815687444079929, "grad_norm": 0.12543386220932007, "learning_rate": 4.2639378595477325e-05, "loss": 0.0881, "step": 19500 }, { "epoch": 5.82314345362362, 
"grad_norm": 0.13315843045711517, "learning_rate": 4.262280957556799e-05, "loss": 0.0471, "step": 19525 }, { "epoch": 5.830599463167313, "grad_norm": 0.006420004181563854, "learning_rate": 4.2606240555658656e-05, "loss": 0.0775, "step": 19550 }, { "epoch": 5.838055472711005, "grad_norm": 10.033127784729004, "learning_rate": 4.258967153574932e-05, "loss": 0.0631, "step": 19575 }, { "epoch": 5.845511482254698, "grad_norm": 10.855030059814453, "learning_rate": 4.257310251583999e-05, "loss": 0.074, "step": 19600 }, { "epoch": 5.852967491798389, "grad_norm": 0.44868308305740356, "learning_rate": 4.255653349593065e-05, "loss": 0.123, "step": 19625 }, { "epoch": 5.860423501342082, "grad_norm": 0.07777903974056244, "learning_rate": 4.253996447602131e-05, "loss": 0.0452, "step": 19650 }, { "epoch": 5.867879510885774, "grad_norm": 0.0545571930706501, "learning_rate": 4.252339545611198e-05, "loss": 0.0985, "step": 19675 }, { "epoch": 5.875335520429466, "grad_norm": 12.324911117553711, "learning_rate": 4.2506826436202644e-05, "loss": 0.1197, "step": 19700 }, { "epoch": 5.882791529973158, "grad_norm": 0.026907717809081078, "learning_rate": 4.249025741629331e-05, "loss": 0.1382, "step": 19725 }, { "epoch": 5.89024753951685, "grad_norm": 0.031041543930768967, "learning_rate": 4.247368839638398e-05, "loss": 0.0367, "step": 19750 }, { "epoch": 5.897703549060543, "grad_norm": 0.1477086991071701, "learning_rate": 4.2457119376474645e-05, "loss": 0.0242, "step": 19775 }, { "epoch": 5.905159558604235, "grad_norm": 0.009802543558180332, "learning_rate": 4.2440550356565314e-05, "loss": 0.1238, "step": 19800 }, { "epoch": 5.9126155681479275, "grad_norm": 2.768169403076172, "learning_rate": 4.2423981336655976e-05, "loss": 0.142, "step": 19825 }, { "epoch": 5.920071577691619, "grad_norm": 0.06762892007827759, "learning_rate": 4.2407412316746645e-05, "loss": 0.0501, "step": 19850 }, { "epoch": 5.9275275872353115, "grad_norm": 0.09432929754257202, "learning_rate": 4.239084329683731e-05, 
"loss": 0.0721, "step": 19875 }, { "epoch": 5.934983596779004, "grad_norm": 0.21210744976997375, "learning_rate": 4.237427427692797e-05, "loss": 0.0142, "step": 19900 }, { "epoch": 5.942439606322696, "grad_norm": 0.15370802581310272, "learning_rate": 4.235770525701864e-05, "loss": 0.0276, "step": 19925 }, { "epoch": 5.949895615866389, "grad_norm": 0.012401612475514412, "learning_rate": 4.23411362371093e-05, "loss": 0.0661, "step": 19950 }, { "epoch": 5.95735162541008, "grad_norm": 0.019147785380482674, "learning_rate": 4.232456721719997e-05, "loss": 0.0537, "step": 19975 }, { "epoch": 5.964807634953773, "grad_norm": 1.0347957611083984, "learning_rate": 4.230799819729064e-05, "loss": 0.0106, "step": 20000 }, { "epoch": 5.972263644497465, "grad_norm": 3.7031664848327637, "learning_rate": 4.22914291773813e-05, "loss": 0.0296, "step": 20025 }, { "epoch": 5.9797196540411575, "grad_norm": 0.052466195076704025, "learning_rate": 4.227486015747197e-05, "loss": 0.1013, "step": 20050 }, { "epoch": 5.98717566358485, "grad_norm": 0.08741900324821472, "learning_rate": 4.2258291137562633e-05, "loss": 0.1086, "step": 20075 }, { "epoch": 5.994631673128541, "grad_norm": 0.196999192237854, "learning_rate": 4.2241722117653296e-05, "loss": 0.03, "step": 20100 }, { "epoch": 6.0, "eval_gen_len": 9.1741, "eval_loss": 0.11874907463788986, "eval_rouge1": 94.9431, "eval_rouge2": 81.0015, "eval_rougeL": 94.8452, "eval_rougeLsum": 94.8177, "eval_runtime": 105.0274, "eval_samples_per_second": 15.967, "eval_steps_per_second": 3.999, "step": 20118 }, { "epoch": 6.002087682672234, "grad_norm": 0.01878800056874752, "learning_rate": 4.2225153097743965e-05, "loss": 0.0218, "step": 20125 }, { "epoch": 6.009543692215926, "grad_norm": 0.08164286613464355, "learning_rate": 4.220858407783463e-05, "loss": 0.0477, "step": 20150 }, { "epoch": 6.016999701759619, "grad_norm": 1.362046241760254, "learning_rate": 4.2192015057925296e-05, "loss": 0.1691, "step": 20175 }, { "epoch": 6.02445571130331, "grad_norm": 
0.0867367535829544, "learning_rate": 4.217544603801596e-05, "loss": 0.0223, "step": 20200 }, { "epoch": 6.031911720847003, "grad_norm": 0.010886842384934425, "learning_rate": 4.215887701810663e-05, "loss": 0.0478, "step": 20225 }, { "epoch": 6.039367730390695, "grad_norm": 0.7621909379959106, "learning_rate": 4.21423079981973e-05, "loss": 0.0103, "step": 20250 }, { "epoch": 6.046823739934387, "grad_norm": 1.5744153261184692, "learning_rate": 4.212573897828796e-05, "loss": 0.0457, "step": 20275 }, { "epoch": 6.054279749478079, "grad_norm": 0.11433689296245575, "learning_rate": 4.210916995837862e-05, "loss": 0.0506, "step": 20300 }, { "epoch": 6.061735759021771, "grad_norm": 26.32907485961914, "learning_rate": 4.209260093846929e-05, "loss": 0.0815, "step": 20325 }, { "epoch": 6.069191768565464, "grad_norm": 1.6335394382476807, "learning_rate": 4.207603191855995e-05, "loss": 0.0328, "step": 20350 }, { "epoch": 6.076647778109156, "grad_norm": 0.54072505235672, "learning_rate": 4.205946289865062e-05, "loss": 0.109, "step": 20375 }, { "epoch": 6.0841037876528485, "grad_norm": 23.399791717529297, "learning_rate": 4.2042893878741285e-05, "loss": 0.0436, "step": 20400 }, { "epoch": 6.09155979719654, "grad_norm": 0.04539789631962776, "learning_rate": 4.202632485883195e-05, "loss": 0.0483, "step": 20425 }, { "epoch": 6.0990158067402325, "grad_norm": 0.020925790071487427, "learning_rate": 4.2009755838922616e-05, "loss": 0.0376, "step": 20450 }, { "epoch": 6.106471816283925, "grad_norm": 0.5620167851448059, "learning_rate": 4.1993186819013285e-05, "loss": 0.0955, "step": 20475 }, { "epoch": 6.113927825827617, "grad_norm": 1.8948103189468384, "learning_rate": 4.1976617799103954e-05, "loss": 0.1628, "step": 20500 }, { "epoch": 6.12138383537131, "grad_norm": 5.853365421295166, "learning_rate": 4.1960048779194617e-05, "loss": 0.0407, "step": 20525 }, { "epoch": 6.128839844915001, "grad_norm": 0.35002636909484863, "learning_rate": 4.194347975928528e-05, "loss": 0.0887, "step": 20550 
}, { "epoch": 6.136295854458694, "grad_norm": 0.3504277169704437, "learning_rate": 4.192691073937595e-05, "loss": 0.0452, "step": 20575 }, { "epoch": 6.143751864002386, "grad_norm": 2.05971097946167, "learning_rate": 4.191034171946661e-05, "loss": 0.0589, "step": 20600 }, { "epoch": 6.1512078735460785, "grad_norm": 0.01998194307088852, "learning_rate": 4.189377269955728e-05, "loss": 0.0412, "step": 20625 }, { "epoch": 6.15866388308977, "grad_norm": 0.07505607604980469, "learning_rate": 4.187720367964794e-05, "loss": 0.0812, "step": 20650 }, { "epoch": 6.166119892633462, "grad_norm": 0.5407578349113464, "learning_rate": 4.1860634659738604e-05, "loss": 0.044, "step": 20675 }, { "epoch": 6.173575902177155, "grad_norm": 6.510289669036865, "learning_rate": 4.184406563982928e-05, "loss": 0.0505, "step": 20700 }, { "epoch": 6.181031911720847, "grad_norm": 0.7390807867050171, "learning_rate": 4.182749661991994e-05, "loss": 0.0763, "step": 20725 }, { "epoch": 6.18848792126454, "grad_norm": 0.037136584520339966, "learning_rate": 4.1810927600010605e-05, "loss": 0.0052, "step": 20750 }, { "epoch": 6.195943930808231, "grad_norm": 0.13674665987491608, "learning_rate": 4.1794358580101274e-05, "loss": 0.0511, "step": 20775 }, { "epoch": 6.2033999403519235, "grad_norm": 0.5490770936012268, "learning_rate": 4.1777789560191936e-05, "loss": 0.0973, "step": 20800 }, { "epoch": 6.210855949895616, "grad_norm": 0.005674040876328945, "learning_rate": 4.1761220540282605e-05, "loss": 0.0257, "step": 20825 }, { "epoch": 6.218311959439308, "grad_norm": 8.376826286315918, "learning_rate": 4.174465152037327e-05, "loss": 0.068, "step": 20850 }, { "epoch": 6.225767968983, "grad_norm": 0.03015846200287342, "learning_rate": 4.172808250046393e-05, "loss": 0.0285, "step": 20875 }, { "epoch": 6.233223978526692, "grad_norm": 0.01474801730364561, "learning_rate": 4.17115134805546e-05, "loss": 0.0302, "step": 20900 }, { "epoch": 6.240679988070385, "grad_norm": 16.602705001831055, "learning_rate": 
4.169494446064526e-05, "loss": 0.0815, "step": 20925 }, { "epoch": 6.248135997614077, "grad_norm": 2.8422799110412598, "learning_rate": 4.167837544073593e-05, "loss": 0.0528, "step": 20950 }, { "epoch": 6.2555920071577695, "grad_norm": 17.248394012451172, "learning_rate": 4.16618064208266e-05, "loss": 0.0298, "step": 20975 }, { "epoch": 6.263048016701461, "grad_norm": 0.010792219080030918, "learning_rate": 4.164523740091726e-05, "loss": 0.0098, "step": 21000 }, { "epoch": 6.2705040262451535, "grad_norm": 0.020565340295433998, "learning_rate": 4.162866838100793e-05, "loss": 0.019, "step": 21025 }, { "epoch": 6.277960035788846, "grad_norm": 0.03458723425865173, "learning_rate": 4.1612099361098594e-05, "loss": 0.0572, "step": 21050 }, { "epoch": 6.285416045332538, "grad_norm": 0.5358602404594421, "learning_rate": 4.159553034118926e-05, "loss": 0.0625, "step": 21075 }, { "epoch": 6.292872054876231, "grad_norm": 0.7117275595664978, "learning_rate": 4.1578961321279925e-05, "loss": 0.0479, "step": 21100 }, { "epoch": 6.300328064419922, "grad_norm": 0.46912163496017456, "learning_rate": 4.156239230137059e-05, "loss": 0.1382, "step": 21125 }, { "epoch": 6.307784073963615, "grad_norm": 17.23985481262207, "learning_rate": 4.1546486042257635e-05, "loss": 0.0598, "step": 21150 }, { "epoch": 6.315240083507307, "grad_norm": 1.2074980735778809, "learning_rate": 4.15299170223483e-05, "loss": 0.0741, "step": 21175 }, { "epoch": 6.322696093050999, "grad_norm": 2.5435595512390137, "learning_rate": 4.151334800243896e-05, "loss": 0.1226, "step": 21200 }, { "epoch": 6.330152102594691, "grad_norm": 0.0682421550154686, "learning_rate": 4.149677898252963e-05, "loss": 0.0185, "step": 21225 }, { "epoch": 6.337608112138383, "grad_norm": 16.758506774902344, "learning_rate": 4.148020996262029e-05, "loss": 0.0491, "step": 21250 }, { "epoch": 6.345064121682076, "grad_norm": 5.860011577606201, "learning_rate": 4.146364094271096e-05, "loss": 0.0211, "step": 21275 }, { "epoch": 6.352520131225768, 
"grad_norm": 0.9169291853904724, "learning_rate": 4.144707192280162e-05, "loss": 0.0625, "step": 21300 }, { "epoch": 6.35997614076946, "grad_norm": 20.915584564208984, "learning_rate": 4.1430502902892285e-05, "loss": 0.0336, "step": 21325 }, { "epoch": 6.367432150313152, "grad_norm": 0.12377249449491501, "learning_rate": 4.141393388298296e-05, "loss": 0.0972, "step": 21350 }, { "epoch": 6.3748881598568445, "grad_norm": 0.2923804223537445, "learning_rate": 4.139736486307362e-05, "loss": 0.0771, "step": 21375 }, { "epoch": 6.382344169400537, "grad_norm": 0.0026703316252678633, "learning_rate": 4.1380795843164285e-05, "loss": 0.065, "step": 21400 }, { "epoch": 6.389800178944229, "grad_norm": 1.1541820764541626, "learning_rate": 4.1364226823254954e-05, "loss": 0.0171, "step": 21425 }, { "epoch": 6.397256188487921, "grad_norm": 0.010893816128373146, "learning_rate": 4.1347657803345617e-05, "loss": 0.1095, "step": 21450 }, { "epoch": 6.404712198031613, "grad_norm": 0.030142832547426224, "learning_rate": 4.1331088783436286e-05, "loss": 0.0563, "step": 21475 }, { "epoch": 6.412168207575306, "grad_norm": 0.29954442381858826, "learning_rate": 4.131451976352695e-05, "loss": 0.0529, "step": 21500 }, { "epoch": 6.419624217118998, "grad_norm": 10.40064525604248, "learning_rate": 4.129795074361762e-05, "loss": 0.0602, "step": 21525 }, { "epoch": 6.4270802266626905, "grad_norm": 0.021467700600624084, "learning_rate": 4.128138172370828e-05, "loss": 0.0142, "step": 21550 }, { "epoch": 6.434536236206382, "grad_norm": 8.21902847290039, "learning_rate": 4.126481270379895e-05, "loss": 0.1391, "step": 21575 }, { "epoch": 6.4419922457500745, "grad_norm": 0.9277015924453735, "learning_rate": 4.124824368388962e-05, "loss": 0.1091, "step": 21600 }, { "epoch": 6.449448255293767, "grad_norm": 0.10828567296266556, "learning_rate": 4.123167466398028e-05, "loss": 0.02, "step": 21625 }, { "epoch": 6.456904264837459, "grad_norm": 5.262417793273926, "learning_rate": 4.121510564407094e-05, "loss": 
0.1392, "step": 21650 }, { "epoch": 6.464360274381151, "grad_norm": 0.07524223625659943, "learning_rate": 4.119853662416161e-05, "loss": 0.054, "step": 21675 }, { "epoch": 6.471816283924843, "grad_norm": 0.028518134728074074, "learning_rate": 4.1181967604252274e-05, "loss": 0.0425, "step": 21700 }, { "epoch": 6.479272293468536, "grad_norm": 24.74271011352539, "learning_rate": 4.116539858434294e-05, "loss": 0.0752, "step": 21725 }, { "epoch": 6.486728303012228, "grad_norm": 0.003951622173190117, "learning_rate": 4.1148829564433605e-05, "loss": 0.0282, "step": 21750 }, { "epoch": 6.49418431255592, "grad_norm": 14.138594627380371, "learning_rate": 4.113226054452427e-05, "loss": 0.0706, "step": 21775 }, { "epoch": 6.501640322099612, "grad_norm": 0.9741306304931641, "learning_rate": 4.111569152461494e-05, "loss": 0.1514, "step": 21800 }, { "epoch": 6.509096331643304, "grad_norm": 1.9363592863082886, "learning_rate": 4.1099122504705606e-05, "loss": 0.0842, "step": 21825 }, { "epoch": 6.516552341186997, "grad_norm": 20.144515991210938, "learning_rate": 4.108255348479627e-05, "loss": 0.1134, "step": 21850 }, { "epoch": 6.524008350730689, "grad_norm": 3.8856167793273926, "learning_rate": 4.106598446488694e-05, "loss": 0.0526, "step": 21875 }, { "epoch": 6.531464360274381, "grad_norm": 0.09439099580049515, "learning_rate": 4.10494154449776e-05, "loss": 0.0112, "step": 21900 }, { "epoch": 6.538920369818073, "grad_norm": 0.07617989182472229, "learning_rate": 4.103284642506827e-05, "loss": 0.0834, "step": 21925 }, { "epoch": 6.5463763793617655, "grad_norm": 19.814706802368164, "learning_rate": 4.101627740515893e-05, "loss": 0.0665, "step": 21950 }, { "epoch": 6.553832388905458, "grad_norm": 0.009026892483234406, "learning_rate": 4.0999708385249594e-05, "loss": 0.0685, "step": 21975 }, { "epoch": 6.56128839844915, "grad_norm": 1.3233736753463745, "learning_rate": 4.098313936534026e-05, "loss": 0.0855, "step": 22000 }, { "epoch": 6.568744407992842, "grad_norm": 
0.004098168108612299, "learning_rate": 4.0966570345430925e-05, "loss": 0.0975, "step": 22025 }, { "epoch": 6.576200417536534, "grad_norm": 18.999271392822266, "learning_rate": 4.0950001325521594e-05, "loss": 0.0434, "step": 22050 }, { "epoch": 6.583656427080227, "grad_norm": 0.08481686562299728, "learning_rate": 4.093343230561226e-05, "loss": 0.053, "step": 22075 }, { "epoch": 6.591112436623919, "grad_norm": 0.03606973588466644, "learning_rate": 4.0916863285702926e-05, "loss": 0.1021, "step": 22100 }, { "epoch": 6.5985684461676115, "grad_norm": 3.600905656814575, "learning_rate": 4.0900294265793595e-05, "loss": 0.0289, "step": 22125 }, { "epoch": 6.606024455711303, "grad_norm": 0.6694594025611877, "learning_rate": 4.088372524588426e-05, "loss": 0.0297, "step": 22150 }, { "epoch": 6.613480465254995, "grad_norm": 0.31841492652893066, "learning_rate": 4.0867156225974926e-05, "loss": 0.0578, "step": 22175 }, { "epoch": 6.620936474798688, "grad_norm": 12.335127830505371, "learning_rate": 4.085058720606559e-05, "loss": 0.0594, "step": 22200 }, { "epoch": 6.62839248434238, "grad_norm": 14.127710342407227, "learning_rate": 4.083401818615625e-05, "loss": 0.0516, "step": 22225 }, { "epoch": 6.635848493886072, "grad_norm": 0.029252486303448677, "learning_rate": 4.081744916624692e-05, "loss": 0.0599, "step": 22250 }, { "epoch": 6.643304503429764, "grad_norm": 0.5378937721252441, "learning_rate": 4.080088014633758e-05, "loss": 0.1345, "step": 22275 }, { "epoch": 6.650760512973457, "grad_norm": 0.13244609534740448, "learning_rate": 4.078431112642825e-05, "loss": 0.0515, "step": 22300 }, { "epoch": 6.658216522517149, "grad_norm": 0.2065218836069107, "learning_rate": 4.076774210651892e-05, "loss": 0.0365, "step": 22325 }, { "epoch": 6.6656725320608405, "grad_norm": 0.010511617176234722, "learning_rate": 4.075117308660958e-05, "loss": 0.0429, "step": 22350 }, { "epoch": 6.673128541604533, "grad_norm": 0.7169992327690125, "learning_rate": 4.073460406670025e-05, "loss": 0.142, 
"step": 22375 }, { "epoch": 6.680584551148225, "grad_norm": 12.551905632019043, "learning_rate": 4.0718035046790914e-05, "loss": 0.101, "step": 22400 }, { "epoch": 6.688040560691918, "grad_norm": 0.05723453685641289, "learning_rate": 4.070146602688158e-05, "loss": 0.0566, "step": 22425 }, { "epoch": 6.69549657023561, "grad_norm": 0.6166514754295349, "learning_rate": 4.0684897006972246e-05, "loss": 0.0914, "step": 22450 }, { "epoch": 6.702952579779302, "grad_norm": 0.01288004219532013, "learning_rate": 4.066832798706291e-05, "loss": 0.03, "step": 22475 }, { "epoch": 6.710408589322994, "grad_norm": 0.07274222373962402, "learning_rate": 4.065175896715358e-05, "loss": 0.0967, "step": 22500 }, { "epoch": 6.7178645988666865, "grad_norm": 15.432951927185059, "learning_rate": 4.063518994724424e-05, "loss": 0.0478, "step": 22525 }, { "epoch": 6.725320608410379, "grad_norm": 0.8928155303001404, "learning_rate": 4.061862092733491e-05, "loss": 0.0682, "step": 22550 }, { "epoch": 6.732776617954071, "grad_norm": 0.044634025543928146, "learning_rate": 4.060205190742558e-05, "loss": 0.0232, "step": 22575 }, { "epoch": 6.740232627497763, "grad_norm": 6.356382369995117, "learning_rate": 4.058548288751624e-05, "loss": 0.0411, "step": 22600 }, { "epoch": 6.747688637041455, "grad_norm": 0.5113846659660339, "learning_rate": 4.05689138676069e-05, "loss": 0.0493, "step": 22625 }, { "epoch": 6.755144646585148, "grad_norm": 3.461223602294922, "learning_rate": 4.055234484769757e-05, "loss": 0.0776, "step": 22650 }, { "epoch": 6.76260065612884, "grad_norm": 28.651147842407227, "learning_rate": 4.0535775827788234e-05, "loss": 0.0689, "step": 22675 }, { "epoch": 6.7700566656725325, "grad_norm": 5.866575241088867, "learning_rate": 4.05192068078789e-05, "loss": 0.0753, "step": 22700 }, { "epoch": 6.777512675216224, "grad_norm": 0.052426777780056, "learning_rate": 4.0502637787969565e-05, "loss": 0.0496, "step": 22725 }, { "epoch": 6.784968684759916, "grad_norm": 8.888331413269043, "learning_rate": 
4.0486068768060235e-05, "loss": 0.0399, "step": 22750 }, { "epoch": 6.792424694303609, "grad_norm": 11.797131538391113, "learning_rate": 4.0469499748150904e-05, "loss": 0.0764, "step": 22775 }, { "epoch": 6.799880703847301, "grad_norm": 22.82054328918457, "learning_rate": 4.0452930728241566e-05, "loss": 0.0664, "step": 22800 }, { "epoch": 6.807336713390993, "grad_norm": 2.2873129844665527, "learning_rate": 4.0436361708332235e-05, "loss": 0.0954, "step": 22825 }, { "epoch": 6.814792722934685, "grad_norm": 9.829035758972168, "learning_rate": 4.04197926884229e-05, "loss": 0.0213, "step": 22850 }, { "epoch": 6.822248732478378, "grad_norm": 0.41712069511413574, "learning_rate": 4.040322366851356e-05, "loss": 0.0699, "step": 22875 }, { "epoch": 6.82970474202207, "grad_norm": 22.233823776245117, "learning_rate": 4.038665464860423e-05, "loss": 0.0725, "step": 22900 }, { "epoch": 6.8371607515657615, "grad_norm": 0.4478222131729126, "learning_rate": 4.037008562869489e-05, "loss": 0.043, "step": 22925 }, { "epoch": 6.844616761109454, "grad_norm": 10.205151557922363, "learning_rate": 4.035351660878556e-05, "loss": 0.0923, "step": 22950 }, { "epoch": 6.852072770653146, "grad_norm": 14.356264114379883, "learning_rate": 4.033694758887622e-05, "loss": 0.0779, "step": 22975 }, { "epoch": 6.859528780196839, "grad_norm": 5.969383239746094, "learning_rate": 4.0320378568966885e-05, "loss": 0.0468, "step": 23000 }, { "epoch": 6.866984789740531, "grad_norm": 0.151869997382164, "learning_rate": 4.030380954905756e-05, "loss": 0.0227, "step": 23025 }, { "epoch": 6.874440799284223, "grad_norm": 12.691407203674316, "learning_rate": 4.028724052914822e-05, "loss": 0.0199, "step": 23050 }, { "epoch": 6.881896808827915, "grad_norm": 0.6804265975952148, "learning_rate": 4.0270671509238886e-05, "loss": 0.0734, "step": 23075 }, { "epoch": 6.8893528183716075, "grad_norm": 0.10523873567581177, "learning_rate": 4.0254102489329555e-05, "loss": 0.0646, "step": 23100 }, { "epoch": 6.8968088279153, 
"grad_norm": 6.399729251861572, "learning_rate": 4.023753346942022e-05, "loss": 0.0336, "step": 23125 }, { "epoch": 6.904264837458992, "grad_norm": 0.018156565725803375, "learning_rate": 4.0220964449510886e-05, "loss": 0.0434, "step": 23150 }, { "epoch": 6.911720847002684, "grad_norm": 0.04066069424152374, "learning_rate": 4.020439542960155e-05, "loss": 0.0835, "step": 23175 }, { "epoch": 6.919176856546376, "grad_norm": 0.20896196365356445, "learning_rate": 4.018782640969221e-05, "loss": 0.0646, "step": 23200 }, { "epoch": 6.926632866090069, "grad_norm": 0.18396757543087006, "learning_rate": 4.017125738978288e-05, "loss": 0.0936, "step": 23225 }, { "epoch": 6.934088875633761, "grad_norm": 0.08867733180522919, "learning_rate": 4.015468836987355e-05, "loss": 0.1058, "step": 23250 }, { "epoch": 6.9415448851774535, "grad_norm": 0.16155028343200684, "learning_rate": 4.013811934996422e-05, "loss": 0.1159, "step": 23275 }, { "epoch": 6.949000894721145, "grad_norm": 10.4935941696167, "learning_rate": 4.012155033005488e-05, "loss": 0.0857, "step": 23300 }, { "epoch": 6.956456904264837, "grad_norm": 11.887359619140625, "learning_rate": 4.010498131014554e-05, "loss": 0.0939, "step": 23325 }, { "epoch": 6.96391291380853, "grad_norm": 0.025529278442263603, "learning_rate": 4.008841229023621e-05, "loss": 0.1037, "step": 23350 }, { "epoch": 6.971368923352222, "grad_norm": 0.14388221502304077, "learning_rate": 4.0071843270326874e-05, "loss": 0.0505, "step": 23375 }, { "epoch": 6.978824932895914, "grad_norm": 0.007163532543927431, "learning_rate": 4.0055274250417544e-05, "loss": 0.0659, "step": 23400 }, { "epoch": 6.986280942439606, "grad_norm": 0.7755250334739685, "learning_rate": 4.0038705230508206e-05, "loss": 0.0595, "step": 23425 }, { "epoch": 6.993736951983299, "grad_norm": 0.010912524536252022, "learning_rate": 4.002213621059887e-05, "loss": 0.0422, "step": 23450 }, { "epoch": 7.0, "eval_gen_len": 8.7078, "eval_loss": 0.11466038972139359, "eval_rouge1": 95.8676, 
"eval_rouge2": 81.828, "eval_rougeL": 95.8089, "eval_rougeLsum": 95.7724, "eval_runtime": 99.9262, "eval_samples_per_second": 16.782, "eval_steps_per_second": 4.203, "step": 23471 }, { "epoch": 7.001192961526991, "grad_norm": 2.111431837081909, "learning_rate": 4.000556719068954e-05, "loss": 0.0429, "step": 23475 }, { "epoch": 7.008648971070683, "grad_norm": 0.029296431690454483, "learning_rate": 3.9988998170780207e-05, "loss": 0.0895, "step": 23500 }, { "epoch": 7.016104980614375, "grad_norm": 0.07177238911390305, "learning_rate": 3.997242915087087e-05, "loss": 0.0412, "step": 23525 }, { "epoch": 7.023560990158067, "grad_norm": 0.22311842441558838, "learning_rate": 3.995586013096154e-05, "loss": 0.0392, "step": 23550 }, { "epoch": 7.03101699970176, "grad_norm": 15.43985366821289, "learning_rate": 3.99392911110522e-05, "loss": 0.0376, "step": 23575 }, { "epoch": 7.038473009245452, "grad_norm": 0.008796346373856068, "learning_rate": 3.992272209114287e-05, "loss": 0.0655, "step": 23600 }, { "epoch": 7.045929018789144, "grad_norm": 0.4163694977760315, "learning_rate": 3.990615307123353e-05, "loss": 0.0532, "step": 23625 }, { "epoch": 7.053385028332836, "grad_norm": 0.447316437959671, "learning_rate": 3.9889584051324194e-05, "loss": 0.0332, "step": 23650 }, { "epoch": 7.0608410378765285, "grad_norm": 0.3781053423881531, "learning_rate": 3.987301503141486e-05, "loss": 0.0359, "step": 23675 }, { "epoch": 7.068297047420221, "grad_norm": 0.03129143640398979, "learning_rate": 3.9856446011505526e-05, "loss": 0.0626, "step": 23700 }, { "epoch": 7.075753056963913, "grad_norm": 15.764562606811523, "learning_rate": 3.9839876991596195e-05, "loss": 0.0376, "step": 23725 }, { "epoch": 7.083209066507605, "grad_norm": 0.12252432852983475, "learning_rate": 3.9823307971686864e-05, "loss": 0.003, "step": 23750 }, { "epoch": 7.090665076051297, "grad_norm": 0.2217705249786377, "learning_rate": 3.9806738951777526e-05, "loss": 0.0337, "step": 23775 }, { "epoch": 7.09812108559499, 
"grad_norm": 1.2270210981369019, "learning_rate": 3.9790169931868195e-05, "loss": 0.0035, "step": 23800 }, { "epoch": 7.105577095138682, "grad_norm": 0.040991149842739105, "learning_rate": 3.977360091195886e-05, "loss": 0.0075, "step": 23825 }, { "epoch": 7.113033104682374, "grad_norm": 0.00795942172408104, "learning_rate": 3.975703189204952e-05, "loss": 0.0672, "step": 23850 }, { "epoch": 7.120489114226066, "grad_norm": 18.399192810058594, "learning_rate": 3.974046287214019e-05, "loss": 0.039, "step": 23875 }, { "epoch": 7.127945123769758, "grad_norm": 0.009747338481247425, "learning_rate": 3.972389385223085e-05, "loss": 0.0342, "step": 23900 }, { "epoch": 7.135401133313451, "grad_norm": 0.12228219211101532, "learning_rate": 3.970732483232152e-05, "loss": 0.0089, "step": 23925 }, { "epoch": 7.142857142857143, "grad_norm": 0.5154474377632141, "learning_rate": 3.969075581241218e-05, "loss": 0.0216, "step": 23950 }, { "epoch": 7.150313152400835, "grad_norm": 48.93901824951172, "learning_rate": 3.967418679250285e-05, "loss": 0.0758, "step": 23975 }, { "epoch": 7.157769161944527, "grad_norm": 0.3987184464931488, "learning_rate": 3.965761777259352e-05, "loss": 0.032, "step": 24000 }, { "epoch": 7.16522517148822, "grad_norm": 6.687148094177246, "learning_rate": 3.9641048752684183e-05, "loss": 0.0444, "step": 24025 }, { "epoch": 7.172681181031912, "grad_norm": 0.00944861862808466, "learning_rate": 3.962447973277485e-05, "loss": 0.0525, "step": 24050 }, { "epoch": 7.1801371905756035, "grad_norm": 0.3815286457538605, "learning_rate": 3.9607910712865515e-05, "loss": 0.0218, "step": 24075 }, { "epoch": 7.187593200119296, "grad_norm": 0.008050195872783661, "learning_rate": 3.959134169295618e-05, "loss": 0.093, "step": 24100 }, { "epoch": 7.195049209662988, "grad_norm": 13.898192405700684, "learning_rate": 3.9574772673046846e-05, "loss": 0.04, "step": 24125 }, { "epoch": 7.202505219206681, "grad_norm": 0.0883483812212944, "learning_rate": 3.955820365313751e-05, "loss": 0.0139, 
"step": 24150 }, { "epoch": 7.209961228750373, "grad_norm": 2.729593515396118, "learning_rate": 3.954163463322818e-05, "loss": 0.0189, "step": 24175 }, { "epoch": 7.217417238294065, "grad_norm": 24.50420570373535, "learning_rate": 3.952506561331884e-05, "loss": 0.0689, "step": 24200 }, { "epoch": 7.224873247837757, "grad_norm": 0.010202550329267979, "learning_rate": 3.950849659340951e-05, "loss": 0.0345, "step": 24225 }, { "epoch": 7.2323292573814495, "grad_norm": 0.20225048065185547, "learning_rate": 3.949259033429655e-05, "loss": 0.0682, "step": 24250 }, { "epoch": 7.239785266925142, "grad_norm": 18.914243698120117, "learning_rate": 3.947602131438721e-05, "loss": 0.0571, "step": 24275 }, { "epoch": 7.247241276468834, "grad_norm": 0.03825189918279648, "learning_rate": 3.9459452294477874e-05, "loss": 0.026, "step": 24300 }, { "epoch": 7.254697286012526, "grad_norm": 16.95638656616211, "learning_rate": 3.9442883274568544e-05, "loss": 0.1054, "step": 24325 }, { "epoch": 7.262153295556218, "grad_norm": 0.016385719180107117, "learning_rate": 3.9426314254659206e-05, "loss": 0.0445, "step": 24350 }, { "epoch": 7.269609305099911, "grad_norm": 0.0067216139286756516, "learning_rate": 3.940974523474988e-05, "loss": 0.0768, "step": 24375 }, { "epoch": 7.277065314643603, "grad_norm": 0.004308663308620453, "learning_rate": 3.9393176214840544e-05, "loss": 0.0128, "step": 24400 }, { "epoch": 7.284521324187295, "grad_norm": 0.029721522703766823, "learning_rate": 3.9376607194931207e-05, "loss": 0.0407, "step": 24425 }, { "epoch": 7.291977333730987, "grad_norm": 0.32637977600097656, "learning_rate": 3.9360038175021876e-05, "loss": 0.0234, "step": 24450 }, { "epoch": 7.299433343274679, "grad_norm": 0.228067547082901, "learning_rate": 3.934346915511254e-05, "loss": 0.0528, "step": 24475 }, { "epoch": 7.306889352818372, "grad_norm": 0.0018244112143293023, "learning_rate": 3.932690013520321e-05, "loss": 0.0208, "step": 24500 }, { "epoch": 7.314345362362064, "grad_norm": 
0.01213071309030056, "learning_rate": 3.931033111529387e-05, "loss": 0.0363, "step": 24525 }, { "epoch": 7.321801371905756, "grad_norm": 4.35712194442749, "learning_rate": 3.929376209538453e-05, "loss": 0.0419, "step": 24550 }, { "epoch": 7.329257381449448, "grad_norm": 0.019649688154459, "learning_rate": 3.92771930754752e-05, "loss": 0.011, "step": 24575 }, { "epoch": 7.336713390993141, "grad_norm": 0.2837681770324707, "learning_rate": 3.926062405556586e-05, "loss": 0.0448, "step": 24600 }, { "epoch": 7.344169400536833, "grad_norm": 0.0030554018449038267, "learning_rate": 3.924405503565653e-05, "loss": 0.0038, "step": 24625 }, { "epoch": 7.3516254100805245, "grad_norm": 0.3149765431880951, "learning_rate": 3.92274860157472e-05, "loss": 0.0096, "step": 24650 }, { "epoch": 7.359081419624217, "grad_norm": 0.08300528675317764, "learning_rate": 3.9210916995837864e-05, "loss": 0.0606, "step": 24675 }, { "epoch": 7.366537429167909, "grad_norm": 0.7167775630950928, "learning_rate": 3.919434797592853e-05, "loss": 0.0071, "step": 24700 }, { "epoch": 7.373993438711602, "grad_norm": 0.042454127222299576, "learning_rate": 3.9177778956019195e-05, "loss": 0.1068, "step": 24725 }, { "epoch": 7.381449448255294, "grad_norm": 0.003952869679778814, "learning_rate": 3.916120993610986e-05, "loss": 0.0236, "step": 24750 }, { "epoch": 7.388905457798986, "grad_norm": 0.006623697001487017, "learning_rate": 3.914464091620053e-05, "loss": 0.0432, "step": 24775 }, { "epoch": 7.396361467342678, "grad_norm": 0.2648797631263733, "learning_rate": 3.912807189629119e-05, "loss": 0.1026, "step": 24800 }, { "epoch": 7.4038174768863705, "grad_norm": 0.038570746779441833, "learning_rate": 3.911150287638186e-05, "loss": 0.0174, "step": 24825 }, { "epoch": 7.411273486430063, "grad_norm": 0.11163607984781265, "learning_rate": 3.909493385647253e-05, "loss": 0.0581, "step": 24850 }, { "epoch": 7.418729495973755, "grad_norm": 18.279457092285156, "learning_rate": 3.907836483656319e-05, "loss": 0.0378, "step": 
24875 }, { "epoch": 7.426185505517447, "grad_norm": 14.651384353637695, "learning_rate": 3.906179581665386e-05, "loss": 0.0884, "step": 24900 }, { "epoch": 7.433641515061139, "grad_norm": 0.08544855564832687, "learning_rate": 3.904522679674452e-05, "loss": 0.043, "step": 24925 }, { "epoch": 7.441097524604832, "grad_norm": 0.5073758363723755, "learning_rate": 3.9028657776835183e-05, "loss": 0.0312, "step": 24950 }, { "epoch": 7.448553534148524, "grad_norm": 0.7192637920379639, "learning_rate": 3.901208875692585e-05, "loss": 0.0272, "step": 24975 }, { "epoch": 7.456009543692216, "grad_norm": 7.956757068634033, "learning_rate": 3.8995519737016515e-05, "loss": 0.0965, "step": 25000 }, { "epoch": 7.463465553235908, "grad_norm": 0.006529694423079491, "learning_rate": 3.8978950717107184e-05, "loss": 0.0162, "step": 25025 }, { "epoch": 7.4709215627796, "grad_norm": 0.005817771423608065, "learning_rate": 3.8962381697197846e-05, "loss": 0.0576, "step": 25050 }, { "epoch": 7.478377572323293, "grad_norm": 0.005918608978390694, "learning_rate": 3.8945812677288516e-05, "loss": 0.0196, "step": 25075 }, { "epoch": 7.485833581866984, "grad_norm": 0.0037487195804715157, "learning_rate": 3.8929243657379185e-05, "loss": 0.0313, "step": 25100 }, { "epoch": 7.493289591410677, "grad_norm": 0.044559087604284286, "learning_rate": 3.891267463746985e-05, "loss": 0.0353, "step": 25125 }, { "epoch": 7.500745600954369, "grad_norm": 0.0025634621270000935, "learning_rate": 3.8896105617560516e-05, "loss": 0.0253, "step": 25150 }, { "epoch": 7.508201610498062, "grad_norm": 18.160240173339844, "learning_rate": 3.887953659765118e-05, "loss": 0.0418, "step": 25175 }, { "epoch": 7.515657620041754, "grad_norm": 0.011235961690545082, "learning_rate": 3.886296757774184e-05, "loss": 0.0758, "step": 25200 }, { "epoch": 7.5231136295854455, "grad_norm": 6.458895683288574, "learning_rate": 3.884639855783251e-05, "loss": 0.0425, "step": 25225 }, { "epoch": 7.530569639129138, "grad_norm": 0.8765074610710144, 
"learning_rate": 3.882982953792317e-05, "loss": 0.0208, "step": 25250 }, { "epoch": 7.53802564867283, "grad_norm": 12.993375778198242, "learning_rate": 3.881326051801384e-05, "loss": 0.0333, "step": 25275 }, { "epoch": 7.545481658216523, "grad_norm": 0.10430316627025604, "learning_rate": 3.8796691498104504e-05, "loss": 0.2056, "step": 25300 }, { "epoch": 7.552937667760215, "grad_norm": 0.07876244187355042, "learning_rate": 3.878012247819517e-05, "loss": 0.0489, "step": 25325 }, { "epoch": 7.560393677303907, "grad_norm": 16.660846710205078, "learning_rate": 3.876355345828584e-05, "loss": 0.2723, "step": 25350 }, { "epoch": 7.567849686847599, "grad_norm": 0.11059949547052383, "learning_rate": 3.8746984438376504e-05, "loss": 0.0338, "step": 25375 }, { "epoch": 7.5753056963912915, "grad_norm": 1.6685644388198853, "learning_rate": 3.873041541846717e-05, "loss": 0.0309, "step": 25400 }, { "epoch": 7.582761705934984, "grad_norm": 0.3328234851360321, "learning_rate": 3.8713846398557836e-05, "loss": 0.0713, "step": 25425 }, { "epoch": 7.590217715478676, "grad_norm": 10.004615783691406, "learning_rate": 3.86972773786485e-05, "loss": 0.0459, "step": 25450 }, { "epoch": 7.597673725022368, "grad_norm": 8.179096221923828, "learning_rate": 3.868070835873917e-05, "loss": 0.02, "step": 25475 }, { "epoch": 7.60512973456606, "grad_norm": 0.036157842725515366, "learning_rate": 3.866413933882983e-05, "loss": 0.0704, "step": 25500 }, { "epoch": 7.612585744109753, "grad_norm": 0.1007198840379715, "learning_rate": 3.864757031892049e-05, "loss": 0.019, "step": 25525 }, { "epoch": 7.620041753653445, "grad_norm": 0.057942017912864685, "learning_rate": 3.863100129901116e-05, "loss": 0.0588, "step": 25550 }, { "epoch": 7.627497763197137, "grad_norm": 1.3576432466506958, "learning_rate": 3.861443227910183e-05, "loss": 0.0124, "step": 25575 }, { "epoch": 7.634953772740829, "grad_norm": 0.00611503841355443, "learning_rate": 3.85978632591925e-05, "loss": 0.0646, "step": 25600 }, { "epoch": 
7.642409782284521, "grad_norm": 0.041820377111434937, "learning_rate": 3.858129423928316e-05, "loss": 0.0667, "step": 25625 }, { "epoch": 7.649865791828214, "grad_norm": 4.039573669433594, "learning_rate": 3.8564725219373824e-05, "loss": 0.0215, "step": 25650 }, { "epoch": 7.657321801371905, "grad_norm": 7.467697620391846, "learning_rate": 3.854815619946449e-05, "loss": 0.0054, "step": 25675 }, { "epoch": 7.664777810915598, "grad_norm": 0.01705421693623066, "learning_rate": 3.8531587179555155e-05, "loss": 0.0225, "step": 25700 }, { "epoch": 7.67223382045929, "grad_norm": 0.049625929445028305, "learning_rate": 3.8515018159645825e-05, "loss": 0.0338, "step": 25725 }, { "epoch": 7.6796898300029826, "grad_norm": 0.2381235659122467, "learning_rate": 3.849844913973649e-05, "loss": 0.043, "step": 25750 }, { "epoch": 7.687145839546675, "grad_norm": 17.741899490356445, "learning_rate": 3.848188011982715e-05, "loss": 0.0511, "step": 25775 }, { "epoch": 7.6946018490903665, "grad_norm": 0.02506117708981037, "learning_rate": 3.846531109991782e-05, "loss": 0.0804, "step": 25800 }, { "epoch": 7.702057858634059, "grad_norm": 0.018649157136678696, "learning_rate": 3.844874208000849e-05, "loss": 0.0479, "step": 25825 }, { "epoch": 7.709513868177751, "grad_norm": 0.1378479301929474, "learning_rate": 3.843217306009915e-05, "loss": 0.0695, "step": 25850 }, { "epoch": 7.716969877721444, "grad_norm": 0.040336690843105316, "learning_rate": 3.841560404018982e-05, "loss": 0.0016, "step": 25875 }, { "epoch": 7.724425887265136, "grad_norm": 0.6665530800819397, "learning_rate": 3.839903502028048e-05, "loss": 0.0375, "step": 25900 }, { "epoch": 7.731881896808828, "grad_norm": 20.52546501159668, "learning_rate": 3.838246600037115e-05, "loss": 0.0571, "step": 25925 }, { "epoch": 7.73933790635252, "grad_norm": 2.0904996395111084, "learning_rate": 3.836589698046181e-05, "loss": 0.0273, "step": 25950 }, { "epoch": 7.7467939158962125, "grad_norm": 10.46048355102539, "learning_rate": 
3.8349327960552475e-05, "loss": 0.089, "step": 25975 }, { "epoch": 7.754249925439905, "grad_norm": 0.045314982533454895, "learning_rate": 3.8332758940643144e-05, "loss": 0.0376, "step": 26000 }, { "epoch": 7.761705934983596, "grad_norm": 11.569523811340332, "learning_rate": 3.8316189920733807e-05, "loss": 0.07, "step": 26025 }, { "epoch": 7.769161944527289, "grad_norm": 0.2440641224384308, "learning_rate": 3.8299620900824476e-05, "loss": 0.0491, "step": 26050 }, { "epoch": 7.776617954070981, "grad_norm": 5.467372894287109, "learning_rate": 3.8283051880915145e-05, "loss": 0.0103, "step": 26075 }, { "epoch": 7.784073963614674, "grad_norm": 0.07725433260202408, "learning_rate": 3.826648286100581e-05, "loss": 0.0621, "step": 26100 }, { "epoch": 7.791529973158365, "grad_norm": 0.26000112295150757, "learning_rate": 3.8249913841096476e-05, "loss": 0.0456, "step": 26125 }, { "epoch": 7.798985982702058, "grad_norm": 0.05612451583147049, "learning_rate": 3.823334482118714e-05, "loss": 0.0386, "step": 26150 }, { "epoch": 7.80644199224575, "grad_norm": 0.3714951276779175, "learning_rate": 3.821677580127781e-05, "loss": 0.0403, "step": 26175 }, { "epoch": 7.813898001789442, "grad_norm": 0.0035816803574562073, "learning_rate": 3.820020678136847e-05, "loss": 0.0083, "step": 26200 }, { "epoch": 7.821354011333135, "grad_norm": 10.295557975769043, "learning_rate": 3.818363776145913e-05, "loss": 0.0669, "step": 26225 }, { "epoch": 7.828810020876826, "grad_norm": 0.02752000279724598, "learning_rate": 3.81670687415498e-05, "loss": 0.0259, "step": 26250 }, { "epoch": 7.836266030420519, "grad_norm": 0.008338281884789467, "learning_rate": 3.8150499721640464e-05, "loss": 0.0456, "step": 26275 }, { "epoch": 7.843722039964211, "grad_norm": 0.04406023770570755, "learning_rate": 3.813393070173113e-05, "loss": 0.0075, "step": 26300 }, { "epoch": 7.8511780495079035, "grad_norm": 3.312938928604126, "learning_rate": 3.81173616818218e-05, "loss": 0.0602, "step": 26325 }, { "epoch": 
7.858634059051596, "grad_norm": 14.139311790466309, "learning_rate": 3.8100792661912464e-05, "loss": 0.0345, "step": 26350 }, { "epoch": 7.8660900685952875, "grad_norm": 0.5736969113349915, "learning_rate": 3.8084223642003134e-05, "loss": 0.0441, "step": 26375 }, { "epoch": 7.87354607813898, "grad_norm": 1.0199308395385742, "learning_rate": 3.8067654622093796e-05, "loss": 0.096, "step": 26400 }, { "epoch": 7.881002087682672, "grad_norm": 36.68028259277344, "learning_rate": 3.805108560218446e-05, "loss": 0.0979, "step": 26425 }, { "epoch": 7.888458097226365, "grad_norm": 0.0884442925453186, "learning_rate": 3.8035179343071505e-05, "loss": 0.0646, "step": 26450 }, { "epoch": 7.895914106770057, "grad_norm": 4.98067045211792, "learning_rate": 3.801861032316217e-05, "loss": 0.0605, "step": 26475 }, { "epoch": 7.903370116313749, "grad_norm": 0.042504098266363144, "learning_rate": 3.800204130325283e-05, "loss": 0.0354, "step": 26500 }, { "epoch": 7.910826125857441, "grad_norm": 0.008564049378037453, "learning_rate": 3.79854722833435e-05, "loss": 0.0323, "step": 26525 }, { "epoch": 7.9182821354011335, "grad_norm": 0.30509287118911743, "learning_rate": 3.796890326343416e-05, "loss": 0.0237, "step": 26550 }, { "epoch": 7.925738144944826, "grad_norm": 11.284287452697754, "learning_rate": 3.795233424352483e-05, "loss": 0.0339, "step": 26575 }, { "epoch": 7.933194154488517, "grad_norm": 0.008676270954310894, "learning_rate": 3.793576522361549e-05, "loss": 0.04, "step": 26600 }, { "epoch": 7.94065016403221, "grad_norm": 1.3062283992767334, "learning_rate": 3.7919196203706155e-05, "loss": 0.0101, "step": 26625 }, { "epoch": 7.948106173575902, "grad_norm": 0.21713188290596008, "learning_rate": 3.7902627183796825e-05, "loss": 0.1142, "step": 26650 }, { "epoch": 7.955562183119595, "grad_norm": 0.06104138121008873, "learning_rate": 3.788605816388749e-05, "loss": 0.0221, "step": 26675 }, { "epoch": 7.963018192663286, "grad_norm": 0.0560293085873127, "learning_rate": 
3.786948914397816e-05, "loss": 0.0229, "step": 26700 }, { "epoch": 7.9704742022069786, "grad_norm": 0.013059995137155056, "learning_rate": 3.7852920124068825e-05, "loss": 0.0233, "step": 26725 }, { "epoch": 7.977930211750671, "grad_norm": 13.456666946411133, "learning_rate": 3.783635110415949e-05, "loss": 0.0191, "step": 26750 }, { "epoch": 7.985386221294363, "grad_norm": 11.048165321350098, "learning_rate": 3.7819782084250157e-05, "loss": 0.0974, "step": 26775 }, { "epoch": 7.992842230838056, "grad_norm": 0.036631595343351364, "learning_rate": 3.780321306434082e-05, "loss": 0.0245, "step": 26800 }, { "epoch": 8.0, "eval_gen_len": 8.6917, "eval_loss": 0.10363117605447769, "eval_rouge1": 95.9929, "eval_rouge2": 82.7618, "eval_rougeL": 95.9431, "eval_rougeLsum": 95.9431, "eval_runtime": 96.085, "eval_samples_per_second": 17.453, "eval_steps_per_second": 4.371, "step": 26824 }, { "epoch": 8.000298240381747, "grad_norm": 1.841543197631836, "learning_rate": 3.778664404443149e-05, "loss": 0.0217, "step": 26825 }, { "epoch": 8.00775424992544, "grad_norm": 0.07992962747812271, "learning_rate": 3.777007502452215e-05, "loss": 0.0396, "step": 26850 }, { "epoch": 8.015210259469132, "grad_norm": 0.06587328016757965, "learning_rate": 3.775350600461281e-05, "loss": 0.0426, "step": 26875 }, { "epoch": 8.022666269012824, "grad_norm": 0.03052508272230625, "learning_rate": 3.773693698470348e-05, "loss": 0.0516, "step": 26900 }, { "epoch": 8.030122278556517, "grad_norm": 1.1213061809539795, "learning_rate": 3.772036796479415e-05, "loss": 0.0276, "step": 26925 }, { "epoch": 8.037578288100208, "grad_norm": 5.817966938018799, "learning_rate": 3.770379894488481e-05, "loss": 0.0083, "step": 26950 }, { "epoch": 8.045034297643902, "grad_norm": 0.009192215278744698, "learning_rate": 3.768722992497548e-05, "loss": 0.0139, "step": 26975 }, { "epoch": 8.052490307187593, "grad_norm": 0.2627590000629425, "learning_rate": 3.7670660905066145e-05, "loss": 0.0077, "step": 27000 }, { "epoch": 
8.059946316731285, "grad_norm": 11.902652740478516, "learning_rate": 3.7654091885156814e-05, "loss": 0.0139, "step": 27025 }, { "epoch": 8.067402326274978, "grad_norm": 0.4330286979675293, "learning_rate": 3.7637522865247476e-05, "loss": 0.0087, "step": 27050 }, { "epoch": 8.07485833581867, "grad_norm": 10.134819984436035, "learning_rate": 3.762095384533814e-05, "loss": 0.0643, "step": 27075 }, { "epoch": 8.082314345362363, "grad_norm": 0.8864652514457703, "learning_rate": 3.760438482542881e-05, "loss": 0.0159, "step": 27100 }, { "epoch": 8.089770354906054, "grad_norm": 0.05642193183302879, "learning_rate": 3.758781580551947e-05, "loss": 0.0017, "step": 27125 }, { "epoch": 8.097226364449746, "grad_norm": 0.07227706164121628, "learning_rate": 3.757124678561014e-05, "loss": 0.0028, "step": 27150 }, { "epoch": 8.10468237399344, "grad_norm": 0.06436634063720703, "learning_rate": 3.755467776570081e-05, "loss": 0.0141, "step": 27175 }, { "epoch": 8.11213838353713, "grad_norm": 0.09210552275180817, "learning_rate": 3.753810874579147e-05, "loss": 0.0365, "step": 27200 }, { "epoch": 8.119594393080822, "grad_norm": 0.0035904215183109045, "learning_rate": 3.752153972588214e-05, "loss": 0.0304, "step": 27225 }, { "epoch": 8.127050402624516, "grad_norm": 67.14472198486328, "learning_rate": 3.75049707059728e-05, "loss": 0.0539, "step": 27250 }, { "epoch": 8.134506412168207, "grad_norm": 0.2653847336769104, "learning_rate": 3.748840168606347e-05, "loss": 0.0454, "step": 27275 }, { "epoch": 8.1419624217119, "grad_norm": 0.029458891600370407, "learning_rate": 3.7471832666154134e-05, "loss": 0.0262, "step": 27300 }, { "epoch": 8.149418431255592, "grad_norm": 0.08708936721086502, "learning_rate": 3.7455263646244796e-05, "loss": 0.0798, "step": 27325 }, { "epoch": 8.156874440799283, "grad_norm": 0.065219946205616, "learning_rate": 3.7438694626335465e-05, "loss": 0.022, "step": 27350 }, { "epoch": 8.164330450342977, "grad_norm": 0.05592311546206474, "learning_rate": 
3.742212560642613e-05, "loss": 0.039, "step": 27375 }, { "epoch": 8.171786459886668, "grad_norm": 5.338498592376709, "learning_rate": 3.7405556586516796e-05, "loss": 0.0417, "step": 27400 }, { "epoch": 8.179242469430362, "grad_norm": 0.0057130069471895695, "learning_rate": 3.7388987566607466e-05, "loss": 0.0232, "step": 27425 }, { "epoch": 8.186698478974053, "grad_norm": 14.00313949584961, "learning_rate": 3.737241854669813e-05, "loss": 0.0464, "step": 27450 }, { "epoch": 8.194154488517745, "grad_norm": 0.15262223780155182, "learning_rate": 3.735651228758517e-05, "loss": 0.0663, "step": 27475 }, { "epoch": 8.201610498061438, "grad_norm": 0.01996547356247902, "learning_rate": 3.733994326767583e-05, "loss": 0.0176, "step": 27500 }, { "epoch": 8.20906650760513, "grad_norm": 0.022581912577152252, "learning_rate": 3.732337424776649e-05, "loss": 0.0375, "step": 27525 }, { "epoch": 8.216522517148823, "grad_norm": 0.019385678693652153, "learning_rate": 3.730680522785716e-05, "loss": 0.0203, "step": 27550 }, { "epoch": 8.223978526692514, "grad_norm": 0.02384263090789318, "learning_rate": 3.729023620794783e-05, "loss": 0.0365, "step": 27575 }, { "epoch": 8.231434536236206, "grad_norm": 0.004445483908057213, "learning_rate": 3.7273667188038494e-05, "loss": 0.0422, "step": 27600 }, { "epoch": 8.238890545779899, "grad_norm": 0.005683319177478552, "learning_rate": 3.725709816812916e-05, "loss": 0.0154, "step": 27625 }, { "epoch": 8.24634655532359, "grad_norm": 0.013689364306628704, "learning_rate": 3.7240529148219825e-05, "loss": 0.044, "step": 27650 }, { "epoch": 8.253802564867282, "grad_norm": 0.01799396611750126, "learning_rate": 3.7223960128310494e-05, "loss": 0.0294, "step": 27675 }, { "epoch": 8.261258574410975, "grad_norm": 0.0023462409153580666, "learning_rate": 3.7207391108401157e-05, "loss": 0.0425, "step": 27700 }, { "epoch": 8.268714583954667, "grad_norm": 0.02442399598658085, "learning_rate": 3.7190822088491826e-05, "loss": 0.003, "step": 27725 }, { "epoch": 
8.27617059349836, "grad_norm": 0.022840287536382675, "learning_rate": 3.717425306858249e-05, "loss": 0.0277, "step": 27750 }, { "epoch": 8.283626603042052, "grad_norm": 0.3185962736606598, "learning_rate": 3.715768404867315e-05, "loss": 0.0601, "step": 27775 }, { "epoch": 8.291082612585743, "grad_norm": 0.03330976143479347, "learning_rate": 3.714111502876382e-05, "loss": 0.0329, "step": 27800 }, { "epoch": 8.298538622129437, "grad_norm": 0.036625757813453674, "learning_rate": 3.712454600885449e-05, "loss": 0.0315, "step": 27825 }, { "epoch": 8.305994631673128, "grad_norm": 0.5965529084205627, "learning_rate": 3.710797698894515e-05, "loss": 0.0215, "step": 27850 }, { "epoch": 8.313450641216821, "grad_norm": 0.04731619358062744, "learning_rate": 3.709140796903582e-05, "loss": 0.0041, "step": 27875 }, { "epoch": 8.320906650760513, "grad_norm": 0.20525537431240082, "learning_rate": 3.707483894912648e-05, "loss": 0.0223, "step": 27900 }, { "epoch": 8.328362660304204, "grad_norm": 4.396222114562988, "learning_rate": 3.705826992921715e-05, "loss": 0.0133, "step": 27925 }, { "epoch": 8.335818669847898, "grad_norm": 0.1394113004207611, "learning_rate": 3.7041700909307814e-05, "loss": 0.0265, "step": 27950 }, { "epoch": 8.34327467939159, "grad_norm": 81.92708587646484, "learning_rate": 3.7025131889398476e-05, "loss": 0.0188, "step": 27975 }, { "epoch": 8.350730688935283, "grad_norm": 0.15940868854522705, "learning_rate": 3.7008562869489145e-05, "loss": 0.0396, "step": 28000 }, { "epoch": 8.358186698478974, "grad_norm": 0.8631670475006104, "learning_rate": 3.699199384957981e-05, "loss": 0.0064, "step": 28025 }, { "epoch": 8.365642708022666, "grad_norm": 0.06859345734119415, "learning_rate": 3.697542482967048e-05, "loss": 0.0251, "step": 28050 }, { "epoch": 8.373098717566359, "grad_norm": 0.007882620207965374, "learning_rate": 3.6958855809761146e-05, "loss": 0.0147, "step": 28075 }, { "epoch": 8.38055472711005, "grad_norm": 0.0038322643376886845, "learning_rate": 
3.694228678985181e-05, "loss": 0.1021, "step": 28100 }, { "epoch": 8.388010736653744, "grad_norm": 1.0349613428115845, "learning_rate": 3.692571776994248e-05, "loss": 0.071, "step": 28125 }, { "epoch": 8.395466746197435, "grad_norm": 6.251139163970947, "learning_rate": 3.690914875003314e-05, "loss": 0.0183, "step": 28150 }, { "epoch": 8.402922755741127, "grad_norm": 31.23532485961914, "learning_rate": 3.68925797301238e-05, "loss": 0.0176, "step": 28175 }, { "epoch": 8.41037876528482, "grad_norm": 0.018022626638412476, "learning_rate": 3.687601071021447e-05, "loss": 0.0045, "step": 28200 }, { "epoch": 8.417834774828512, "grad_norm": 0.022061647847294807, "learning_rate": 3.6859441690305134e-05, "loss": 0.0017, "step": 28225 }, { "epoch": 8.425290784372205, "grad_norm": 7.409425258636475, "learning_rate": 3.68428726703958e-05, "loss": 0.0454, "step": 28250 }, { "epoch": 8.432746793915896, "grad_norm": 0.879426896572113, "learning_rate": 3.6826303650486465e-05, "loss": 0.002, "step": 28275 }, { "epoch": 8.440202803459588, "grad_norm": 0.8661222457885742, "learning_rate": 3.6809734630577134e-05, "loss": 0.0513, "step": 28300 }, { "epoch": 8.447658813003281, "grad_norm": 0.04469098895788193, "learning_rate": 3.67931656106678e-05, "loss": 0.0269, "step": 28325 }, { "epoch": 8.455114822546973, "grad_norm": 0.0013134300243109465, "learning_rate": 3.6776596590758466e-05, "loss": 0.0306, "step": 28350 }, { "epoch": 8.462570832090664, "grad_norm": 0.17150649428367615, "learning_rate": 3.6760027570849135e-05, "loss": 0.0687, "step": 28375 }, { "epoch": 8.470026841634358, "grad_norm": 0.03961332514882088, "learning_rate": 3.67434585509398e-05, "loss": 0.007, "step": 28400 }, { "epoch": 8.47748285117805, "grad_norm": 0.6865983009338379, "learning_rate": 3.672688953103046e-05, "loss": 0.0586, "step": 28425 }, { "epoch": 8.484938860721742, "grad_norm": 0.21008718013763428, "learning_rate": 3.671032051112113e-05, "loss": 0.0705, "step": 28450 }, { "epoch": 8.492394870265434, 
"grad_norm": 12.806836128234863, "learning_rate": 3.669375149121179e-05, "loss": 0.0316, "step": 28475 }, { "epoch": 8.499850879809125, "grad_norm": 0.010018163360655308, "learning_rate": 3.667718247130246e-05, "loss": 0.0202, "step": 28500 }, { "epoch": 8.507306889352819, "grad_norm": 0.01900198683142662, "learning_rate": 3.666061345139313e-05, "loss": 0.0562, "step": 28525 }, { "epoch": 8.51476289889651, "grad_norm": 0.003071998944506049, "learning_rate": 3.664404443148379e-05, "loss": 0.0144, "step": 28550 }, { "epoch": 8.522218908440204, "grad_norm": 0.02706441655755043, "learning_rate": 3.662747541157446e-05, "loss": 0.0166, "step": 28575 }, { "epoch": 8.529674917983895, "grad_norm": 9.968490600585938, "learning_rate": 3.661090639166512e-05, "loss": 0.0216, "step": 28600 }, { "epoch": 8.537130927527587, "grad_norm": 0.0901758223772049, "learning_rate": 3.6594337371755785e-05, "loss": 0.0452, "step": 28625 }, { "epoch": 8.54458693707128, "grad_norm": 0.07754506915807724, "learning_rate": 3.6577768351846454e-05, "loss": 0.0099, "step": 28650 }, { "epoch": 8.552042946614971, "grad_norm": 0.004118072800338268, "learning_rate": 3.656119933193712e-05, "loss": 0.022, "step": 28675 }, { "epoch": 8.559498956158663, "grad_norm": 0.017649231478571892, "learning_rate": 3.6544630312027786e-05, "loss": 0.0456, "step": 28700 }, { "epoch": 8.566954965702356, "grad_norm": 0.0039413124322891235, "learning_rate": 3.652806129211845e-05, "loss": 0.0079, "step": 28725 }, { "epoch": 8.574410975246048, "grad_norm": 0.003399114590138197, "learning_rate": 3.651149227220911e-05, "loss": 0.0245, "step": 28750 }, { "epoch": 8.581866984789741, "grad_norm": 44.0549430847168, "learning_rate": 3.6494923252299786e-05, "loss": 0.0432, "step": 28775 }, { "epoch": 8.589322994333433, "grad_norm": 0.0016269719926640391, "learning_rate": 3.647835423239045e-05, "loss": 0.0119, "step": 28800 }, { "epoch": 8.596779003877124, "grad_norm": 0.00740943755954504, "learning_rate": 3.646178521248111e-05, 
"loss": 0.0185, "step": 28825 }, { "epoch": 8.604235013420817, "grad_norm": 0.24125465750694275, "learning_rate": 3.644521619257178e-05, "loss": 0.0082, "step": 28850 }, { "epoch": 8.611691022964509, "grad_norm": 2.92952299118042, "learning_rate": 3.642864717266244e-05, "loss": 0.0682, "step": 28875 }, { "epoch": 8.619147032508202, "grad_norm": 10.032451629638672, "learning_rate": 3.641207815275311e-05, "loss": 0.0066, "step": 28900 }, { "epoch": 8.626603042051894, "grad_norm": 0.9171352982521057, "learning_rate": 3.6395509132843774e-05, "loss": 0.0286, "step": 28925 }, { "epoch": 8.634059051595585, "grad_norm": 0.013421298936009407, "learning_rate": 3.637894011293444e-05, "loss": 0.0504, "step": 28950 }, { "epoch": 8.641515061139279, "grad_norm": 0.06524740159511566, "learning_rate": 3.6362371093025105e-05, "loss": 0.0185, "step": 28975 }, { "epoch": 8.64897107068297, "grad_norm": 0.033882249146699905, "learning_rate": 3.6345802073115775e-05, "loss": 0.0157, "step": 29000 }, { "epoch": 8.656427080226663, "grad_norm": 0.06428802013397217, "learning_rate": 3.6329233053206444e-05, "loss": 0.0338, "step": 29025 }, { "epoch": 8.663883089770355, "grad_norm": 0.01688966527581215, "learning_rate": 3.6312664033297106e-05, "loss": 0.0063, "step": 29050 }, { "epoch": 8.671339099314046, "grad_norm": 23.71369743347168, "learning_rate": 3.629609501338777e-05, "loss": 0.031, "step": 29075 }, { "epoch": 8.67879510885774, "grad_norm": 0.0019492580322548747, "learning_rate": 3.627952599347844e-05, "loss": 0.0749, "step": 29100 }, { "epoch": 8.686251118401431, "grad_norm": 15.162439346313477, "learning_rate": 3.62629569735691e-05, "loss": 0.0487, "step": 29125 }, { "epoch": 8.693707127945125, "grad_norm": 0.5042064189910889, "learning_rate": 3.624638795365977e-05, "loss": 0.023, "step": 29150 }, { "epoch": 8.701163137488816, "grad_norm": 3.6821882724761963, "learning_rate": 3.622981893375043e-05, "loss": 0.0714, "step": 29175 }, { "epoch": 8.708619147032508, "grad_norm": 
6.568551540374756, "learning_rate": 3.6213249913841094e-05, "loss": 0.0341, "step": 29200 }, { "epoch": 8.716075156576201, "grad_norm": 0.3266686201095581, "learning_rate": 3.619668089393176e-05, "loss": 0.0676, "step": 29225 }, { "epoch": 8.723531166119892, "grad_norm": 0.014193633571267128, "learning_rate": 3.618011187402243e-05, "loss": 0.0166, "step": 29250 }, { "epoch": 8.730987175663586, "grad_norm": 0.0672958567738533, "learning_rate": 3.6163542854113094e-05, "loss": 0.0032, "step": 29275 }, { "epoch": 8.738443185207277, "grad_norm": 0.007883368991315365, "learning_rate": 3.614697383420376e-05, "loss": 0.0449, "step": 29300 }, { "epoch": 8.745899194750969, "grad_norm": 20.07937240600586, "learning_rate": 3.6130404814294426e-05, "loss": 0.0445, "step": 29325 }, { "epoch": 8.753355204294662, "grad_norm": 5.050912857055664, "learning_rate": 3.6113835794385095e-05, "loss": 0.0058, "step": 29350 }, { "epoch": 8.760811213838354, "grad_norm": 0.019583450630307198, "learning_rate": 3.609726677447576e-05, "loss": 0.0092, "step": 29375 }, { "epoch": 8.768267223382045, "grad_norm": 6.366359710693359, "learning_rate": 3.608069775456642e-05, "loss": 0.0361, "step": 29400 }, { "epoch": 8.775723232925738, "grad_norm": 0.012687691487371922, "learning_rate": 3.606412873465709e-05, "loss": 0.0471, "step": 29425 }, { "epoch": 8.78317924246943, "grad_norm": 0.04783850535750389, "learning_rate": 3.604755971474775e-05, "loss": 0.0375, "step": 29450 }, { "epoch": 8.790635252013123, "grad_norm": 0.058531004935503006, "learning_rate": 3.603099069483842e-05, "loss": 0.0109, "step": 29475 }, { "epoch": 8.798091261556815, "grad_norm": 0.006072982680052519, "learning_rate": 3.601442167492909e-05, "loss": 0.0275, "step": 29500 }, { "epoch": 8.805547271100506, "grad_norm": 0.03280794247984886, "learning_rate": 3.599785265501975e-05, "loss": 0.0142, "step": 29525 }, { "epoch": 8.8130032806442, "grad_norm": 0.36898598074913025, "learning_rate": 3.598128363511042e-05, "loss": 0.0063, "step": 
29550 }, { "epoch": 8.820459290187891, "grad_norm": 2.096160888671875, "learning_rate": 3.596471461520108e-05, "loss": 0.012, "step": 29575 }, { "epoch": 8.827915299731584, "grad_norm": 0.62769615650177, "learning_rate": 3.594814559529175e-05, "loss": 0.0109, "step": 29600 }, { "epoch": 8.835371309275276, "grad_norm": 9.831634521484375, "learning_rate": 3.5931576575382414e-05, "loss": 0.0797, "step": 29625 }, { "epoch": 8.842827318818967, "grad_norm": 9.49618911743164, "learning_rate": 3.591500755547308e-05, "loss": 0.0206, "step": 29650 }, { "epoch": 8.85028332836266, "grad_norm": 73.54988098144531, "learning_rate": 3.5898438535563746e-05, "loss": 0.0308, "step": 29675 }, { "epoch": 8.857739337906352, "grad_norm": 0.0006858339766040444, "learning_rate": 3.588186951565441e-05, "loss": 0.02, "step": 29700 }, { "epoch": 8.865195347450046, "grad_norm": 0.012461572885513306, "learning_rate": 3.586530049574508e-05, "loss": 0.0373, "step": 29725 }, { "epoch": 8.872651356993737, "grad_norm": 0.04995008185505867, "learning_rate": 3.5848731475835747e-05, "loss": 0.0527, "step": 29750 }, { "epoch": 8.880107366537429, "grad_norm": 2.493194103240967, "learning_rate": 3.583216245592641e-05, "loss": 0.0208, "step": 29775 }, { "epoch": 8.887563376081122, "grad_norm": 0.01812615804374218, "learning_rate": 3.581559343601708e-05, "loss": 0.0165, "step": 29800 }, { "epoch": 8.895019385624813, "grad_norm": 0.01689509116113186, "learning_rate": 3.579902441610774e-05, "loss": 0.0085, "step": 29825 }, { "epoch": 8.902475395168505, "grad_norm": 0.011237064376473427, "learning_rate": 3.57824553961984e-05, "loss": 0.0257, "step": 29850 }, { "epoch": 8.909931404712198, "grad_norm": 0.009161061607301235, "learning_rate": 3.576588637628907e-05, "loss": 0.0231, "step": 29875 }, { "epoch": 8.91738741425589, "grad_norm": 0.9806844592094421, "learning_rate": 3.5749317356379734e-05, "loss": 0.0841, "step": 29900 }, { "epoch": 8.924843423799583, "grad_norm": 0.01535357441753149, "learning_rate": 
3.57327483364704e-05, "loss": 0.0207, "step": 29925 }, { "epoch": 8.932299433343275, "grad_norm": 0.054871998727321625, "learning_rate": 3.5716179316561066e-05, "loss": 0.0079, "step": 29950 }, { "epoch": 8.939755442886966, "grad_norm": 0.011252072639763355, "learning_rate": 3.5699610296651735e-05, "loss": 0.05, "step": 29975 }, { "epoch": 8.94721145243066, "grad_norm": 0.03132103383541107, "learning_rate": 3.5683041276742404e-05, "loss": 0.0051, "step": 30000 }, { "epoch": 8.954667461974351, "grad_norm": 0.07881677895784378, "learning_rate": 3.5666472256833066e-05, "loss": 0.0274, "step": 30025 }, { "epoch": 8.962123471518044, "grad_norm": 0.00470997067168355, "learning_rate": 3.564990323692373e-05, "loss": 0.029, "step": 30050 }, { "epoch": 8.969579481061736, "grad_norm": 4.142818450927734, "learning_rate": 3.56333342170144e-05, "loss": 0.0153, "step": 30075 }, { "epoch": 8.977035490605427, "grad_norm": 1.2706577777862549, "learning_rate": 3.561676519710506e-05, "loss": 0.0767, "step": 30100 }, { "epoch": 8.98449150014912, "grad_norm": 0.01267695240676403, "learning_rate": 3.560019617719573e-05, "loss": 0.0846, "step": 30125 }, { "epoch": 8.991947509692812, "grad_norm": 7.920314788818359, "learning_rate": 3.558362715728639e-05, "loss": 0.0442, "step": 30150 }, { "epoch": 8.999403519236505, "grad_norm": 34.99578094482422, "learning_rate": 3.556705813737706e-05, "loss": 0.0273, "step": 30175 }, { "epoch": 9.0, "eval_gen_len": 8.7102, "eval_loss": 0.083512082695961, "eval_rouge1": 97.0896, "eval_rouge2": 84.4122, "eval_rougeL": 97.0507, "eval_rougeLsum": 97.0591, "eval_runtime": 101.145, "eval_samples_per_second": 16.58, "eval_steps_per_second": 4.152, "step": 30177 }, { "epoch": 9.006859528780197, "grad_norm": 0.011261310428380966, "learning_rate": 3.555048911746773e-05, "loss": 0.0355, "step": 30200 }, { "epoch": 9.014315538323888, "grad_norm": 0.014239492826163769, "learning_rate": 3.553392009755839e-05, "loss": 0.0229, "step": 30225 }, { "epoch": 
9.021771547867582, "grad_norm": 0.02628404088318348, "learning_rate": 3.551735107764906e-05, "loss": 0.0127, "step": 30250 }, { "epoch": 9.029227557411273, "grad_norm": 0.0303230881690979, "learning_rate": 3.5500782057739723e-05, "loss": 0.0052, "step": 30275 }, { "epoch": 9.036683566954967, "grad_norm": 7.405888080596924, "learning_rate": 3.5484213037830386e-05, "loss": 0.0111, "step": 30300 }, { "epoch": 9.044139576498658, "grad_norm": 2.99228835105896, "learning_rate": 3.5467644017921055e-05, "loss": 0.034, "step": 30325 }, { "epoch": 9.05159558604235, "grad_norm": 0.05243349075317383, "learning_rate": 3.545107499801172e-05, "loss": 0.017, "step": 30350 }, { "epoch": 9.059051595586043, "grad_norm": 0.019075891003012657, "learning_rate": 3.5434505978102386e-05, "loss": 0.004, "step": 30375 }, { "epoch": 9.066507605129734, "grad_norm": 0.004510013852268457, "learning_rate": 3.541793695819305e-05, "loss": 0.01, "step": 30400 }, { "epoch": 9.073963614673426, "grad_norm": 0.47157201170921326, "learning_rate": 3.540136793828371e-05, "loss": 0.0433, "step": 30425 }, { "epoch": 9.08141962421712, "grad_norm": 0.05149286240339279, "learning_rate": 3.538479891837439e-05, "loss": 0.0823, "step": 30450 }, { "epoch": 9.08887563376081, "grad_norm": 9.489185333251953, "learning_rate": 3.536822989846505e-05, "loss": 0.013, "step": 30475 }, { "epoch": 9.096331643304504, "grad_norm": 32.13996505737305, "learning_rate": 3.535166087855571e-05, "loss": 0.0236, "step": 30500 }, { "epoch": 9.103787652848196, "grad_norm": 0.18573585152626038, "learning_rate": 3.533509185864638e-05, "loss": 0.0031, "step": 30525 }, { "epoch": 9.111243662391887, "grad_norm": 24.975109100341797, "learning_rate": 3.531852283873704e-05, "loss": 0.0096, "step": 30550 }, { "epoch": 9.11869967193558, "grad_norm": 0.7033310532569885, "learning_rate": 3.530195381882771e-05, "loss": 0.0272, "step": 30575 }, { "epoch": 9.126155681479272, "grad_norm": 0.0882677286863327, "learning_rate": 3.5285384798918375e-05, 
"loss": 0.0099, "step": 30600 }, { "epoch": 9.133611691022965, "grad_norm": 2.4805209636688232, "learning_rate": 3.526881577900904e-05, "loss": 0.0021, "step": 30625 }, { "epoch": 9.141067700566657, "grad_norm": 0.05557962879538536, "learning_rate": 3.5252246759099706e-05, "loss": 0.015, "step": 30650 }, { "epoch": 9.148523710110348, "grad_norm": 2.1198525428771973, "learning_rate": 3.5235677739190375e-05, "loss": 0.0088, "step": 30675 }, { "epoch": 9.155979719654042, "grad_norm": 87.86044311523438, "learning_rate": 3.5219108719281044e-05, "loss": 0.0189, "step": 30700 }, { "epoch": 9.163435729197733, "grad_norm": 0.35490408539772034, "learning_rate": 3.520253969937171e-05, "loss": 0.0026, "step": 30725 }, { "epoch": 9.170891738741426, "grad_norm": 0.006703643128275871, "learning_rate": 3.518597067946237e-05, "loss": 0.0296, "step": 30750 }, { "epoch": 9.178347748285118, "grad_norm": 0.014011339284479618, "learning_rate": 3.516940165955304e-05, "loss": 0.03, "step": 30775 }, { "epoch": 9.18580375782881, "grad_norm": 4.419519901275635, "learning_rate": 3.51528326396437e-05, "loss": 0.089, "step": 30800 }, { "epoch": 9.193259767372503, "grad_norm": 0.008075419813394547, "learning_rate": 3.513626361973437e-05, "loss": 0.005, "step": 30825 }, { "epoch": 9.200715776916194, "grad_norm": 0.0033533978275954723, "learning_rate": 3.511969459982503e-05, "loss": 0.0052, "step": 30850 }, { "epoch": 9.208171786459888, "grad_norm": 0.022313714027404785, "learning_rate": 3.5103125579915694e-05, "loss": 0.0247, "step": 30875 }, { "epoch": 9.215627796003579, "grad_norm": 0.16721826791763306, "learning_rate": 3.508655656000636e-05, "loss": 0.0271, "step": 30900 }, { "epoch": 9.22308380554727, "grad_norm": 0.0008401995291933417, "learning_rate": 3.506998754009703e-05, "loss": 0.0128, "step": 30925 }, { "epoch": 9.230539815090964, "grad_norm": 0.008809903636574745, "learning_rate": 3.5053418520187695e-05, "loss": 0.0095, "step": 30950 }, { "epoch": 9.237995824634655, "grad_norm": 
0.013414003886282444, "learning_rate": 3.5036849500278364e-05, "loss": 0.0012, "step": 30975 }, { "epoch": 9.245451834178347, "grad_norm": 0.009434329345822334, "learning_rate": 3.5020280480369026e-05, "loss": 0.0576, "step": 31000 }, { "epoch": 9.25290784372204, "grad_norm": 0.014045946300029755, "learning_rate": 3.5003711460459695e-05, "loss": 0.0188, "step": 31025 }, { "epoch": 9.260363853265732, "grad_norm": 0.044111546128988266, "learning_rate": 3.498714244055036e-05, "loss": 0.0047, "step": 31050 }, { "epoch": 9.267819862809425, "grad_norm": 0.03687797114253044, "learning_rate": 3.497057342064102e-05, "loss": 0.0169, "step": 31075 }, { "epoch": 9.275275872353117, "grad_norm": 0.012163372710347176, "learning_rate": 3.495400440073169e-05, "loss": 0.0256, "step": 31100 }, { "epoch": 9.282731881896808, "grad_norm": 0.020992450416088104, "learning_rate": 3.493743538082235e-05, "loss": 0.0166, "step": 31125 }, { "epoch": 9.290187891440501, "grad_norm": 0.00904077384620905, "learning_rate": 3.492086636091302e-05, "loss": 0.0555, "step": 31150 }, { "epoch": 9.297643900984193, "grad_norm": 0.13449884951114655, "learning_rate": 3.490429734100369e-05, "loss": 0.0331, "step": 31175 }, { "epoch": 9.305099910527886, "grad_norm": 0.013724715448915958, "learning_rate": 3.488772832109435e-05, "loss": 0.025, "step": 31200 }, { "epoch": 9.312555920071578, "grad_norm": 0.006786949001252651, "learning_rate": 3.487115930118502e-05, "loss": 0.0283, "step": 31225 }, { "epoch": 9.32001192961527, "grad_norm": 0.006112619303166866, "learning_rate": 3.4854590281275684e-05, "loss": 0.0127, "step": 31250 }, { "epoch": 9.327467939158963, "grad_norm": 0.018585694953799248, "learning_rate": 3.4838021261366346e-05, "loss": 0.0153, "step": 31275 }, { "epoch": 9.334923948702654, "grad_norm": 0.0026641006115823984, "learning_rate": 3.4821452241457015e-05, "loss": 0.0022, "step": 31300 }, { "epoch": 9.342379958246347, "grad_norm": 0.20816652476787567, "learning_rate": 3.480488322154768e-05, 
"loss": 0.009, "step": 31325 }, { "epoch": 9.349835967790039, "grad_norm": 0.0028136121109128, "learning_rate": 3.4788314201638347e-05, "loss": 0.0331, "step": 31350 }, { "epoch": 9.35729197733373, "grad_norm": 0.02150745317339897, "learning_rate": 3.477174518172901e-05, "loss": 0.0458, "step": 31375 }, { "epoch": 9.364747986877424, "grad_norm": 17.131425857543945, "learning_rate": 3.475517616181968e-05, "loss": 0.0879, "step": 31400 }, { "epoch": 9.372203996421115, "grad_norm": 0.10048453509807587, "learning_rate": 3.473860714191035e-05, "loss": 0.0168, "step": 31425 }, { "epoch": 9.379660005964809, "grad_norm": 0.005317870993167162, "learning_rate": 3.472203812200101e-05, "loss": 0.0402, "step": 31450 }, { "epoch": 9.3871160155085, "grad_norm": 0.11025191098451614, "learning_rate": 3.470546910209168e-05, "loss": 0.0188, "step": 31475 }, { "epoch": 9.394572025052192, "grad_norm": 0.024496447294950485, "learning_rate": 3.468890008218234e-05, "loss": 0.0118, "step": 31500 }, { "epoch": 9.402028034595885, "grad_norm": 6.208194732666016, "learning_rate": 3.4672331062273e-05, "loss": 0.0385, "step": 31525 }, { "epoch": 9.409484044139576, "grad_norm": 0.18992580473423004, "learning_rate": 3.465576204236367e-05, "loss": 0.0086, "step": 31550 }, { "epoch": 9.416940053683268, "grad_norm": 24.698410034179688, "learning_rate": 3.4639193022454335e-05, "loss": 0.0273, "step": 31575 }, { "epoch": 9.424396063226961, "grad_norm": 3.067220687866211, "learning_rate": 3.4622624002545004e-05, "loss": 0.0416, "step": 31600 }, { "epoch": 9.431852072770653, "grad_norm": 0.010118064470589161, "learning_rate": 3.4606054982635666e-05, "loss": 0.0051, "step": 31625 }, { "epoch": 9.439308082314346, "grad_norm": 0.014647743664681911, "learning_rate": 3.4589485962726335e-05, "loss": 0.0412, "step": 31650 }, { "epoch": 9.446764091858038, "grad_norm": 0.017802445217967033, "learning_rate": 3.4572916942817004e-05, "loss": 0.0313, "step": 31675 }, { "epoch": 9.454220101401729, "grad_norm": 
0.0036217891611158848, "learning_rate": 3.455634792290767e-05, "loss": 0.0569, "step": 31700 }, { "epoch": 9.461676110945422, "grad_norm": 13.685757637023926, "learning_rate": 3.453977890299833e-05, "loss": 0.0208, "step": 31725 }, { "epoch": 9.469132120489114, "grad_norm": 0.021553946658968925, "learning_rate": 3.4523209883089e-05, "loss": 0.0549, "step": 31750 }, { "epoch": 9.476588130032807, "grad_norm": 0.2268332988023758, "learning_rate": 3.450664086317966e-05, "loss": 0.0073, "step": 31775 }, { "epoch": 9.484044139576499, "grad_norm": 0.022020496428012848, "learning_rate": 3.449007184327033e-05, "loss": 0.05, "step": 31800 }, { "epoch": 9.49150014912019, "grad_norm": 0.0033605294302105904, "learning_rate": 3.447350282336099e-05, "loss": 0.0224, "step": 31825 }, { "epoch": 9.498956158663884, "grad_norm": 0.0036527118645608425, "learning_rate": 3.4456933803451654e-05, "loss": 0.0974, "step": 31850 }, { "epoch": 9.506412168207575, "grad_norm": 0.6867750287055969, "learning_rate": 3.444036478354233e-05, "loss": 0.0149, "step": 31875 }, { "epoch": 9.513868177751267, "grad_norm": 0.02161332778632641, "learning_rate": 3.442379576363299e-05, "loss": 0.028, "step": 31900 }, { "epoch": 9.52132418729496, "grad_norm": 0.021856382489204407, "learning_rate": 3.440722674372366e-05, "loss": 0.0012, "step": 31925 }, { "epoch": 9.528780196838651, "grad_norm": 0.5613261461257935, "learning_rate": 3.4390657723814324e-05, "loss": 0.0201, "step": 31950 }, { "epoch": 9.536236206382345, "grad_norm": 0.010432520881295204, "learning_rate": 3.4374088703904986e-05, "loss": 0.0159, "step": 31975 }, { "epoch": 9.543692215926036, "grad_norm": 0.5688920021057129, "learning_rate": 3.4357519683995656e-05, "loss": 0.047, "step": 32000 }, { "epoch": 9.551148225469728, "grad_norm": 0.00614794809371233, "learning_rate": 3.434095066408632e-05, "loss": 0.0608, "step": 32025 }, { "epoch": 9.558604235013421, "grad_norm": 0.6179513931274414, "learning_rate": 3.432438164417699e-05, "loss": 0.0079, 
"step": 32050 }, { "epoch": 9.566060244557113, "grad_norm": 0.5426047444343567, "learning_rate": 3.430781262426765e-05, "loss": 0.0386, "step": 32075 }, { "epoch": 9.573516254100806, "grad_norm": 0.3112524747848511, "learning_rate": 3.429124360435831e-05, "loss": 0.0048, "step": 32100 }, { "epoch": 9.580972263644497, "grad_norm": 1.4666061401367188, "learning_rate": 3.427467458444899e-05, "loss": 0.0011, "step": 32125 }, { "epoch": 9.588428273188189, "grad_norm": 0.025465501472353935, "learning_rate": 3.425810556453965e-05, "loss": 0.0112, "step": 32150 }, { "epoch": 9.595884282731882, "grad_norm": 0.23355644941329956, "learning_rate": 3.424153654463031e-05, "loss": 0.0242, "step": 32175 }, { "epoch": 9.603340292275574, "grad_norm": 0.0009142689523287117, "learning_rate": 3.422496752472098e-05, "loss": 0.005, "step": 32200 }, { "epoch": 9.610796301819267, "grad_norm": 0.06617454439401627, "learning_rate": 3.4208398504811644e-05, "loss": 0.0086, "step": 32225 }, { "epoch": 9.618252311362959, "grad_norm": 0.006406477652490139, "learning_rate": 3.419182948490231e-05, "loss": 0.0039, "step": 32250 }, { "epoch": 9.62570832090665, "grad_norm": 1.2433578968048096, "learning_rate": 3.4175260464992975e-05, "loss": 0.02, "step": 32275 }, { "epoch": 9.633164330450343, "grad_norm": 0.021782483905553818, "learning_rate": 3.415869144508364e-05, "loss": 0.0145, "step": 32300 }, { "epoch": 9.640620339994035, "grad_norm": 0.009916610084474087, "learning_rate": 3.414212242517431e-05, "loss": 0.046, "step": 32325 }, { "epoch": 9.648076349537728, "grad_norm": 0.008093140088021755, "learning_rate": 3.412621616606135e-05, "loss": 0.0117, "step": 32350 }, { "epoch": 9.65553235908142, "grad_norm": 0.025815371423959732, "learning_rate": 3.4109647146152016e-05, "loss": 0.0066, "step": 32375 }, { "epoch": 9.662988368625111, "grad_norm": 20.948083877563477, "learning_rate": 3.409307812624268e-05, "loss": 0.0199, "step": 32400 }, { "epoch": 9.670444378168805, "grad_norm": 
0.0008856813074089587, "learning_rate": 3.407650910633334e-05, "loss": 0.0092, "step": 32425 }, { "epoch": 9.677900387712496, "grad_norm": 14.290371894836426, "learning_rate": 3.405994008642401e-05, "loss": 0.0845, "step": 32450 }, { "epoch": 9.68535639725619, "grad_norm": 0.29901570081710815, "learning_rate": 3.404337106651467e-05, "loss": 0.0091, "step": 32475 }, { "epoch": 9.69281240679988, "grad_norm": 1.902666687965393, "learning_rate": 3.402680204660534e-05, "loss": 0.0014, "step": 32500 }, { "epoch": 9.700268416343572, "grad_norm": 5.127783298492432, "learning_rate": 3.401023302669601e-05, "loss": 0.0075, "step": 32525 }, { "epoch": 9.707724425887266, "grad_norm": 0.020370395854115486, "learning_rate": 3.399366400678667e-05, "loss": 0.0011, "step": 32550 }, { "epoch": 9.715180435430957, "grad_norm": 0.3405543565750122, "learning_rate": 3.397709498687734e-05, "loss": 0.0062, "step": 32575 }, { "epoch": 9.722636444974649, "grad_norm": 0.06298086047172546, "learning_rate": 3.3960525966968004e-05, "loss": 0.0021, "step": 32600 }, { "epoch": 9.730092454518342, "grad_norm": 0.14343926310539246, "learning_rate": 3.394395694705867e-05, "loss": 0.042, "step": 32625 }, { "epoch": 9.737548464062034, "grad_norm": 0.02531730756163597, "learning_rate": 3.3927387927149336e-05, "loss": 0.0063, "step": 32650 }, { "epoch": 9.745004473605727, "grad_norm": 0.04750616475939751, "learning_rate": 3.391081890724e-05, "loss": 0.0276, "step": 32675 }, { "epoch": 9.752460483149418, "grad_norm": 11.076581954956055, "learning_rate": 3.389424988733067e-05, "loss": 0.0445, "step": 32700 }, { "epoch": 9.75991649269311, "grad_norm": 0.025029189884662628, "learning_rate": 3.387768086742133e-05, "loss": 0.0137, "step": 32725 }, { "epoch": 9.767372502236803, "grad_norm": 0.040830034762620926, "learning_rate": 3.386111184751199e-05, "loss": 0.016, "step": 32750 }, { "epoch": 9.774828511780495, "grad_norm": 0.01969616673886776, "learning_rate": 3.384454282760267e-05, "loss": 0.0174, "step": 
32775 }, { "epoch": 9.782284521324188, "grad_norm": 0.44630536437034607, "learning_rate": 3.382797380769333e-05, "loss": 0.012, "step": 32800 }, { "epoch": 9.78974053086788, "grad_norm": 0.4425484836101532, "learning_rate": 3.381140478778399e-05, "loss": 0.0531, "step": 32825 }, { "epoch": 9.797196540411571, "grad_norm": 0.02152046374976635, "learning_rate": 3.379549852867103e-05, "loss": 0.0148, "step": 32850 }, { "epoch": 9.804652549955264, "grad_norm": 0.0015139617025852203, "learning_rate": 3.3778929508761695e-05, "loss": 0.0025, "step": 32875 }, { "epoch": 9.812108559498956, "grad_norm": 25.084253311157227, "learning_rate": 3.3762360488852365e-05, "loss": 0.0937, "step": 32900 }, { "epoch": 9.81956456904265, "grad_norm": 0.060568299144506454, "learning_rate": 3.3745791468943034e-05, "loss": 0.01, "step": 32925 }, { "epoch": 9.82702057858634, "grad_norm": 0.004180490970611572, "learning_rate": 3.3729222449033696e-05, "loss": 0.0175, "step": 32950 }, { "epoch": 9.834476588130032, "grad_norm": 17.96302032470703, "learning_rate": 3.3712653429124365e-05, "loss": 0.0473, "step": 32975 }, { "epoch": 9.841932597673726, "grad_norm": 0.015103708952665329, "learning_rate": 3.369608440921503e-05, "loss": 0.057, "step": 33000 }, { "epoch": 9.849388607217417, "grad_norm": 0.014436209574341774, "learning_rate": 3.3679515389305697e-05, "loss": 0.0202, "step": 33025 }, { "epoch": 9.856844616761109, "grad_norm": 48.11013412475586, "learning_rate": 3.366294636939636e-05, "loss": 0.0201, "step": 33050 }, { "epoch": 9.864300626304802, "grad_norm": 2.6800339221954346, "learning_rate": 3.364637734948702e-05, "loss": 0.0421, "step": 33075 }, { "epoch": 9.871756635848493, "grad_norm": 0.014932164922356606, "learning_rate": 3.362980832957769e-05, "loss": 0.0651, "step": 33100 }, { "epoch": 9.879212645392187, "grad_norm": 0.0073710051365196705, "learning_rate": 3.361323930966835e-05, "loss": 0.0468, "step": 33125 }, { "epoch": 9.886668654935878, "grad_norm": 0.00784632284194231, 
"learning_rate": 3.359667028975902e-05, "loss": 0.065, "step": 33150 }, { "epoch": 9.89412466447957, "grad_norm": 0.044525280594825745, "learning_rate": 3.358010126984969e-05, "loss": 0.0249, "step": 33175 }, { "epoch": 9.901580674023263, "grad_norm": 0.011400883086025715, "learning_rate": 3.356353224994035e-05, "loss": 0.0155, "step": 33200 }, { "epoch": 9.909036683566955, "grad_norm": 0.01919802837073803, "learning_rate": 3.354696323003102e-05, "loss": 0.0059, "step": 33225 }, { "epoch": 9.916492693110648, "grad_norm": 0.0051742009818553925, "learning_rate": 3.3530394210121685e-05, "loss": 0.034, "step": 33250 }, { "epoch": 9.92394870265434, "grad_norm": 0.00105185154825449, "learning_rate": 3.351382519021235e-05, "loss": 0.0175, "step": 33275 }, { "epoch": 9.931404712198031, "grad_norm": 0.006342086009681225, "learning_rate": 3.3497256170303016e-05, "loss": 0.052, "step": 33300 }, { "epoch": 9.938860721741724, "grad_norm": 0.01718548871576786, "learning_rate": 3.348068715039368e-05, "loss": 0.0029, "step": 33325 }, { "epoch": 9.946316731285416, "grad_norm": 3.649813413619995, "learning_rate": 3.346411813048435e-05, "loss": 0.0411, "step": 33350 }, { "epoch": 9.953772740829109, "grad_norm": 9.431161880493164, "learning_rate": 3.344754911057501e-05, "loss": 0.0482, "step": 33375 }, { "epoch": 9.9612287503728, "grad_norm": 0.0046329195611178875, "learning_rate": 3.343098009066568e-05, "loss": 0.0231, "step": 33400 }, { "epoch": 9.968684759916492, "grad_norm": 5.078273296356201, "learning_rate": 3.341441107075635e-05, "loss": 0.0056, "step": 33425 }, { "epoch": 9.976140769460185, "grad_norm": 0.02794848009943962, "learning_rate": 3.339784205084701e-05, "loss": 0.0175, "step": 33450 }, { "epoch": 9.983596779003877, "grad_norm": 0.06508468836545944, "learning_rate": 3.338127303093768e-05, "loss": 0.0421, "step": 33475 }, { "epoch": 9.99105278854757, "grad_norm": 0.950072169303894, "learning_rate": 3.336470401102834e-05, "loss": 0.0948, "step": 33500 }, { "epoch": 
9.998508798091262, "grad_norm": 0.0035361736081540585, "learning_rate": 3.3348134991119004e-05, "loss": 0.0623, "step": 33525 }, { "epoch": 10.0, "eval_gen_len": 8.703, "eval_loss": 0.08018776774406433, "eval_rouge1": 97.4139, "eval_rouge2": 84.4925, "eval_rougeL": 97.3164, "eval_rougeLsum": 97.3129, "eval_runtime": 99.5841, "eval_samples_per_second": 16.84, "eval_steps_per_second": 4.218, "step": 33530 }, { "epoch": 10.005964807634953, "grad_norm": 0.003375839442014694, "learning_rate": 3.3331565971209674e-05, "loss": 0.0126, "step": 33550 }, { "epoch": 10.013420817178647, "grad_norm": 0.009126723743975163, "learning_rate": 3.3314996951300336e-05, "loss": 0.0245, "step": 33575 }, { "epoch": 10.020876826722338, "grad_norm": 1.7665882110595703, "learning_rate": 3.3298427931391005e-05, "loss": 0.0474, "step": 33600 }, { "epoch": 10.02833283626603, "grad_norm": 0.022015083581209183, "learning_rate": 3.328185891148167e-05, "loss": 0.0093, "step": 33625 }, { "epoch": 10.035788845809723, "grad_norm": 0.3044419288635254, "learning_rate": 3.3265289891572336e-05, "loss": 0.0031, "step": 33650 }, { "epoch": 10.043244855353414, "grad_norm": 0.131506085395813, "learning_rate": 3.3248720871663006e-05, "loss": 0.0342, "step": 33675 }, { "epoch": 10.050700864897108, "grad_norm": 9.862656593322754, "learning_rate": 3.323215185175367e-05, "loss": 0.0135, "step": 33700 }, { "epoch": 10.0581568744408, "grad_norm": 0.00710965134203434, "learning_rate": 3.321558283184433e-05, "loss": 0.0154, "step": 33725 }, { "epoch": 10.06561288398449, "grad_norm": 0.7810943722724915, "learning_rate": 3.3199013811935e-05, "loss": 0.0095, "step": 33750 }, { "epoch": 10.073068893528184, "grad_norm": 0.16734705865383148, "learning_rate": 3.318244479202566e-05, "loss": 0.0051, "step": 33775 }, { "epoch": 10.080524903071876, "grad_norm": 7.479839324951172, "learning_rate": 3.316587577211633e-05, "loss": 0.0197, "step": 33800 }, { "epoch": 10.087980912615569, "grad_norm": 0.019815055653452873, 
"learning_rate": 3.314930675220699e-05, "loss": 0.0021, "step": 33825 }, { "epoch": 10.09543692215926, "grad_norm": 0.01897830329835415, "learning_rate": 3.3132737732297656e-05, "loss": 0.0948, "step": 33850 }, { "epoch": 10.102892931702952, "grad_norm": 15.485894203186035, "learning_rate": 3.311616871238833e-05, "loss": 0.0269, "step": 33875 }, { "epoch": 10.110348941246645, "grad_norm": 0.07259183377027512, "learning_rate": 3.3099599692478994e-05, "loss": 0.0008, "step": 33900 }, { "epoch": 10.117804950790337, "grad_norm": 0.11839132755994797, "learning_rate": 3.3083030672569656e-05, "loss": 0.011, "step": 33925 }, { "epoch": 10.12526096033403, "grad_norm": 0.022647298872470856, "learning_rate": 3.3066461652660325e-05, "loss": 0.0161, "step": 33950 }, { "epoch": 10.132716969877722, "grad_norm": 0.008973998948931694, "learning_rate": 3.304989263275099e-05, "loss": 0.0214, "step": 33975 }, { "epoch": 10.140172979421413, "grad_norm": 0.30917757749557495, "learning_rate": 3.303332361284166e-05, "loss": 0.0331, "step": 34000 }, { "epoch": 10.147628988965106, "grad_norm": 1.5753026008605957, "learning_rate": 3.301675459293232e-05, "loss": 0.0074, "step": 34025 }, { "epoch": 10.155084998508798, "grad_norm": 0.23202548921108246, "learning_rate": 3.300018557302299e-05, "loss": 0.0055, "step": 34050 }, { "epoch": 10.162541008052491, "grad_norm": 0.12300365418195724, "learning_rate": 3.298361655311365e-05, "loss": 0.0053, "step": 34075 }, { "epoch": 10.169997017596183, "grad_norm": 0.031884919852018356, "learning_rate": 3.296704753320431e-05, "loss": 0.0046, "step": 34100 }, { "epoch": 10.177453027139874, "grad_norm": 0.02119879052042961, "learning_rate": 3.295047851329499e-05, "loss": 0.0124, "step": 34125 }, { "epoch": 10.184909036683568, "grad_norm": 2.8341071605682373, "learning_rate": 3.293390949338565e-05, "loss": 0.0475, "step": 34150 }, { "epoch": 10.192365046227259, "grad_norm": 0.02603074349462986, "learning_rate": 3.2917340473476313e-05, "loss": 0.0049, "step": 
34175 }, { "epoch": 10.19982105577095, "grad_norm": 0.006299956236034632, "learning_rate": 3.290077145356698e-05, "loss": 0.0507, "step": 34200 }, { "epoch": 10.207277065314644, "grad_norm": 0.023150190711021423, "learning_rate": 3.2884202433657645e-05, "loss": 0.0034, "step": 34225 }, { "epoch": 10.214733074858335, "grad_norm": 0.0036992118693888187, "learning_rate": 3.2867633413748314e-05, "loss": 0.0061, "step": 34250 }, { "epoch": 10.222189084402029, "grad_norm": 0.012014524079859257, "learning_rate": 3.2851064393838976e-05, "loss": 0.0076, "step": 34275 }, { "epoch": 10.22964509394572, "grad_norm": 0.015969576314091682, "learning_rate": 3.283449537392964e-05, "loss": 0.0266, "step": 34300 }, { "epoch": 10.237101103489412, "grad_norm": 0.006832475308328867, "learning_rate": 3.281792635402031e-05, "loss": 0.0479, "step": 34325 }, { "epoch": 10.244557113033105, "grad_norm": 0.004264793358743191, "learning_rate": 3.280135733411098e-05, "loss": 0.0272, "step": 34350 }, { "epoch": 10.252013122576797, "grad_norm": 0.7217514514923096, "learning_rate": 3.278478831420164e-05, "loss": 0.0426, "step": 34375 }, { "epoch": 10.25946913212049, "grad_norm": 0.029282154515385628, "learning_rate": 3.276821929429231e-05, "loss": 0.0081, "step": 34400 }, { "epoch": 10.266925141664181, "grad_norm": 0.0008106474415399134, "learning_rate": 3.275165027438297e-05, "loss": 0.0165, "step": 34425 }, { "epoch": 10.274381151207873, "grad_norm": 0.1722215712070465, "learning_rate": 3.273508125447364e-05, "loss": 0.0262, "step": 34450 }, { "epoch": 10.281837160751566, "grad_norm": 9.704647064208984, "learning_rate": 3.27185122345643e-05, "loss": 0.0199, "step": 34475 }, { "epoch": 10.289293170295258, "grad_norm": 0.022019336000084877, "learning_rate": 3.2701943214654965e-05, "loss": 0.0056, "step": 34500 }, { "epoch": 10.296749179838951, "grad_norm": 0.0859612226486206, "learning_rate": 3.2685374194745634e-05, "loss": 0.0049, "step": 34525 }, { "epoch": 10.304205189382643, "grad_norm": 
0.015857083722949028, "learning_rate": 3.2668805174836296e-05, "loss": 0.0071, "step": 34550 }, { "epoch": 10.311661198926334, "grad_norm": 0.7513738870620728, "learning_rate": 3.2652236154926965e-05, "loss": 0.0117, "step": 34575 }, { "epoch": 10.319117208470027, "grad_norm": 0.30653056502342224, "learning_rate": 3.2635667135017634e-05, "loss": 0.0176, "step": 34600 }, { "epoch": 10.326573218013719, "grad_norm": 0.09023960679769516, "learning_rate": 3.2619098115108297e-05, "loss": 0.0043, "step": 34625 }, { "epoch": 10.334029227557412, "grad_norm": 1.2998549938201904, "learning_rate": 3.2602529095198966e-05, "loss": 0.0227, "step": 34650 }, { "epoch": 10.341485237101104, "grad_norm": 63.218666076660156, "learning_rate": 3.258596007528963e-05, "loss": 0.0163, "step": 34675 }, { "epoch": 10.348941246644795, "grad_norm": 5.3180251121521, "learning_rate": 3.25693910553803e-05, "loss": 0.015, "step": 34700 }, { "epoch": 10.356397256188488, "grad_norm": 0.038112230598926544, "learning_rate": 3.255282203547096e-05, "loss": 0.0281, "step": 34725 }, { "epoch": 10.36385326573218, "grad_norm": 3.67035174369812, "learning_rate": 3.253625301556162e-05, "loss": 0.0441, "step": 34750 }, { "epoch": 10.371309275275872, "grad_norm": 0.013775043189525604, "learning_rate": 3.251968399565229e-05, "loss": 0.0089, "step": 34775 }, { "epoch": 10.378765284819565, "grad_norm": 0.06075282394886017, "learning_rate": 3.250311497574295e-05, "loss": 0.0006, "step": 34800 }, { "epoch": 10.386221294363256, "grad_norm": 28.066097259521484, "learning_rate": 3.248654595583362e-05, "loss": 0.052, "step": 34825 }, { "epoch": 10.39367730390695, "grad_norm": 0.009753179736435413, "learning_rate": 3.246997693592429e-05, "loss": 0.0187, "step": 34850 }, { "epoch": 10.401133313450641, "grad_norm": 0.005144761875271797, "learning_rate": 3.2453407916014954e-05, "loss": 0.0399, "step": 34875 }, { "epoch": 10.408589322994333, "grad_norm": 2.791918992996216, "learning_rate": 3.243683889610562e-05, "loss": 
0.005, "step": 34900 }, { "epoch": 10.416045332538026, "grad_norm": 0.004900413099676371, "learning_rate": 3.2420269876196285e-05, "loss": 0.0072, "step": 34925 }, { "epoch": 10.423501342081718, "grad_norm": 0.009620816446840763, "learning_rate": 3.240370085628695e-05, "loss": 0.0016, "step": 34950 }, { "epoch": 10.43095735162541, "grad_norm": 1.331263780593872, "learning_rate": 3.238713183637762e-05, "loss": 0.0202, "step": 34975 }, { "epoch": 10.438413361169102, "grad_norm": 0.10611845552921295, "learning_rate": 3.237056281646828e-05, "loss": 0.018, "step": 35000 }, { "epoch": 10.445869370712794, "grad_norm": 0.5065842270851135, "learning_rate": 3.235399379655895e-05, "loss": 0.0153, "step": 35025 }, { "epoch": 10.453325380256487, "grad_norm": 0.11372986435890198, "learning_rate": 3.233742477664961e-05, "loss": 0.0048, "step": 35050 }, { "epoch": 10.460781389800179, "grad_norm": 0.8633256554603577, "learning_rate": 3.232085575674028e-05, "loss": 0.0014, "step": 35075 }, { "epoch": 10.468237399343872, "grad_norm": 0.003506710985675454, "learning_rate": 3.230428673683095e-05, "loss": 0.0411, "step": 35100 }, { "epoch": 10.475693408887564, "grad_norm": 0.018955357372760773, "learning_rate": 3.228771771692161e-05, "loss": 0.0014, "step": 35125 }, { "epoch": 10.483149418431255, "grad_norm": 0.005029291845858097, "learning_rate": 3.2271148697012274e-05, "loss": 0.0193, "step": 35150 }, { "epoch": 10.490605427974948, "grad_norm": 25.563217163085938, "learning_rate": 3.225457967710294e-05, "loss": 0.0691, "step": 35175 }, { "epoch": 10.49806143751864, "grad_norm": 1.60642409324646, "learning_rate": 3.2238010657193605e-05, "loss": 0.0017, "step": 35200 }, { "epoch": 10.505517447062331, "grad_norm": 0.017324600368738174, "learning_rate": 3.2221441637284274e-05, "loss": 0.0247, "step": 35225 }, { "epoch": 10.512973456606025, "grad_norm": 0.1269446760416031, "learning_rate": 3.2204872617374936e-05, "loss": 0.0024, "step": 35250 }, { "epoch": 10.520429466149716, "grad_norm": 
0.14126725494861603, "learning_rate": 3.2188303597465606e-05, "loss": 0.0822, "step": 35275 }, { "epoch": 10.52788547569341, "grad_norm": 0.0053797028958797455, "learning_rate": 3.217173457755627e-05, "loss": 0.0634, "step": 35300 }, { "epoch": 10.535341485237101, "grad_norm": 0.01621091552078724, "learning_rate": 3.215516555764694e-05, "loss": 0.0914, "step": 35325 }, { "epoch": 10.542797494780793, "grad_norm": 0.00323239853605628, "learning_rate": 3.2138596537737606e-05, "loss": 0.0158, "step": 35350 }, { "epoch": 10.550253504324486, "grad_norm": 34.818946838378906, "learning_rate": 3.212202751782827e-05, "loss": 0.0173, "step": 35375 }, { "epoch": 10.557709513868177, "grad_norm": 0.008614491671323776, "learning_rate": 3.210545849791893e-05, "loss": 0.0271, "step": 35400 }, { "epoch": 10.56516552341187, "grad_norm": 0.001824019942432642, "learning_rate": 3.20888894780096e-05, "loss": 0.0395, "step": 35425 }, { "epoch": 10.572621532955562, "grad_norm": 0.007367938291281462, "learning_rate": 3.207232045810026e-05, "loss": 0.0011, "step": 35450 }, { "epoch": 10.580077542499254, "grad_norm": 1.3378117084503174, "learning_rate": 3.205575143819093e-05, "loss": 0.0146, "step": 35475 }, { "epoch": 10.587533552042947, "grad_norm": 0.015617124736309052, "learning_rate": 3.2039182418281594e-05, "loss": 0.0826, "step": 35500 }, { "epoch": 10.594989561586639, "grad_norm": 0.003436797996982932, "learning_rate": 3.2022613398372256e-05, "loss": 0.0084, "step": 35525 }, { "epoch": 10.602445571130332, "grad_norm": 0.31922221183776855, "learning_rate": 3.200604437846293e-05, "loss": 0.0991, "step": 35550 }, { "epoch": 10.609901580674023, "grad_norm": 0.8636718988418579, "learning_rate": 3.1989475358553594e-05, "loss": 0.0387, "step": 35575 }, { "epoch": 10.617357590217715, "grad_norm": 0.04692121967673302, "learning_rate": 3.197290633864426e-05, "loss": 0.0072, "step": 35600 }, { "epoch": 10.624813599761408, "grad_norm": 0.007798209320753813, "learning_rate": 
3.1956337318734926e-05, "loss": 0.0464, "step": 35625 }, { "epoch": 10.6322696093051, "grad_norm": 51.005245208740234, "learning_rate": 3.193976829882559e-05, "loss": 0.0759, "step": 35650 }, { "epoch": 10.639725618848793, "grad_norm": 4.934067726135254, "learning_rate": 3.192319927891626e-05, "loss": 0.035, "step": 35675 }, { "epoch": 10.647181628392484, "grad_norm": 0.1517515480518341, "learning_rate": 3.190663025900692e-05, "loss": 0.0236, "step": 35700 }, { "epoch": 10.654637637936176, "grad_norm": 0.04861300066113472, "learning_rate": 3.189006123909758e-05, "loss": 0.0624, "step": 35725 }, { "epoch": 10.66209364747987, "grad_norm": 0.010758434422314167, "learning_rate": 3.187349221918825e-05, "loss": 0.0222, "step": 35750 }, { "epoch": 10.66954965702356, "grad_norm": 2.7868688106536865, "learning_rate": 3.1856923199278913e-05, "loss": 0.0418, "step": 35775 }, { "epoch": 10.677005666567254, "grad_norm": 13.515581130981445, "learning_rate": 3.184035417936958e-05, "loss": 0.0125, "step": 35800 }, { "epoch": 10.684461676110946, "grad_norm": 0.10708614438772202, "learning_rate": 3.182378515946025e-05, "loss": 0.06, "step": 35825 }, { "epoch": 10.691917685654637, "grad_norm": 0.0015107860090211034, "learning_rate": 3.1807216139550914e-05, "loss": 0.1163, "step": 35850 }, { "epoch": 10.69937369519833, "grad_norm": 0.032355859875679016, "learning_rate": 3.179064711964158e-05, "loss": 0.0372, "step": 35875 }, { "epoch": 10.706829704742022, "grad_norm": 21.268131256103516, "learning_rate": 3.1774078099732245e-05, "loss": 0.0237, "step": 35900 }, { "epoch": 10.714285714285714, "grad_norm": 21.97075080871582, "learning_rate": 3.1757509079822915e-05, "loss": 0.0661, "step": 35925 }, { "epoch": 10.721741723829407, "grad_norm": 0.10698114335536957, "learning_rate": 3.174094005991358e-05, "loss": 0.0203, "step": 35950 }, { "epoch": 10.729197733373098, "grad_norm": 0.009910419583320618, "learning_rate": 3.172437104000424e-05, "loss": 0.0022, "step": 35975 }, { "epoch": 
10.736653742916792, "grad_norm": 0.007027831859886646, "learning_rate": 3.170780202009491e-05, "loss": 0.0578, "step": 36000 }, { "epoch": 10.744109752460483, "grad_norm": 0.024194374680519104, "learning_rate": 3.169123300018558e-05, "loss": 0.0021, "step": 36025 }, { "epoch": 10.751565762004175, "grad_norm": 0.018712317571043968, "learning_rate": 3.167466398027624e-05, "loss": 0.0096, "step": 36050 }, { "epoch": 10.759021771547868, "grad_norm": 3.4336538314819336, "learning_rate": 3.165809496036691e-05, "loss": 0.0158, "step": 36075 }, { "epoch": 10.76647778109156, "grad_norm": 9.927470207214355, "learning_rate": 3.164152594045757e-05, "loss": 0.066, "step": 36100 }, { "epoch": 10.773933790635253, "grad_norm": 0.14512130618095398, "learning_rate": 3.162495692054824e-05, "loss": 0.0242, "step": 36125 }, { "epoch": 10.781389800178944, "grad_norm": 0.05796957015991211, "learning_rate": 3.16083879006389e-05, "loss": 0.0366, "step": 36150 }, { "epoch": 10.788845809722636, "grad_norm": 0.034359026700258255, "learning_rate": 3.1591818880729565e-05, "loss": 0.0064, "step": 36175 }, { "epoch": 10.79630181926633, "grad_norm": 15.594103813171387, "learning_rate": 3.1575249860820234e-05, "loss": 0.0084, "step": 36200 }, { "epoch": 10.80375782881002, "grad_norm": 0.1458369642496109, "learning_rate": 3.1558680840910897e-05, "loss": 0.0132, "step": 36225 }, { "epoch": 10.811213838353712, "grad_norm": 0.034678563475608826, "learning_rate": 3.1542111821001566e-05, "loss": 0.0017, "step": 36250 }, { "epoch": 10.818669847897405, "grad_norm": 10.139859199523926, "learning_rate": 3.1525542801092235e-05, "loss": 0.0353, "step": 36275 }, { "epoch": 10.826125857441097, "grad_norm": 0.013228428550064564, "learning_rate": 3.15089737811829e-05, "loss": 0.0478, "step": 36300 }, { "epoch": 10.83358186698479, "grad_norm": 0.234503373503685, "learning_rate": 3.1492404761273566e-05, "loss": 0.0034, "step": 36325 }, { "epoch": 10.841037876528482, "grad_norm": 0.043588753789663315, 
"learning_rate": 3.147583574136423e-05, "loss": 0.0014, "step": 36350 }, { "epoch": 10.848493886072173, "grad_norm": 0.0065058995969593525, "learning_rate": 3.145926672145489e-05, "loss": 0.0328, "step": 36375 }, { "epoch": 10.855949895615867, "grad_norm": 0.07438325136899948, "learning_rate": 3.144269770154556e-05, "loss": 0.0166, "step": 36400 }, { "epoch": 10.863405905159558, "grad_norm": 1.323083519935608, "learning_rate": 3.142612868163622e-05, "loss": 0.0564, "step": 36425 }, { "epoch": 10.870861914703251, "grad_norm": 0.1142137348651886, "learning_rate": 3.140955966172689e-05, "loss": 0.0619, "step": 36450 }, { "epoch": 10.878317924246943, "grad_norm": 0.04162459075450897, "learning_rate": 3.1392990641817554e-05, "loss": 0.02, "step": 36475 }, { "epoch": 10.885773933790635, "grad_norm": 0.020116539672017097, "learning_rate": 3.137642162190822e-05, "loss": 0.0265, "step": 36500 }, { "epoch": 10.893229943334328, "grad_norm": 0.04184208810329437, "learning_rate": 3.135985260199889e-05, "loss": 0.0149, "step": 36525 }, { "epoch": 10.90068595287802, "grad_norm": 0.04820263385772705, "learning_rate": 3.1343283582089554e-05, "loss": 0.0125, "step": 36550 }, { "epoch": 10.908141962421713, "grad_norm": 0.0024502065498381853, "learning_rate": 3.1326714562180224e-05, "loss": 0.0111, "step": 36575 }, { "epoch": 10.915597971965404, "grad_norm": 2.155289649963379, "learning_rate": 3.1310145542270886e-05, "loss": 0.0035, "step": 36600 }, { "epoch": 10.923053981509096, "grad_norm": 0.014378400519490242, "learning_rate": 3.129357652236155e-05, "loss": 0.0551, "step": 36625 }, { "epoch": 10.930509991052789, "grad_norm": 2.1999733448028564, "learning_rate": 3.127700750245222e-05, "loss": 0.0459, "step": 36650 }, { "epoch": 10.93796600059648, "grad_norm": 26.96295928955078, "learning_rate": 3.126043848254288e-05, "loss": 0.0074, "step": 36675 }, { "epoch": 10.945422010140174, "grad_norm": 4.9573845863342285, "learning_rate": 3.124386946263355e-05, "loss": 0.0271, "step": 36700 
}, { "epoch": 10.952878019683865, "grad_norm": 0.9740040302276611, "learning_rate": 3.122730044272421e-05, "loss": 0.0067, "step": 36725 }, { "epoch": 10.960334029227557, "grad_norm": 0.10786978155374527, "learning_rate": 3.121073142281488e-05, "loss": 0.0063, "step": 36750 }, { "epoch": 10.96779003877125, "grad_norm": 0.9033737182617188, "learning_rate": 3.119416240290555e-05, "loss": 0.0241, "step": 36775 }, { "epoch": 10.975246048314942, "grad_norm": 0.8337206840515137, "learning_rate": 3.117759338299621e-05, "loss": 0.0423, "step": 36800 }, { "epoch": 10.982702057858635, "grad_norm": 34.99683380126953, "learning_rate": 3.1161024363086874e-05, "loss": 0.0323, "step": 36825 }, { "epoch": 10.990158067402326, "grad_norm": 0.3933325707912445, "learning_rate": 3.114445534317754e-05, "loss": 0.0058, "step": 36850 }, { "epoch": 10.997614076946018, "grad_norm": 0.019569693133234978, "learning_rate": 3.1127886323268206e-05, "loss": 0.0019, "step": 36875 }, { "epoch": 11.0, "eval_gen_len": 8.7108, "eval_loss": 0.0731603354215622, "eval_rouge1": 97.3528, "eval_rouge2": 84.9895, "eval_rougeL": 97.3143, "eval_rougeLsum": 97.3154, "eval_runtime": 98.1335, "eval_samples_per_second": 17.089, "eval_steps_per_second": 4.28, "step": 36883 }, { "epoch": 11.005070086489711, "grad_norm": 0.00565438624471426, "learning_rate": 3.1111317303358875e-05, "loss": 0.004, "step": 36900 }, { "epoch": 11.012526096033403, "grad_norm": 0.008675969205796719, "learning_rate": 3.109474828344954e-05, "loss": 0.1224, "step": 36925 }, { "epoch": 11.019982105577094, "grad_norm": 0.2460847944021225, "learning_rate": 3.10781792635402e-05, "loss": 0.0458, "step": 36950 }, { "epoch": 11.027438115120788, "grad_norm": 0.015279813669621944, "learning_rate": 3.106161024363087e-05, "loss": 0.0285, "step": 36975 }, { "epoch": 11.03489412466448, "grad_norm": 0.01354867685586214, "learning_rate": 3.104504122372154e-05, "loss": 0.02, "step": 37000 }, { "epoch": 11.042350134208172, "grad_norm": 0.009907645173370838, 
"learning_rate": 3.10284722038122e-05, "loss": 0.1091, "step": 37025 }, { "epoch": 11.049806143751864, "grad_norm": 0.033154286444187164, "learning_rate": 3.101190318390287e-05, "loss": 0.0064, "step": 37050 }, { "epoch": 11.057262153295556, "grad_norm": 0.04053553566336632, "learning_rate": 3.099533416399353e-05, "loss": 0.0103, "step": 37075 }, { "epoch": 11.064718162839249, "grad_norm": 0.05914434418082237, "learning_rate": 3.09787651440842e-05, "loss": 0.0343, "step": 37100 }, { "epoch": 11.07217417238294, "grad_norm": 0.021367311477661133, "learning_rate": 3.096219612417486e-05, "loss": 0.0057, "step": 37125 }, { "epoch": 11.079630181926634, "grad_norm": 0.001664644107222557, "learning_rate": 3.094562710426553e-05, "loss": 0.0005, "step": 37150 }, { "epoch": 11.087086191470325, "grad_norm": 0.12640069425106049, "learning_rate": 3.0929058084356194e-05, "loss": 0.0003, "step": 37175 }, { "epoch": 11.094542201014017, "grad_norm": 0.004810268059372902, "learning_rate": 3.091248906444686e-05, "loss": 0.0119, "step": 37200 }, { "epoch": 11.10199821055771, "grad_norm": 1.4785408973693848, "learning_rate": 3.089592004453753e-05, "loss": 0.0008, "step": 37225 }, { "epoch": 11.109454220101401, "grad_norm": 0.0022451153490692377, "learning_rate": 3.0879351024628195e-05, "loss": 0.0123, "step": 37250 }, { "epoch": 11.116910229645095, "grad_norm": 0.1429530680179596, "learning_rate": 3.086278200471886e-05, "loss": 0.0115, "step": 37275 }, { "epoch": 11.124366239188786, "grad_norm": 0.03411104530096054, "learning_rate": 3.0846212984809526e-05, "loss": 0.0212, "step": 37300 }, { "epoch": 11.131822248732478, "grad_norm": 0.8815001845359802, "learning_rate": 3.082964396490019e-05, "loss": 0.0005, "step": 37325 }, { "epoch": 11.139278258276171, "grad_norm": 0.008912123739719391, "learning_rate": 3.081307494499086e-05, "loss": 0.0029, "step": 37350 }, { "epoch": 11.146734267819863, "grad_norm": 0.021167289465665817, "learning_rate": 3.079650592508152e-05, "loss": 0.016, "step": 
37375 }, { "epoch": 11.154190277363554, "grad_norm": 0.043260324746370316, "learning_rate": 3.077993690517218e-05, "loss": 0.0195, "step": 37400 }, { "epoch": 11.161646286907247, "grad_norm": 0.0005007116124033928, "learning_rate": 3.076336788526285e-05, "loss": 0.001, "step": 37425 }, { "epoch": 11.169102296450939, "grad_norm": 0.003123112255707383, "learning_rate": 3.0746798865353514e-05, "loss": 0.0003, "step": 37450 }, { "epoch": 11.176558305994632, "grad_norm": 0.1323997676372528, "learning_rate": 3.073022984544418e-05, "loss": 0.0171, "step": 37475 }, { "epoch": 11.184014315538324, "grad_norm": 0.00898793339729309, "learning_rate": 3.071366082553485e-05, "loss": 0.0087, "step": 37500 }, { "epoch": 11.191470325082015, "grad_norm": 0.004661829676479101, "learning_rate": 3.0697091805625515e-05, "loss": 0.0006, "step": 37525 }, { "epoch": 11.198926334625709, "grad_norm": 0.0023466881830245256, "learning_rate": 3.0680522785716184e-05, "loss": 0.0259, "step": 37550 }, { "epoch": 11.2063823441694, "grad_norm": 0.016425127163529396, "learning_rate": 3.0663953765806846e-05, "loss": 0.0683, "step": 37575 }, { "epoch": 11.213838353713093, "grad_norm": 0.12483184039592743, "learning_rate": 3.064738474589751e-05, "loss": 0.0005, "step": 37600 }, { "epoch": 11.221294363256785, "grad_norm": 0.013576678931713104, "learning_rate": 3.063081572598818e-05, "loss": 0.0043, "step": 37625 }, { "epoch": 11.228750372800476, "grad_norm": 0.025191502645611763, "learning_rate": 3.061424670607884e-05, "loss": 0.0029, "step": 37650 }, { "epoch": 11.23620638234417, "grad_norm": 0.007174923084676266, "learning_rate": 3.059767768616951e-05, "loss": 0.043, "step": 37675 }, { "epoch": 11.243662391887861, "grad_norm": 27.290576934814453, "learning_rate": 3.058110866626018e-05, "loss": 0.0149, "step": 37700 }, { "epoch": 11.251118401431555, "grad_norm": 0.13593660295009613, "learning_rate": 3.056453964635084e-05, "loss": 0.0061, "step": 37725 }, { "epoch": 11.258574410975246, "grad_norm": 
1.3596652746200562, "learning_rate": 3.054797062644151e-05, "loss": 0.0196, "step": 37750 }, { "epoch": 11.266030420518938, "grad_norm": 0.007876387797296047, "learning_rate": 3.053140160653217e-05, "loss": 0.0035, "step": 37775 }, { "epoch": 11.273486430062631, "grad_norm": 0.016628660261631012, "learning_rate": 3.0514832586622838e-05, "loss": 0.0015, "step": 37800 }, { "epoch": 11.280942439606322, "grad_norm": 19.705419540405273, "learning_rate": 3.0498926327509875e-05, "loss": 0.0885, "step": 37825 }, { "epoch": 11.288398449150016, "grad_norm": 0.007640378549695015, "learning_rate": 3.048235730760054e-05, "loss": 0.0057, "step": 37850 }, { "epoch": 11.295854458693707, "grad_norm": 0.044530533254146576, "learning_rate": 3.046578828769121e-05, "loss": 0.0026, "step": 37875 }, { "epoch": 11.303310468237399, "grad_norm": 0.008475979790091515, "learning_rate": 3.0449219267781875e-05, "loss": 0.0099, "step": 37900 }, { "epoch": 11.310766477781092, "grad_norm": 0.008451344445347786, "learning_rate": 3.043265024787254e-05, "loss": 0.0013, "step": 37925 }, { "epoch": 11.318222487324784, "grad_norm": 0.0787351205945015, "learning_rate": 3.0416081227963207e-05, "loss": 0.0322, "step": 37950 }, { "epoch": 11.325678496868475, "grad_norm": 0.13200482726097107, "learning_rate": 3.0399512208053872e-05, "loss": 0.0002, "step": 37975 }, { "epoch": 11.333134506412168, "grad_norm": 0.15154162049293518, "learning_rate": 3.0382943188144535e-05, "loss": 0.0274, "step": 38000 }, { "epoch": 11.34059051595586, "grad_norm": 0.18614114820957184, "learning_rate": 3.03663741682352e-05, "loss": 0.0056, "step": 38025 }, { "epoch": 11.348046525499553, "grad_norm": 0.02761516161262989, "learning_rate": 3.0349805148325866e-05, "loss": 0.0028, "step": 38050 }, { "epoch": 11.355502535043245, "grad_norm": 0.002439249772578478, "learning_rate": 3.0333236128416532e-05, "loss": 0.0038, "step": 38075 }, { "epoch": 11.362958544586936, "grad_norm": 0.03615495190024376, "learning_rate": 
3.0316667108507198e-05, "loss": 0.0226, "step": 38100 }, { "epoch": 11.37041455413063, "grad_norm": 0.007680293172597885, "learning_rate": 3.0300098088597867e-05, "loss": 0.0394, "step": 38125 }, { "epoch": 11.377870563674321, "grad_norm": 0.010748565196990967, "learning_rate": 3.0283529068688533e-05, "loss": 0.0169, "step": 38150 }, { "epoch": 11.385326573218014, "grad_norm": 0.008955719880759716, "learning_rate": 3.0266960048779198e-05, "loss": 0.0071, "step": 38175 }, { "epoch": 11.392782582761706, "grad_norm": 0.03065124712884426, "learning_rate": 3.025039102886986e-05, "loss": 0.0277, "step": 38200 }, { "epoch": 11.400238592305397, "grad_norm": 0.0015641790814697742, "learning_rate": 3.0234484769756898e-05, "loss": 0.0472, "step": 38225 }, { "epoch": 11.40769460184909, "grad_norm": 0.012706178240478039, "learning_rate": 3.0217915749847563e-05, "loss": 0.0035, "step": 38250 }, { "epoch": 11.415150611392782, "grad_norm": 0.058647219091653824, "learning_rate": 3.0201346729938236e-05, "loss": 0.0022, "step": 38275 }, { "epoch": 11.422606620936476, "grad_norm": 0.32084959745407104, "learning_rate": 3.0184777710028898e-05, "loss": 0.0311, "step": 38300 }, { "epoch": 11.430062630480167, "grad_norm": 0.001740424195304513, "learning_rate": 3.0168208690119564e-05, "loss": 0.001, "step": 38325 }, { "epoch": 11.437518640023859, "grad_norm": 0.01977064087986946, "learning_rate": 3.015163967021023e-05, "loss": 0.0064, "step": 38350 }, { "epoch": 11.444974649567552, "grad_norm": 0.012652015313506126, "learning_rate": 3.0135070650300895e-05, "loss": 0.0038, "step": 38375 }, { "epoch": 11.452430659111243, "grad_norm": 0.029624788090586662, "learning_rate": 3.011850163039156e-05, "loss": 0.0294, "step": 38400 }, { "epoch": 11.459886668654935, "grad_norm": 0.729141891002655, "learning_rate": 3.0101932610482224e-05, "loss": 0.005, "step": 38425 }, { "epoch": 11.467342678198628, "grad_norm": 0.21458280086517334, "learning_rate": 3.008536359057289e-05, "loss": 0.0201, "step": 38450 
}, { "epoch": 11.47479868774232, "grad_norm": 0.018610941246151924, "learning_rate": 3.0068794570663555e-05, "loss": 0.0118, "step": 38475 }, { "epoch": 11.482254697286013, "grad_norm": 3.399533987045288, "learning_rate": 3.005222555075422e-05, "loss": 0.0311, "step": 38500 }, { "epoch": 11.489710706829705, "grad_norm": 0.16793139278888702, "learning_rate": 3.003565653084489e-05, "loss": 0.0095, "step": 38525 }, { "epoch": 11.497166716373396, "grad_norm": 0.3863959312438965, "learning_rate": 3.0019087510935556e-05, "loss": 0.0268, "step": 38550 }, { "epoch": 11.50462272591709, "grad_norm": 0.005762874614447355, "learning_rate": 3.0003181251822593e-05, "loss": 0.0849, "step": 38575 }, { "epoch": 11.512078735460781, "grad_norm": 0.061609476804733276, "learning_rate": 2.998661223191326e-05, "loss": 0.0359, "step": 38600 }, { "epoch": 11.519534745004474, "grad_norm": 0.2524660527706146, "learning_rate": 2.9970043212003924e-05, "loss": 0.0463, "step": 38625 }, { "epoch": 11.526990754548166, "grad_norm": 0.03191876783967018, "learning_rate": 2.995347419209459e-05, "loss": 0.0268, "step": 38650 }, { "epoch": 11.534446764091857, "grad_norm": 0.32526031136512756, "learning_rate": 2.993690517218526e-05, "loss": 0.0362, "step": 38675 }, { "epoch": 11.54190277363555, "grad_norm": 0.0009662279044277966, "learning_rate": 2.9920336152275925e-05, "loss": 0.0105, "step": 38700 }, { "epoch": 11.549358783179242, "grad_norm": 0.9750592112541199, "learning_rate": 2.990376713236659e-05, "loss": 0.0283, "step": 38725 }, { "epoch": 11.556814792722935, "grad_norm": 0.028177186846733093, "learning_rate": 2.9887198112457253e-05, "loss": 0.0169, "step": 38750 }, { "epoch": 11.564270802266627, "grad_norm": 0.02503358945250511, "learning_rate": 2.987062909254792e-05, "loss": 0.0387, "step": 38775 }, { "epoch": 11.571726811810318, "grad_norm": 79.97856903076172, "learning_rate": 2.9854060072638584e-05, "loss": 0.0216, "step": 38800 }, { "epoch": 11.579182821354012, "grad_norm": 
0.03171137720346451, "learning_rate": 2.983749105272925e-05, "loss": 0.0476, "step": 38825 }, { "epoch": 11.586638830897703, "grad_norm": 0.011583199724555016, "learning_rate": 2.9820922032819916e-05, "loss": 0.0549, "step": 38850 }, { "epoch": 11.594094840441397, "grad_norm": 0.04316306859254837, "learning_rate": 2.9804353012910578e-05, "loss": 0.0004, "step": 38875 }, { "epoch": 11.601550849985088, "grad_norm": 0.12726345658302307, "learning_rate": 2.9787783993001244e-05, "loss": 0.0019, "step": 38900 }, { "epoch": 11.60900685952878, "grad_norm": 0.0003953992563765496, "learning_rate": 2.9771214973091916e-05, "loss": 0.008, "step": 38925 }, { "epoch": 11.616462869072473, "grad_norm": 0.003280686680227518, "learning_rate": 2.9754645953182582e-05, "loss": 0.0462, "step": 38950 }, { "epoch": 11.623918878616164, "grad_norm": 0.09019768238067627, "learning_rate": 2.9738076933273244e-05, "loss": 0.0135, "step": 38975 }, { "epoch": 11.631374888159858, "grad_norm": 0.009447668679058552, "learning_rate": 2.972150791336391e-05, "loss": 0.0133, "step": 39000 }, { "epoch": 11.63883089770355, "grad_norm": 0.00495525635778904, "learning_rate": 2.9704938893454576e-05, "loss": 0.0032, "step": 39025 }, { "epoch": 11.64628690724724, "grad_norm": 0.3824717402458191, "learning_rate": 2.968836987354524e-05, "loss": 0.0177, "step": 39050 }, { "epoch": 11.653742916790934, "grad_norm": 0.013165218755602837, "learning_rate": 2.9671800853635907e-05, "loss": 0.0058, "step": 39075 }, { "epoch": 11.661198926334626, "grad_norm": 0.016493534669280052, "learning_rate": 2.965523183372657e-05, "loss": 0.012, "step": 39100 }, { "epoch": 11.668654935878317, "grad_norm": 0.02720525860786438, "learning_rate": 2.9638662813817235e-05, "loss": 0.0259, "step": 39125 }, { "epoch": 11.67611094542201, "grad_norm": 0.012515093199908733, "learning_rate": 2.9622093793907908e-05, "loss": 0.0312, "step": 39150 }, { "epoch": 11.683566954965702, "grad_norm": 0.011289069429039955, "learning_rate": 
2.9605524773998574e-05, "loss": 0.0499, "step": 39175 }, { "epoch": 11.691022964509395, "grad_norm": 8.175752639770508, "learning_rate": 2.9588955754089236e-05, "loss": 0.0021, "step": 39200 }, { "epoch": 11.698478974053087, "grad_norm": 0.00964757427573204, "learning_rate": 2.95723867341799e-05, "loss": 0.0068, "step": 39225 }, { "epoch": 11.705934983596778, "grad_norm": 1.0415167808532715, "learning_rate": 2.9555817714270567e-05, "loss": 0.0199, "step": 39250 }, { "epoch": 11.713390993140472, "grad_norm": 0.3101750612258911, "learning_rate": 2.9539248694361233e-05, "loss": 0.0235, "step": 39275 }, { "epoch": 11.720847002684163, "grad_norm": 0.002900744555518031, "learning_rate": 2.95226796744519e-05, "loss": 0.0138, "step": 39300 }, { "epoch": 11.728303012227856, "grad_norm": 0.907034158706665, "learning_rate": 2.950611065454256e-05, "loss": 0.0401, "step": 39325 }, { "epoch": 11.735759021771548, "grad_norm": 0.24928085505962372, "learning_rate": 2.9489541634633227e-05, "loss": 0.0124, "step": 39350 }, { "epoch": 11.74321503131524, "grad_norm": 0.009192919358611107, "learning_rate": 2.9472972614723893e-05, "loss": 0.0254, "step": 39375 }, { "epoch": 11.750671040858933, "grad_norm": 0.11580682545900345, "learning_rate": 2.9456403594814562e-05, "loss": 0.0157, "step": 39400 }, { "epoch": 11.758127050402624, "grad_norm": 0.0013650426408275962, "learning_rate": 2.9439834574905228e-05, "loss": 0.0004, "step": 39425 }, { "epoch": 11.765583059946316, "grad_norm": 0.02957271784543991, "learning_rate": 2.9423265554995893e-05, "loss": 0.0122, "step": 39450 }, { "epoch": 11.77303906949001, "grad_norm": 0.004242660012096167, "learning_rate": 2.940669653508656e-05, "loss": 0.0008, "step": 39475 }, { "epoch": 11.7804950790337, "grad_norm": 0.04420284926891327, "learning_rate": 2.9390127515177225e-05, "loss": 0.0015, "step": 39500 }, { "epoch": 11.787951088577394, "grad_norm": 0.0010515704052522779, "learning_rate": 2.937355849526789e-05, "loss": 0.0013, "step": 39525 }, { 
"epoch": 11.795407098121085, "grad_norm": 0.2385762482881546, "learning_rate": 2.9356989475358553e-05, "loss": 0.0434, "step": 39550 }, { "epoch": 11.802863107664777, "grad_norm": 0.021942714229226112, "learning_rate": 2.934042045544922e-05, "loss": 0.0005, "step": 39575 }, { "epoch": 11.81031911720847, "grad_norm": 0.02746811881661415, "learning_rate": 2.9323851435539884e-05, "loss": 0.0014, "step": 39600 }, { "epoch": 11.817775126752162, "grad_norm": 0.013176214881241322, "learning_rate": 2.9307282415630553e-05, "loss": 0.0065, "step": 39625 }, { "epoch": 11.825231136295855, "grad_norm": 0.0018259455682709813, "learning_rate": 2.929071339572122e-05, "loss": 0.0247, "step": 39650 }, { "epoch": 11.832687145839547, "grad_norm": 0.0467984676361084, "learning_rate": 2.9274144375811885e-05, "loss": 0.0253, "step": 39675 }, { "epoch": 11.840143155383238, "grad_norm": 0.0014794999733567238, "learning_rate": 2.925757535590255e-05, "loss": 0.0002, "step": 39700 }, { "epoch": 11.847599164926931, "grad_norm": 0.7367884516716003, "learning_rate": 2.9241006335993216e-05, "loss": 0.0039, "step": 39725 }, { "epoch": 11.855055174470623, "grad_norm": 0.0043684993870556355, "learning_rate": 2.922443731608388e-05, "loss": 0.0408, "step": 39750 }, { "epoch": 11.862511184014316, "grad_norm": 13.370290756225586, "learning_rate": 2.9207868296174544e-05, "loss": 0.0133, "step": 39775 }, { "epoch": 11.869967193558008, "grad_norm": 0.04149283468723297, "learning_rate": 2.919129927626521e-05, "loss": 0.009, "step": 39800 }, { "epoch": 11.8774232031017, "grad_norm": 0.0070778988301754, "learning_rate": 2.9174730256355876e-05, "loss": 0.0003, "step": 39825 }, { "epoch": 11.884879212645393, "grad_norm": 0.13461697101593018, "learning_rate": 2.915816123644654e-05, "loss": 0.0176, "step": 39850 }, { "epoch": 11.892335222189084, "grad_norm": 0.06489887088537216, "learning_rate": 2.914159221653721e-05, "loss": 0.0796, "step": 39875 }, { "epoch": 11.899791231732777, "grad_norm": 
0.25228577852249146, "learning_rate": 2.9125023196627876e-05, "loss": 0.0176, "step": 39900 }, { "epoch": 11.907247241276469, "grad_norm": 0.02081029862165451, "learning_rate": 2.9108454176718542e-05, "loss": 0.0103, "step": 39925 }, { "epoch": 11.91470325082016, "grad_norm": 0.0007288819178938866, "learning_rate": 2.9091885156809208e-05, "loss": 0.0224, "step": 39950 }, { "epoch": 11.922159260363854, "grad_norm": 0.45741400122642517, "learning_rate": 2.907531613689987e-05, "loss": 0.0431, "step": 39975 }, { "epoch": 11.929615269907545, "grad_norm": 0.0039330171421170235, "learning_rate": 2.9058747116990536e-05, "loss": 0.005, "step": 40000 }, { "epoch": 11.937071279451239, "grad_norm": 0.046019960194826126, "learning_rate": 2.90421780970812e-05, "loss": 0.0065, "step": 40025 }, { "epoch": 11.94452728899493, "grad_norm": 0.059516049921512604, "learning_rate": 2.9025609077171867e-05, "loss": 0.0031, "step": 40050 }, { "epoch": 11.951983298538622, "grad_norm": 0.005144066177308559, "learning_rate": 2.9009040057262533e-05, "loss": 0.0752, "step": 40075 }, { "epoch": 11.959439308082315, "grad_norm": 0.0025731483474373817, "learning_rate": 2.8992471037353195e-05, "loss": 0.0682, "step": 40100 }, { "epoch": 11.966895317626006, "grad_norm": 0.0010169928427785635, "learning_rate": 2.8975902017443868e-05, "loss": 0.0004, "step": 40125 }, { "epoch": 11.974351327169698, "grad_norm": 0.1493087112903595, "learning_rate": 2.8959332997534534e-05, "loss": 0.0745, "step": 40150 }, { "epoch": 11.981807336713391, "grad_norm": 0.024421213194727898, "learning_rate": 2.89427639776252e-05, "loss": 0.0019, "step": 40175 }, { "epoch": 11.989263346257083, "grad_norm": 0.01383435633033514, "learning_rate": 2.8926194957715862e-05, "loss": 0.0031, "step": 40200 }, { "epoch": 11.996719355800776, "grad_norm": 0.015260148793458939, "learning_rate": 2.8909625937806528e-05, "loss": 0.0206, "step": 40225 }, { "epoch": 12.0, "eval_gen_len": 8.7358, "eval_loss": 0.07119181752204895, "eval_rouge1": 
97.3656, "eval_rouge2": 84.7781, "eval_rougeL": 97.3473, "eval_rougeLsum": 97.3476, "eval_runtime": 100.8839, "eval_samples_per_second": 16.623, "eval_steps_per_second": 4.163, "step": 40236 }, { "epoch": 12.004175365344468, "grad_norm": 0.02059837430715561, "learning_rate": 2.8893056917897193e-05, "loss": 0.017, "step": 40250 }, { "epoch": 12.01163137488816, "grad_norm": 13.741105079650879, "learning_rate": 2.887648789798786e-05, "loss": 0.0102, "step": 40275 }, { "epoch": 12.019087384431852, "grad_norm": 0.038666047155857086, "learning_rate": 2.8859918878078525e-05, "loss": 0.0228, "step": 40300 }, { "epoch": 12.026543393975544, "grad_norm": 0.004178918898105621, "learning_rate": 2.8843349858169187e-05, "loss": 0.0009, "step": 40325 }, { "epoch": 12.033999403519237, "grad_norm": 0.0011557178804650903, "learning_rate": 2.882678083825986e-05, "loss": 0.0194, "step": 40350 }, { "epoch": 12.041455413062929, "grad_norm": 0.11613950878381729, "learning_rate": 2.8810211818350525e-05, "loss": 0.0164, "step": 40375 }, { "epoch": 12.04891142260662, "grad_norm": 0.0020942033734172583, "learning_rate": 2.879364279844119e-05, "loss": 0.0246, "step": 40400 }, { "epoch": 12.056367432150314, "grad_norm": 0.0037937331944704056, "learning_rate": 2.8777073778531853e-05, "loss": 0.0229, "step": 40425 }, { "epoch": 12.063823441694005, "grad_norm": 0.47289687395095825, "learning_rate": 2.876050475862252e-05, "loss": 0.0003, "step": 40450 }, { "epoch": 12.071279451237698, "grad_norm": 0.0333111509680748, "learning_rate": 2.8743935738713185e-05, "loss": 0.0011, "step": 40475 }, { "epoch": 12.07873546078139, "grad_norm": 0.3923133909702301, "learning_rate": 2.872736671880385e-05, "loss": 0.0037, "step": 40500 }, { "epoch": 12.086191470325081, "grad_norm": 0.006680936552584171, "learning_rate": 2.8710797698894516e-05, "loss": 0.021, "step": 40525 }, { "epoch": 12.093647479868775, "grad_norm": 0.010220406576991081, "learning_rate": 2.869422867898518e-05, "loss": 0.0013, "step": 40550 }, { 
"epoch": 12.101103489412466, "grad_norm": 0.026769593358039856, "learning_rate": 2.8677659659075844e-05, "loss": 0.0036, "step": 40575 }, { "epoch": 12.108559498956158, "grad_norm": 0.006459955126047134, "learning_rate": 2.8661090639166517e-05, "loss": 0.001, "step": 40600 }, { "epoch": 12.116015508499851, "grad_norm": 0.5409096479415894, "learning_rate": 2.864452161925718e-05, "loss": 0.02, "step": 40625 }, { "epoch": 12.123471518043543, "grad_norm": 0.009102045558393002, "learning_rate": 2.8627952599347845e-05, "loss": 0.0743, "step": 40650 }, { "epoch": 12.130927527587236, "grad_norm": 0.04457883909344673, "learning_rate": 2.861138357943851e-05, "loss": 0.0082, "step": 40675 }, { "epoch": 12.138383537130927, "grad_norm": 0.009046703577041626, "learning_rate": 2.8594814559529176e-05, "loss": 0.0008, "step": 40700 }, { "epoch": 12.145839546674619, "grad_norm": 21.693180084228516, "learning_rate": 2.8578245539619842e-05, "loss": 0.0645, "step": 40725 }, { "epoch": 12.153295556218312, "grad_norm": 0.010429131798446178, "learning_rate": 2.8561676519710508e-05, "loss": 0.0301, "step": 40750 }, { "epoch": 12.160751565762004, "grad_norm": 0.11445409804582596, "learning_rate": 2.854510749980117e-05, "loss": 0.0078, "step": 40775 }, { "epoch": 12.168207575305697, "grad_norm": 0.001903259544633329, "learning_rate": 2.8528538479891836e-05, "loss": 0.0154, "step": 40800 }, { "epoch": 12.175663584849389, "grad_norm": 0.0068878475576639175, "learning_rate": 2.851196945998251e-05, "loss": 0.0041, "step": 40825 }, { "epoch": 12.18311959439308, "grad_norm": 0.0009471174562349916, "learning_rate": 2.849540044007317e-05, "loss": 0.0211, "step": 40850 }, { "epoch": 12.190575603936773, "grad_norm": 0.0013344428734853864, "learning_rate": 2.8478831420163837e-05, "loss": 0.0003, "step": 40875 }, { "epoch": 12.198031613480465, "grad_norm": 4.818425178527832, "learning_rate": 2.8462262400254502e-05, "loss": 0.0184, "step": 40900 }, { "epoch": 12.205487623024158, "grad_norm": 
0.0003423032758291811, "learning_rate": 2.8445693380345168e-05, "loss": 0.0222, "step": 40925 }, { "epoch": 12.21294363256785, "grad_norm": 0.01015832182019949, "learning_rate": 2.8429124360435834e-05, "loss": 0.0044, "step": 40950 }, { "epoch": 12.220399642111541, "grad_norm": 0.00136271002702415, "learning_rate": 2.8412555340526496e-05, "loss": 0.0027, "step": 40975 }, { "epoch": 12.227855651655235, "grad_norm": 0.033610399812459946, "learning_rate": 2.8395986320617162e-05, "loss": 0.0345, "step": 41000 }, { "epoch": 12.235311661198926, "grad_norm": 0.0037056605797261, "learning_rate": 2.8379417300707828e-05, "loss": 0.0032, "step": 41025 }, { "epoch": 12.24276767074262, "grad_norm": 0.02005760185420513, "learning_rate": 2.8362848280798493e-05, "loss": 0.0306, "step": 41050 }, { "epoch": 12.250223680286311, "grad_norm": 0.9534538984298706, "learning_rate": 2.8346279260889162e-05, "loss": 0.0531, "step": 41075 }, { "epoch": 12.257679689830002, "grad_norm": 26.52764892578125, "learning_rate": 2.8329710240979828e-05, "loss": 0.0089, "step": 41100 }, { "epoch": 12.265135699373696, "grad_norm": 12.732146263122559, "learning_rate": 2.8313141221070494e-05, "loss": 0.0057, "step": 41125 }, { "epoch": 12.272591708917387, "grad_norm": 0.009532719850540161, "learning_rate": 2.829657220116116e-05, "loss": 0.0027, "step": 41150 }, { "epoch": 12.280047718461079, "grad_norm": 0.2986721694469452, "learning_rate": 2.8280003181251825e-05, "loss": 0.0284, "step": 41175 }, { "epoch": 12.287503728004772, "grad_norm": 0.010956763289868832, "learning_rate": 2.8263434161342488e-05, "loss": 0.0321, "step": 41200 }, { "epoch": 12.294959737548464, "grad_norm": 0.010674208402633667, "learning_rate": 2.8246865141433153e-05, "loss": 0.0036, "step": 41225 }, { "epoch": 12.302415747092157, "grad_norm": 0.00191806023940444, "learning_rate": 2.823029612152382e-05, "loss": 0.0047, "step": 41250 }, { "epoch": 12.309871756635848, "grad_norm": 0.03268317133188248, "learning_rate": 
2.8213727101614485e-05, "loss": 0.0026, "step": 41275 }, { "epoch": 12.31732776617954, "grad_norm": 0.018544087186455727, "learning_rate": 2.8197158081705154e-05, "loss": 0.0022, "step": 41300 }, { "epoch": 12.324783775723233, "grad_norm": 0.04845629632472992, "learning_rate": 2.818058906179582e-05, "loss": 0.0147, "step": 41325 }, { "epoch": 12.332239785266925, "grad_norm": 0.3336451053619385, "learning_rate": 2.8164020041886485e-05, "loss": 0.0272, "step": 41350 }, { "epoch": 12.339695794810618, "grad_norm": 0.0033389104064553976, "learning_rate": 2.814745102197715e-05, "loss": 0.04, "step": 41375 }, { "epoch": 12.34715180435431, "grad_norm": 0.0076507944613695145, "learning_rate": 2.8130882002067817e-05, "loss": 0.0039, "step": 41400 }, { "epoch": 12.354607813898001, "grad_norm": 22.395612716674805, "learning_rate": 2.811431298215848e-05, "loss": 0.005, "step": 41425 }, { "epoch": 12.362063823441694, "grad_norm": 0.0016582268290221691, "learning_rate": 2.8097743962249145e-05, "loss": 0.0293, "step": 41450 }, { "epoch": 12.369519832985386, "grad_norm": 11.2319974899292, "learning_rate": 2.808117494233981e-05, "loss": 0.0066, "step": 41475 }, { "epoch": 12.37697584252908, "grad_norm": 2.7760393619537354, "learning_rate": 2.8064605922430476e-05, "loss": 0.0015, "step": 41500 }, { "epoch": 12.38443185207277, "grad_norm": 0.003564919577911496, "learning_rate": 2.8048036902521142e-05, "loss": 0.002, "step": 41525 }, { "epoch": 12.391887861616462, "grad_norm": 0.002883300883695483, "learning_rate": 2.803146788261181e-05, "loss": 0.0022, "step": 41550 }, { "epoch": 12.399343871160156, "grad_norm": 0.1490948647260666, "learning_rate": 2.8014898862702477e-05, "loss": 0.0006, "step": 41575 }, { "epoch": 12.406799880703847, "grad_norm": 0.004211048129945993, "learning_rate": 2.7998329842793143e-05, "loss": 0.0007, "step": 41600 }, { "epoch": 12.414255890247539, "grad_norm": 0.008610145188868046, "learning_rate": 2.798176082288381e-05, "loss": 0.0316, "step": 41625 }, { 
"epoch": 12.421711899791232, "grad_norm": 0.023798583075404167, "learning_rate": 2.796519180297447e-05, "loss": 0.0136, "step": 41650 }, { "epoch": 12.429167909334923, "grad_norm": 0.07987093180418015, "learning_rate": 2.7948622783065137e-05, "loss": 0.0008, "step": 41675 }, { "epoch": 12.436623918878617, "grad_norm": 0.48281994462013245, "learning_rate": 2.7932053763155802e-05, "loss": 0.0188, "step": 41700 }, { "epoch": 12.444079928422308, "grad_norm": 0.00766774220392108, "learning_rate": 2.7915484743246468e-05, "loss": 0.0128, "step": 41725 }, { "epoch": 12.451535937966, "grad_norm": 0.025086617097258568, "learning_rate": 2.7898915723337134e-05, "loss": 0.0431, "step": 41750 }, { "epoch": 12.458991947509693, "grad_norm": 0.012511699460446835, "learning_rate": 2.7882346703427796e-05, "loss": 0.0017, "step": 41775 }, { "epoch": 12.466447957053385, "grad_norm": 0.0011564865708351135, "learning_rate": 2.786577768351847e-05, "loss": 0.0011, "step": 41800 }, { "epoch": 12.473903966597078, "grad_norm": 0.65472412109375, "learning_rate": 2.7849208663609134e-05, "loss": 0.019, "step": 41825 }, { "epoch": 12.48135997614077, "grad_norm": 0.005623379722237587, "learning_rate": 2.7832639643699797e-05, "loss": 0.0303, "step": 41850 }, { "epoch": 12.488815985684461, "grad_norm": 0.018389273434877396, "learning_rate": 2.7816070623790462e-05, "loss": 0.0032, "step": 41875 }, { "epoch": 12.496271995228154, "grad_norm": 0.0019741118885576725, "learning_rate": 2.7799501603881128e-05, "loss": 0.0167, "step": 41900 }, { "epoch": 12.503728004771846, "grad_norm": 0.1198592334985733, "learning_rate": 2.7782932583971794e-05, "loss": 0.0053, "step": 41925 }, { "epoch": 12.511184014315539, "grad_norm": 0.03189831227064133, "learning_rate": 2.776636356406246e-05, "loss": 0.0043, "step": 41950 }, { "epoch": 12.51864002385923, "grad_norm": 0.12729355692863464, "learning_rate": 2.7749794544153125e-05, "loss": 0.001, "step": 41975 }, { "epoch": 12.526096033402922, "grad_norm": 
0.00783438328653574, "learning_rate": 2.7733225524243788e-05, "loss": 0.012, "step": 42000 }, { "epoch": 12.533552042946615, "grad_norm": 0.024893639609217644, "learning_rate": 2.771665650433446e-05, "loss": 0.0283, "step": 42025 }, { "epoch": 12.541008052490307, "grad_norm": 0.016918212175369263, "learning_rate": 2.7700087484425126e-05, "loss": 0.0006, "step": 42050 }, { "epoch": 12.548464062034, "grad_norm": 0.03593961149454117, "learning_rate": 2.7683518464515788e-05, "loss": 0.0196, "step": 42075 }, { "epoch": 12.555920071577692, "grad_norm": 0.024990715086460114, "learning_rate": 2.7666949444606454e-05, "loss": 0.0039, "step": 42100 }, { "epoch": 12.563376081121383, "grad_norm": 0.09220023453235626, "learning_rate": 2.765038042469712e-05, "loss": 0.0034, "step": 42125 }, { "epoch": 12.570832090665077, "grad_norm": 0.007467248011380434, "learning_rate": 2.7633811404787785e-05, "loss": 0.0014, "step": 42150 }, { "epoch": 12.578288100208768, "grad_norm": 0.0017490466125309467, "learning_rate": 2.761724238487845e-05, "loss": 0.001, "step": 42175 }, { "epoch": 12.585744109752461, "grad_norm": 0.08711958676576614, "learning_rate": 2.7600673364969113e-05, "loss": 0.0216, "step": 42200 }, { "epoch": 12.593200119296153, "grad_norm": 0.05300259217619896, "learning_rate": 2.758410434505978e-05, "loss": 0.0161, "step": 42225 }, { "epoch": 12.600656128839844, "grad_norm": 0.009109907783567905, "learning_rate": 2.7567535325150445e-05, "loss": 0.0456, "step": 42250 }, { "epoch": 12.608112138383538, "grad_norm": 1.0785945653915405, "learning_rate": 2.7550966305241117e-05, "loss": 0.0015, "step": 42275 }, { "epoch": 12.61556814792723, "grad_norm": 0.0029738135635852814, "learning_rate": 2.753439728533178e-05, "loss": 0.0102, "step": 42300 }, { "epoch": 12.62302415747092, "grad_norm": 0.0014897419605404139, "learning_rate": 2.7517828265422446e-05, "loss": 0.0029, "step": 42325 }, { "epoch": 12.630480167014614, "grad_norm": 0.0512629896402359, "learning_rate": 
2.750125924551311e-05, "loss": 0.0007, "step": 42350 }, { "epoch": 12.637936176558306, "grad_norm": 0.0043296511285007, "learning_rate": 2.7484690225603777e-05, "loss": 0.0004, "step": 42375 }, { "epoch": 12.645392186101999, "grad_norm": 0.005691382568329573, "learning_rate": 2.7468121205694443e-05, "loss": 0.01, "step": 42400 }, { "epoch": 12.65284819564569, "grad_norm": 0.004919271916151047, "learning_rate": 2.7451552185785105e-05, "loss": 0.0016, "step": 42425 }, { "epoch": 12.660304205189382, "grad_norm": 2.6745660305023193, "learning_rate": 2.743498316587577e-05, "loss": 0.037, "step": 42450 }, { "epoch": 12.667760214733075, "grad_norm": 0.005345212761312723, "learning_rate": 2.7418414145966437e-05, "loss": 0.0026, "step": 42475 }, { "epoch": 12.675216224276767, "grad_norm": 0.04073727875947952, "learning_rate": 2.740184512605711e-05, "loss": 0.0007, "step": 42500 }, { "epoch": 12.68267223382046, "grad_norm": 0.001461253734305501, "learning_rate": 2.738527610614777e-05, "loss": 0.0206, "step": 42525 }, { "epoch": 12.690128243364152, "grad_norm": 0.010080489329993725, "learning_rate": 2.7368707086238437e-05, "loss": 0.0007, "step": 42550 }, { "epoch": 12.697584252907843, "grad_norm": 0.0009966216748580337, "learning_rate": 2.7352138066329103e-05, "loss": 0.0004, "step": 42575 }, { "epoch": 12.705040262451536, "grad_norm": 0.04050817713141441, "learning_rate": 2.733556904641977e-05, "loss": 0.0143, "step": 42600 }, { "epoch": 12.712496271995228, "grad_norm": 0.00188263482414186, "learning_rate": 2.7319000026510434e-05, "loss": 0.0028, "step": 42625 }, { "epoch": 12.71995228153892, "grad_norm": 0.020978985354304314, "learning_rate": 2.7302431006601097e-05, "loss": 0.0174, "step": 42650 }, { "epoch": 12.727408291082613, "grad_norm": 0.00698693236336112, "learning_rate": 2.7285861986691762e-05, "loss": 0.0006, "step": 42675 }, { "epoch": 12.734864300626304, "grad_norm": 0.019326740875840187, "learning_rate": 2.7269292966782428e-05, "loss": 0.0027, "step": 42700 }, 
{ "epoch": 12.742320310169998, "grad_norm": 0.03213045001029968, "learning_rate": 2.7252723946873094e-05, "loss": 0.0187, "step": 42725 }, { "epoch": 12.749776319713689, "grad_norm": 0.0011091905180364847, "learning_rate": 2.7236154926963763e-05, "loss": 0.0002, "step": 42750 }, { "epoch": 12.75723232925738, "grad_norm": 0.13166595995426178, "learning_rate": 2.721958590705443e-05, "loss": 0.0189, "step": 42775 }, { "epoch": 12.764688338801074, "grad_norm": 0.015718284994363785, "learning_rate": 2.7203016887145094e-05, "loss": 0.0002, "step": 42800 }, { "epoch": 12.772144348344765, "grad_norm": 0.01700720004737377, "learning_rate": 2.718644786723576e-05, "loss": 0.0119, "step": 42825 }, { "epoch": 12.779600357888459, "grad_norm": 0.000691990542691201, "learning_rate": 2.7169878847326426e-05, "loss": 0.0585, "step": 42850 }, { "epoch": 12.78705636743215, "grad_norm": 0.033096782863140106, "learning_rate": 2.7153309827417088e-05, "loss": 0.0066, "step": 42875 }, { "epoch": 12.794512376975842, "grad_norm": 0.0045728497207164764, "learning_rate": 2.7136740807507754e-05, "loss": 0.0022, "step": 42900 }, { "epoch": 12.801968386519535, "grad_norm": 0.035219863057136536, "learning_rate": 2.712017178759842e-05, "loss": 0.0287, "step": 42925 }, { "epoch": 12.809424396063227, "grad_norm": 0.04110410436987877, "learning_rate": 2.7103602767689085e-05, "loss": 0.0012, "step": 42950 }, { "epoch": 12.81688040560692, "grad_norm": 0.0013123464304953814, "learning_rate": 2.7087033747779755e-05, "loss": 0.019, "step": 42975 }, { "epoch": 12.824336415150611, "grad_norm": 0.005283961072564125, "learning_rate": 2.707046472787042e-05, "loss": 0.006, "step": 43000 }, { "epoch": 12.831792424694303, "grad_norm": 0.003638888243585825, "learning_rate": 2.7053895707961086e-05, "loss": 0.0198, "step": 43025 }, { "epoch": 12.839248434237996, "grad_norm": 0.10352325439453125, "learning_rate": 2.7037326688051752e-05, "loss": 0.0011, "step": 43050 }, { "epoch": 12.846704443781688, "grad_norm": 
8.774328231811523, "learning_rate": 2.7020757668142414e-05, "loss": 0.0223, "step": 43075 }, { "epoch": 12.854160453325381, "grad_norm": 0.017438048496842384, "learning_rate": 2.700418864823308e-05, "loss": 0.0014, "step": 43100 }, { "epoch": 12.861616462869073, "grad_norm": 0.0033278854098170996, "learning_rate": 2.6987619628323746e-05, "loss": 0.0031, "step": 43125 }, { "epoch": 12.869072472412764, "grad_norm": 0.021386155858635902, "learning_rate": 2.697105060841441e-05, "loss": 0.0066, "step": 43150 }, { "epoch": 12.876528481956457, "grad_norm": 0.005346431862562895, "learning_rate": 2.6954481588505077e-05, "loss": 0.0233, "step": 43175 }, { "epoch": 12.883984491500149, "grad_norm": 0.18703578412532806, "learning_rate": 2.6937912568595743e-05, "loss": 0.0135, "step": 43200 }, { "epoch": 12.891440501043842, "grad_norm": 0.0013108194107189775, "learning_rate": 2.6921343548686412e-05, "loss": 0.0136, "step": 43225 }, { "epoch": 12.898896510587534, "grad_norm": 0.17484645545482635, "learning_rate": 2.6904774528777078e-05, "loss": 0.0017, "step": 43250 }, { "epoch": 12.906352520131225, "grad_norm": 0.008680760860443115, "learning_rate": 2.6888205508867743e-05, "loss": 0.0115, "step": 43275 }, { "epoch": 12.913808529674919, "grad_norm": 0.5126601457595825, "learning_rate": 2.6871636488958406e-05, "loss": 0.0152, "step": 43300 }, { "epoch": 12.92126453921861, "grad_norm": 1.5607045888900757, "learning_rate": 2.685506746904907e-05, "loss": 0.025, "step": 43325 }, { "epoch": 12.928720548762302, "grad_norm": 10.233118057250977, "learning_rate": 2.6838498449139737e-05, "loss": 0.0017, "step": 43350 }, { "epoch": 12.936176558305995, "grad_norm": 0.0010243066353723407, "learning_rate": 2.6821929429230403e-05, "loss": 0.0085, "step": 43375 }, { "epoch": 12.943632567849686, "grad_norm": 2.000396251678467, "learning_rate": 2.680536040932107e-05, "loss": 0.0107, "step": 43400 }, { "epoch": 12.95108857739338, "grad_norm": 0.002072014380246401, "learning_rate": 
2.678879138941173e-05, "loss": 0.0084, "step": 43425 }, { "epoch": 12.958544586937071, "grad_norm": 0.01992828957736492, "learning_rate": 2.6772222369502397e-05, "loss": 0.027, "step": 43450 }, { "epoch": 12.966000596480763, "grad_norm": 0.027412349358201027, "learning_rate": 2.675565334959307e-05, "loss": 0.001, "step": 43475 }, { "epoch": 12.973456606024456, "grad_norm": 0.0024006732273846865, "learning_rate": 2.6739084329683735e-05, "loss": 0.0332, "step": 43500 }, { "epoch": 12.980912615568148, "grad_norm": 0.004388559143990278, "learning_rate": 2.6722515309774397e-05, "loss": 0.005, "step": 43525 }, { "epoch": 12.98836862511184, "grad_norm": 0.009956770576536655, "learning_rate": 2.6705946289865063e-05, "loss": 0.0473, "step": 43550 }, { "epoch": 12.995824634655532, "grad_norm": 0.001230629743076861, "learning_rate": 2.668937726995573e-05, "loss": 0.027, "step": 43575 }, { "epoch": 13.0, "eval_gen_len": 8.7484, "eval_loss": 0.06554193049669266, "eval_rouge1": 97.8785, "eval_rouge2": 85.6962, "eval_rougeL": 97.854, "eval_rougeLsum": 97.8551, "eval_runtime": 99.1128, "eval_samples_per_second": 16.92, "eval_steps_per_second": 4.238, "step": 43589 }, { "epoch": 13.003280644199224, "grad_norm": 0.02172417752444744, "learning_rate": 2.6672808250046394e-05, "loss": 0.034, "step": 43600 }, { "epoch": 13.010736653742917, "grad_norm": 0.01802818849682808, "learning_rate": 2.665623923013706e-05, "loss": 0.0239, "step": 43625 }, { "epoch": 13.018192663286609, "grad_norm": 0.0032581316772848368, "learning_rate": 2.6639670210227722e-05, "loss": 0.0071, "step": 43650 }, { "epoch": 13.025648672830302, "grad_norm": 36.717140197753906, "learning_rate": 2.6623101190318388e-05, "loss": 0.0701, "step": 43675 }, { "epoch": 13.033104682373994, "grad_norm": 0.012505823746323586, "learning_rate": 2.660653217040906e-05, "loss": 0.013, "step": 43700 }, { "epoch": 13.040560691917685, "grad_norm": 0.1828284114599228, "learning_rate": 2.6589963150499726e-05, "loss": 0.0014, "step": 43725 
}, { "epoch": 13.048016701461378, "grad_norm": 14.492470741271973, "learning_rate": 2.657339413059039e-05, "loss": 0.0455, "step": 43750 }, { "epoch": 13.05547271100507, "grad_norm": 0.023192638531327248, "learning_rate": 2.6556825110681055e-05, "loss": 0.0421, "step": 43775 }, { "epoch": 13.062928720548761, "grad_norm": 0.006542236544191837, "learning_rate": 2.654025609077172e-05, "loss": 0.0004, "step": 43800 }, { "epoch": 13.070384730092455, "grad_norm": 0.001725667854771018, "learning_rate": 2.6523687070862386e-05, "loss": 0.0026, "step": 43825 }, { "epoch": 13.077840739636146, "grad_norm": 0.01235037762671709, "learning_rate": 2.6507118050953052e-05, "loss": 0.003, "step": 43850 }, { "epoch": 13.08529674917984, "grad_norm": 0.006246176082640886, "learning_rate": 2.6490549031043714e-05, "loss": 0.0019, "step": 43875 }, { "epoch": 13.092752758723531, "grad_norm": 0.19377729296684265, "learning_rate": 2.647398001113438e-05, "loss": 0.0008, "step": 43900 }, { "epoch": 13.100208768267223, "grad_norm": 0.010808723047375679, "learning_rate": 2.6457410991225046e-05, "loss": 0.0011, "step": 43925 }, { "epoch": 13.107664777810916, "grad_norm": 0.10874485224485397, "learning_rate": 2.6440841971315715e-05, "loss": 0.0508, "step": 43950 }, { "epoch": 13.115120787354607, "grad_norm": 4.295207977294922, "learning_rate": 2.642427295140638e-05, "loss": 0.0035, "step": 43975 }, { "epoch": 13.1225767968983, "grad_norm": 0.027464494109153748, "learning_rate": 2.6407703931497046e-05, "loss": 0.034, "step": 44000 }, { "epoch": 13.130032806441992, "grad_norm": 0.007744842674583197, "learning_rate": 2.6391134911587712e-05, "loss": 0.0144, "step": 44025 }, { "epoch": 13.137488815985684, "grad_norm": 0.026684366166591644, "learning_rate": 2.6374565891678378e-05, "loss": 0.0097, "step": 44050 }, { "epoch": 13.144944825529377, "grad_norm": 0.00497918576002121, "learning_rate": 2.6357996871769043e-05, "loss": 0.024, "step": 44075 }, { "epoch": 13.152400835073069, "grad_norm": 
0.0011154951062053442, "learning_rate": 2.6341427851859706e-05, "loss": 0.0351, "step": 44100 }, { "epoch": 13.159856844616762, "grad_norm": 0.07896313071250916, "learning_rate": 2.632485883195037e-05, "loss": 0.0005, "step": 44125 }, { "epoch": 13.167312854160453, "grad_norm": 0.0008197020506486297, "learning_rate": 2.6308289812041037e-05, "loss": 0.0072, "step": 44150 }, { "epoch": 13.174768863704145, "grad_norm": 0.005408796481788158, "learning_rate": 2.6291720792131706e-05, "loss": 0.0253, "step": 44175 }, { "epoch": 13.182224873247838, "grad_norm": 0.04275033622980118, "learning_rate": 2.6275151772222372e-05, "loss": 0.0243, "step": 44200 }, { "epoch": 13.18968088279153, "grad_norm": 0.009670349769294262, "learning_rate": 2.6258582752313038e-05, "loss": 0.0332, "step": 44225 }, { "epoch": 13.197136892335223, "grad_norm": 0.000518015876878053, "learning_rate": 2.6242013732403703e-05, "loss": 0.0136, "step": 44250 }, { "epoch": 13.204592901878915, "grad_norm": 0.03769877925515175, "learning_rate": 2.622544471249437e-05, "loss": 0.0135, "step": 44275 }, { "epoch": 13.212048911422606, "grad_norm": 0.05490969493985176, "learning_rate": 2.620887569258503e-05, "loss": 0.0024, "step": 44300 }, { "epoch": 13.2195049209663, "grad_norm": 0.86505126953125, "learning_rate": 2.6192306672675697e-05, "loss": 0.0024, "step": 44325 }, { "epoch": 13.22696093050999, "grad_norm": 0.004475884605199099, "learning_rate": 2.6175737652766363e-05, "loss": 0.0296, "step": 44350 }, { "epoch": 13.234416940053682, "grad_norm": 0.05450137332081795, "learning_rate": 2.615916863285703e-05, "loss": 0.0002, "step": 44375 }, { "epoch": 13.241872949597376, "grad_norm": 0.139171302318573, "learning_rate": 2.6142599612947694e-05, "loss": 0.0006, "step": 44400 }, { "epoch": 13.249328959141067, "grad_norm": 0.032572779804468155, "learning_rate": 2.6126030593038364e-05, "loss": 0.0002, "step": 44425 }, { "epoch": 13.25678496868476, "grad_norm": 59.228363037109375, "learning_rate": 
2.610946157312903e-05, "loss": 0.0387, "step": 44450 }, { "epoch": 13.264240978228452, "grad_norm": 0.001431209035217762, "learning_rate": 2.6092892553219695e-05, "loss": 0.0001, "step": 44475 }, { "epoch": 13.271696987772144, "grad_norm": 0.5849717855453491, "learning_rate": 2.607632353331036e-05, "loss": 0.005, "step": 44500 }, { "epoch": 13.279152997315837, "grad_norm": 0.007736376952379942, "learning_rate": 2.6059754513401023e-05, "loss": 0.0264, "step": 44525 }, { "epoch": 13.286609006859528, "grad_norm": 0.00018958588771056384, "learning_rate": 2.604318549349169e-05, "loss": 0.0002, "step": 44550 }, { "epoch": 13.294065016403222, "grad_norm": 52.449249267578125, "learning_rate": 2.6026616473582355e-05, "loss": 0.0724, "step": 44575 }, { "epoch": 13.301521025946913, "grad_norm": 0.0037214909680187702, "learning_rate": 2.601004745367302e-05, "loss": 0.0003, "step": 44600 }, { "epoch": 13.308977035490605, "grad_norm": 0.009356235153973103, "learning_rate": 2.5993478433763686e-05, "loss": 0.0005, "step": 44625 }, { "epoch": 13.316433045034298, "grad_norm": 8.970348358154297, "learning_rate": 2.5976909413854355e-05, "loss": 0.0087, "step": 44650 }, { "epoch": 13.32388905457799, "grad_norm": 0.0007844800129532814, "learning_rate": 2.596034039394502e-05, "loss": 0.0002, "step": 44675 }, { "epoch": 13.331345064121683, "grad_norm": 0.0024908827617764473, "learning_rate": 2.5943771374035687e-05, "loss": 0.0006, "step": 44700 }, { "epoch": 13.338801073665374, "grad_norm": 0.004964989144355059, "learning_rate": 2.5927202354126352e-05, "loss": 0.0028, "step": 44725 }, { "epoch": 13.346257083209066, "grad_norm": 1.3698699474334717, "learning_rate": 2.5910633334217015e-05, "loss": 0.0058, "step": 44750 }, { "epoch": 13.35371309275276, "grad_norm": 0.04636652022600174, "learning_rate": 2.589406431430768e-05, "loss": 0.0266, "step": 44775 }, { "epoch": 13.36116910229645, "grad_norm": 0.04417644068598747, "learning_rate": 2.5877495294398346e-05, "loss": 0.0054, "step": 44800 
}, { "epoch": 13.368625111840144, "grad_norm": 0.0034272114280611277, "learning_rate": 2.5860926274489012e-05, "loss": 0.0002, "step": 44825 }, { "epoch": 13.376081121383836, "grad_norm": 0.04019004851579666, "learning_rate": 2.5845020015376052e-05, "loss": 0.0306, "step": 44850 }, { "epoch": 13.383537130927527, "grad_norm": 3.370781898498535, "learning_rate": 2.5828450995466718e-05, "loss": 0.0017, "step": 44875 }, { "epoch": 13.39099314047122, "grad_norm": 0.15892678499221802, "learning_rate": 2.5811881975557384e-05, "loss": 0.0011, "step": 44900 }, { "epoch": 13.398449150014912, "grad_norm": 0.0011944427387788892, "learning_rate": 2.579531295564805e-05, "loss": 0.0188, "step": 44925 }, { "epoch": 13.405905159558603, "grad_norm": 0.023739833384752274, "learning_rate": 2.5778743935738715e-05, "loss": 0.0029, "step": 44950 }, { "epoch": 13.413361169102297, "grad_norm": 0.002228564117103815, "learning_rate": 2.5762174915829378e-05, "loss": 0.0002, "step": 44975 }, { "epoch": 13.420817178645988, "grad_norm": 0.007577619515359402, "learning_rate": 2.5745605895920043e-05, "loss": 0.0058, "step": 45000 }, { "epoch": 13.428273188189682, "grad_norm": 0.00293240649625659, "learning_rate": 2.572903687601071e-05, "loss": 0.0185, "step": 45025 }, { "epoch": 13.435729197733373, "grad_norm": 0.3604615330696106, "learning_rate": 2.571246785610138e-05, "loss": 0.0478, "step": 45050 }, { "epoch": 13.443185207277065, "grad_norm": 0.005937446840107441, "learning_rate": 2.5695898836192044e-05, "loss": 0.0035, "step": 45075 }, { "epoch": 13.450641216820758, "grad_norm": 0.003275347175076604, "learning_rate": 2.567932981628271e-05, "loss": 0.0061, "step": 45100 }, { "epoch": 13.45809722636445, "grad_norm": 0.003746124915778637, "learning_rate": 2.5662760796373375e-05, "loss": 0.0029, "step": 45125 }, { "epoch": 13.465553235908143, "grad_norm": 12.623387336730957, "learning_rate": 2.564619177646404e-05, "loss": 0.0121, "step": 45150 }, { "epoch": 13.473009245451834, "grad_norm": 
0.2839594781398773, "learning_rate": 2.5629622756554707e-05, "loss": 0.0013, "step": 45175 }, { "epoch": 13.480465254995526, "grad_norm": 6.323868751525879, "learning_rate": 2.561305373664537e-05, "loss": 0.01, "step": 45200 }, { "epoch": 13.487921264539219, "grad_norm": 0.0072549269534647465, "learning_rate": 2.5596484716736035e-05, "loss": 0.002, "step": 45225 }, { "epoch": 13.49537727408291, "grad_norm": 0.01581634394824505, "learning_rate": 2.55799156968267e-05, "loss": 0.0002, "step": 45250 }, { "epoch": 13.502833283626604, "grad_norm": 0.013846264220774174, "learning_rate": 2.5563346676917366e-05, "loss": 0.0001, "step": 45275 }, { "epoch": 13.510289293170295, "grad_norm": 0.004495398607105017, "learning_rate": 2.5546777657008035e-05, "loss": 0.001, "step": 45300 }, { "epoch": 13.517745302713987, "grad_norm": 0.029844263568520546, "learning_rate": 2.55302086370987e-05, "loss": 0.0029, "step": 45325 }, { "epoch": 13.52520131225768, "grad_norm": 0.002655371557921171, "learning_rate": 2.5513639617189367e-05, "loss": 0.0001, "step": 45350 }, { "epoch": 13.532657321801372, "grad_norm": 0.0574614480137825, "learning_rate": 2.5497070597280033e-05, "loss": 0.0117, "step": 45375 }, { "epoch": 13.540113331345065, "grad_norm": 0.0013383477926254272, "learning_rate": 2.54805015773707e-05, "loss": 0.0002, "step": 45400 }, { "epoch": 13.547569340888757, "grad_norm": 0.061951614916324615, "learning_rate": 2.546393255746136e-05, "loss": 0.001, "step": 45425 }, { "epoch": 13.555025350432448, "grad_norm": 19.441818237304688, "learning_rate": 2.5447363537552026e-05, "loss": 0.0544, "step": 45450 }, { "epoch": 13.562481359976141, "grad_norm": 0.0009547994122840464, "learning_rate": 2.5430794517642692e-05, "loss": 0.0055, "step": 45475 }, { "epoch": 13.569937369519833, "grad_norm": 8.116317749023438, "learning_rate": 2.5414225497733358e-05, "loss": 0.004, "step": 45500 }, { "epoch": 13.577393379063524, "grad_norm": 0.016299117356538773, "learning_rate": 2.5397656477824024e-05, 
"loss": 0.0001, "step": 45525 }, { "epoch": 13.584849388607218, "grad_norm": 0.10050185024738312, "learning_rate": 2.5381087457914693e-05, "loss": 0.0019, "step": 45550 }, { "epoch": 13.59230539815091, "grad_norm": 0.0007255134987644851, "learning_rate": 2.536451843800536e-05, "loss": 0.0002, "step": 45575 }, { "epoch": 13.599761407694603, "grad_norm": 0.017564982175827026, "learning_rate": 2.5347949418096024e-05, "loss": 0.0096, "step": 45600 }, { "epoch": 13.607217417238294, "grad_norm": 0.002698665950447321, "learning_rate": 2.5331380398186687e-05, "loss": 0.0121, "step": 45625 }, { "epoch": 13.614673426781986, "grad_norm": 0.03711654245853424, "learning_rate": 2.5314811378277352e-05, "loss": 0.0072, "step": 45650 }, { "epoch": 13.622129436325679, "grad_norm": 0.015306944027543068, "learning_rate": 2.5298242358368018e-05, "loss": 0.0275, "step": 45675 }, { "epoch": 13.62958544586937, "grad_norm": 0.04432107135653496, "learning_rate": 2.5281673338458684e-05, "loss": 0.0008, "step": 45700 }, { "epoch": 13.637041455413064, "grad_norm": 0.013447861187160015, "learning_rate": 2.526510431854935e-05, "loss": 0.0009, "step": 45725 }, { "epoch": 13.644497464956755, "grad_norm": 0.012608986347913742, "learning_rate": 2.5248535298640015e-05, "loss": 0.0013, "step": 45750 }, { "epoch": 13.651953474500447, "grad_norm": 0.0007090241997502744, "learning_rate": 2.5231966278730684e-05, "loss": 0.0058, "step": 45775 }, { "epoch": 13.65940948404414, "grad_norm": 0.08077728748321533, "learning_rate": 2.521539725882135e-05, "loss": 0.0259, "step": 45800 }, { "epoch": 13.666865493587832, "grad_norm": 1.085524082183838, "learning_rate": 2.5198828238912016e-05, "loss": 0.0181, "step": 45825 }, { "epoch": 13.674321503131525, "grad_norm": 0.0005606280756182969, "learning_rate": 2.5182259219002678e-05, "loss": 0.0004, "step": 45850 }, { "epoch": 13.681777512675216, "grad_norm": 0.004463412798941135, "learning_rate": 2.5165690199093344e-05, "loss": 0.0014, "step": 45875 }, { "epoch": 
13.689233522218908, "grad_norm": 0.15161412954330444, "learning_rate": 2.514912117918401e-05, "loss": 0.0128, "step": 45900 }, { "epoch": 13.696689531762601, "grad_norm": 0.009529800154268742, "learning_rate": 2.5132552159274675e-05, "loss": 0.0178, "step": 45925 }, { "epoch": 13.704145541306293, "grad_norm": 4.095335006713867, "learning_rate": 2.511598313936534e-05, "loss": 0.011, "step": 45950 }, { "epoch": 13.711601550849984, "grad_norm": 0.05283172428607941, "learning_rate": 2.5099414119456003e-05, "loss": 0.0004, "step": 45975 }, { "epoch": 13.719057560393678, "grad_norm": 0.012699637562036514, "learning_rate": 2.508284509954667e-05, "loss": 0.0002, "step": 46000 }, { "epoch": 13.726513569937369, "grad_norm": 0.9599093794822693, "learning_rate": 2.506627607963734e-05, "loss": 0.0346, "step": 46025 }, { "epoch": 13.733969579481062, "grad_norm": 0.0005534732481464744, "learning_rate": 2.5049707059728007e-05, "loss": 0.0164, "step": 46050 }, { "epoch": 13.741425589024754, "grad_norm": 0.013052860274910927, "learning_rate": 2.503313803981867e-05, "loss": 0.005, "step": 46075 }, { "epoch": 13.748881598568445, "grad_norm": 0.0030805133283138275, "learning_rate": 2.5016569019909335e-05, "loss": 0.0594, "step": 46100 }, { "epoch": 13.756337608112139, "grad_norm": 0.010317280888557434, "learning_rate": 2.5e-05, "loss": 0.0076, "step": 46125 }, { "epoch": 13.76379361765583, "grad_norm": 0.0013628338929265738, "learning_rate": 2.4983430980090667e-05, "loss": 0.0014, "step": 46150 }, { "epoch": 13.771249627199524, "grad_norm": 0.0019057797035202384, "learning_rate": 2.4966861960181333e-05, "loss": 0.0003, "step": 46175 }, { "epoch": 13.778705636743215, "grad_norm": 0.10203558206558228, "learning_rate": 2.4950292940272e-05, "loss": 0.0003, "step": 46200 }, { "epoch": 13.786161646286907, "grad_norm": 0.007893604226410389, "learning_rate": 2.4933723920362664e-05, "loss": 0.0139, "step": 46225 }, { "epoch": 13.7936176558306, "grad_norm": 0.04614598676562309, "learning_rate": 
2.491715490045333e-05, "loss": 0.0022, "step": 46250 }, { "epoch": 13.801073665374291, "grad_norm": 0.010470214299857616, "learning_rate": 2.4900585880543996e-05, "loss": 0.0021, "step": 46275 }, { "epoch": 13.808529674917985, "grad_norm": 0.001019405317492783, "learning_rate": 2.488401686063466e-05, "loss": 0.0131, "step": 46300 }, { "epoch": 13.815985684461676, "grad_norm": 0.01035932544618845, "learning_rate": 2.4867447840725327e-05, "loss": 0.0069, "step": 46325 }, { "epoch": 13.823441694005368, "grad_norm": 0.004152575973421335, "learning_rate": 2.4850878820815993e-05, "loss": 0.0263, "step": 46350 }, { "epoch": 13.830897703549061, "grad_norm": 0.00521685928106308, "learning_rate": 2.483430980090666e-05, "loss": 0.0389, "step": 46375 }, { "epoch": 13.838353713092753, "grad_norm": 0.008577450178563595, "learning_rate": 2.4817740780997324e-05, "loss": 0.0006, "step": 46400 }, { "epoch": 13.845809722636446, "grad_norm": 0.004177641589194536, "learning_rate": 2.480117176108799e-05, "loss": 0.0046, "step": 46425 }, { "epoch": 13.853265732180137, "grad_norm": 0.0356358103454113, "learning_rate": 2.4784602741178656e-05, "loss": 0.0005, "step": 46450 }, { "epoch": 13.860721741723829, "grad_norm": 1.355373740196228, "learning_rate": 2.476803372126932e-05, "loss": 0.0133, "step": 46475 }, { "epoch": 13.868177751267522, "grad_norm": 24.991962432861328, "learning_rate": 2.4751464701359987e-05, "loss": 0.0436, "step": 46500 }, { "epoch": 13.875633760811214, "grad_norm": 0.007507434580475092, "learning_rate": 2.473489568145065e-05, "loss": 0.0004, "step": 46525 }, { "epoch": 13.883089770354907, "grad_norm": 29.0565242767334, "learning_rate": 2.471832666154132e-05, "loss": 0.0673, "step": 46550 }, { "epoch": 13.890545779898599, "grad_norm": 0.00047484287642873824, "learning_rate": 2.4701757641631984e-05, "loss": 0.0258, "step": 46575 }, { "epoch": 13.89800178944229, "grad_norm": 0.0007500798092223704, "learning_rate": 2.468518862172265e-05, "loss": 0.0204, "step": 46600 }, { 
"epoch": 13.905457798985983, "grad_norm": 0.009910466149449348, "learning_rate": 2.4668619601813316e-05, "loss": 0.004, "step": 46625 }, { "epoch": 13.912913808529675, "grad_norm": 0.03574687987565994, "learning_rate": 2.4652050581903978e-05, "loss": 0.0359, "step": 46650 }, { "epoch": 13.920369818073366, "grad_norm": 0.004774956498295069, "learning_rate": 2.4635481561994647e-05, "loss": 0.0013, "step": 46675 }, { "epoch": 13.92782582761706, "grad_norm": 0.005982845090329647, "learning_rate": 2.4618912542085313e-05, "loss": 0.0005, "step": 46700 }, { "epoch": 13.935281837160751, "grad_norm": 0.23766636848449707, "learning_rate": 2.460234352217598e-05, "loss": 0.0157, "step": 46725 }, { "epoch": 13.942737846704444, "grad_norm": 0.026977479457855225, "learning_rate": 2.458577450226664e-05, "loss": 0.0192, "step": 46750 }, { "epoch": 13.950193856248136, "grad_norm": 0.009664705954492092, "learning_rate": 2.4569205482357307e-05, "loss": 0.0485, "step": 46775 }, { "epoch": 13.957649865791828, "grad_norm": 0.025634411722421646, "learning_rate": 2.4552636462447976e-05, "loss": 0.0291, "step": 46800 }, { "epoch": 13.96510587533552, "grad_norm": 2.642550468444824, "learning_rate": 2.453606744253864e-05, "loss": 0.0156, "step": 46825 }, { "epoch": 13.972561884879212, "grad_norm": 0.004317351151257753, "learning_rate": 2.4519498422629304e-05, "loss": 0.0001, "step": 46850 }, { "epoch": 13.980017894422906, "grad_norm": 0.19009476900100708, "learning_rate": 2.450292940271997e-05, "loss": 0.0559, "step": 46875 }, { "epoch": 13.987473903966597, "grad_norm": 0.9585381150245667, "learning_rate": 2.4486360382810635e-05, "loss": 0.0051, "step": 46900 }, { "epoch": 13.994929913510289, "grad_norm": 0.01820201426744461, "learning_rate": 2.4469791362901305e-05, "loss": 0.0011, "step": 46925 }, { "epoch": 14.0, "eval_gen_len": 8.7346, "eval_loss": 0.06252285093069077, "eval_rouge1": 97.9816, "eval_rouge2": 85.7167, "eval_rougeL": 97.9566, "eval_rougeLsum": 97.9606, "eval_runtime": 
97.0515, "eval_samples_per_second": 17.279, "eval_steps_per_second": 4.328, "step": 46942 } ], "logging_steps": 25, "max_steps": 83825, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 232890506477568.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }