{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.998716302952504, "eval_steps": 500, "global_step": 3505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025673940949935817, "grad_norm": 5121380.5, "learning_rate": 2.0000000000000003e-06, "loss": 13.9863, "step": 10 }, { "epoch": 0.051347881899871634, "grad_norm": 5153433.5, "learning_rate": 4.000000000000001e-06, "loss": 13.5554, "step": 20 }, { "epoch": 0.07702182284980745, "grad_norm": 5378263.0, "learning_rate": 6e-06, "loss": 12.3429, "step": 30 }, { "epoch": 0.10269576379974327, "grad_norm": 5853167.0, "learning_rate": 8.000000000000001e-06, "loss": 10.9424, "step": 40 }, { "epoch": 0.12836970474967907, "grad_norm": 5810211.5, "learning_rate": 1e-05, "loss": 8.8056, "step": 50 }, { "epoch": 0.1540436456996149, "grad_norm": 5421540.5, "learning_rate": 1.2e-05, "loss": 6.446, "step": 60 }, { "epoch": 0.1797175866495507, "grad_norm": 3989913.0, "learning_rate": 1.4000000000000001e-05, "loss": 4.0088, "step": 70 }, { "epoch": 0.20539152759948653, "grad_norm": 1593379.125, "learning_rate": 1.6000000000000003e-05, "loss": 2.4106, "step": 80 }, { "epoch": 0.23106546854942234, "grad_norm": 351223.375, "learning_rate": 1.8e-05, "loss": 1.5032, "step": 90 }, { "epoch": 0.25673940949935814, "grad_norm": 255315.6875, "learning_rate": 2e-05, "loss": 1.0421, "step": 100 }, { "epoch": 0.28241335044929394, "grad_norm": 150451.453125, "learning_rate": 2.2000000000000003e-05, "loss": 0.6102, "step": 110 }, { "epoch": 0.3080872913992298, "grad_norm": 44021.93359375, "learning_rate": 2.4e-05, "loss": 0.3687, "step": 120 }, { "epoch": 0.3337612323491656, "grad_norm": 23825.30078125, "learning_rate": 2.6000000000000002e-05, "loss": 0.2745, "step": 130 }, { "epoch": 0.3594351732991014, "grad_norm": 19372.984375, "learning_rate": 2.8000000000000003e-05, "loss": 0.233, "step": 140 }, { "epoch": 0.3851091142490372, "grad_norm": 19907.76171875, "learning_rate": 3e-05, "loss": 0.2075, "step": 150 }, { "epoch": 0.41078305519897307, "grad_norm": 23577.75390625, "learning_rate": 3.2000000000000005e-05, "loss": 0.197, "step": 160 }, { "epoch": 0.43645699614890887, "grad_norm": 15442.431640625, "learning_rate": 3.4000000000000007e-05, "loss": 0.1759, "step": 170 }, { "epoch": 0.4621309370988447, "grad_norm": 24589.8671875, "learning_rate": 3.6e-05, "loss": 0.1626, "step": 180 }, { "epoch": 0.4878048780487805, "grad_norm": 11427.1728515625, "learning_rate": 3.8e-05, "loss": 0.1541, "step": 190 }, { "epoch": 0.5134788189987163, "grad_norm": 10105.453125, "learning_rate": 4e-05, "loss": 0.1542, "step": 200 }, { "epoch": 0.5391527599486521, "grad_norm": 9209.875, "learning_rate": 4.2e-05, "loss": 0.1497, "step": 210 }, { "epoch": 0.5648267008985879, "grad_norm": 8737.0283203125, "learning_rate": 4.4000000000000006e-05, "loss": 0.1397, "step": 220 }, { "epoch": 0.5905006418485238, "grad_norm": 28856.4609375, "learning_rate": 4.600000000000001e-05, "loss": 0.1466, "step": 230 }, { "epoch": 0.6161745827984596, "grad_norm": 9787.6689453125, "learning_rate": 4.8e-05, "loss": 0.1358, "step": 240 }, { "epoch": 0.6418485237483954, "grad_norm": 6893.62353515625, "learning_rate": 5e-05, "loss": 0.1265, "step": 250 }, { "epoch": 0.6675224646983312, "grad_norm": 14635.2509765625, "learning_rate": 5.2000000000000004e-05, "loss": 0.1376, "step": 260 }, { "epoch": 0.693196405648267, "grad_norm": 7151.79150390625, "learning_rate": 5.4000000000000005e-05, "loss": 0.1375, "step": 270 }, { "epoch": 0.7188703465982028, "grad_norm": 6492.265625, "learning_rate": 5.6000000000000006e-05, "loss": 0.1271, "step": 280 }, { "epoch": 0.7445442875481386, "grad_norm": 6572.00244140625, "learning_rate": 5.8e-05, "loss": 0.1334, "step": 290 }, { "epoch": 0.7702182284980744, "grad_norm": 9629.9423828125, "learning_rate": 6e-05, "loss": 0.1384, "step": 300 }, { "epoch": 0.7958921694480102, "grad_norm": 6990.53466796875, "learning_rate": 6.2e-05, "loss": 0.1349, "step": 310 }, { "epoch": 0.8215661103979461, "grad_norm": 8195.203125, "learning_rate": 6.400000000000001e-05, "loss": 0.1231, "step": 320 }, { "epoch": 0.8472400513478819, "grad_norm": 5189.6279296875, "learning_rate": 6.6e-05, "loss": 0.1274, "step": 330 }, { "epoch": 0.8729139922978177, "grad_norm": 8494.017578125, "learning_rate": 6.800000000000001e-05, "loss": 0.1324, "step": 340 }, { "epoch": 0.8985879332477535, "grad_norm": 6397.20166015625, "learning_rate": 7e-05, "loss": 0.1223, "step": 350 }, { "epoch": 0.9242618741976893, "grad_norm": 5670.24072265625, "learning_rate": 7.2e-05, "loss": 0.129, "step": 360 }, { "epoch": 0.9499358151476252, "grad_norm": 6880.46533203125, "learning_rate": 7.4e-05, "loss": 0.1261, "step": 370 }, { "epoch": 0.975609756097561, "grad_norm": 5716.75537109375, "learning_rate": 7.6e-05, "loss": 0.1263, "step": 380 }, { "epoch": 1.0012836970474968, "grad_norm": 7472.732421875, "learning_rate": 7.800000000000001e-05, "loss": 0.1229, "step": 390 }, { "epoch": 1.0269576379974326, "grad_norm": 6848.66162109375, "learning_rate": 8e-05, "loss": 0.1236, "step": 400 }, { "epoch": 1.0526315789473684, "grad_norm": 6818.19921875, "learning_rate": 8.2e-05, "loss": 0.1198, "step": 410 }, { "epoch": 1.0783055198973042, "grad_norm": 6299.18115234375, "learning_rate": 8.4e-05, "loss": 0.127, "step": 420 }, { "epoch": 1.10397946084724, "grad_norm": 5963.009765625, "learning_rate": 8.6e-05, "loss": 0.1228, "step": 430 }, { "epoch": 1.1296534017971758, "grad_norm": 5657.501953125, "learning_rate": 8.800000000000001e-05, "loss": 0.1248, "step": 440 }, { "epoch": 1.1553273427471118, "grad_norm": 15452.26953125, "learning_rate": 9e-05, "loss": 0.1237, "step": 450 }, { "epoch": 1.1810012836970474, "grad_norm": 6293.9599609375, "learning_rate": 9.200000000000001e-05, "loss": 0.1218, "step": 460 }, { "epoch": 1.2066752246469834, "grad_norm": 6370.39794921875, "learning_rate": 9.4e-05, "loss": 0.1191, "step": 470 }, { "epoch": 1.2323491655969192, "grad_norm": 6101.39697265625, "learning_rate": 9.6e-05, "loss": 0.1161, "step": 480 }, { "epoch": 1.258023106546855, "grad_norm": 6093.3916015625, "learning_rate": 9.8e-05, "loss": 0.1247, "step": 490 }, { "epoch": 1.2836970474967908, "grad_norm": 4891.51171875, "learning_rate": 0.0001, "loss": 0.117, "step": 500 }, { "epoch": 1.2836970474967908, "eval_gen_len": 38.948, "eval_loss": 0.10336166620254517, "eval_rouge1": 0.5729, "eval_rouge2": 0.2787, "eval_rougeL": 0.5355, "eval_runtime": 65.3435, "eval_samples_per_second": 7.652, "eval_steps_per_second": 0.49, "step": 500 }, { "epoch": 1.3093709884467266, "grad_norm": 4472.9599609375, "learning_rate": 9.970501474926254e-05, "loss": 0.1215, "step": 510 }, { "epoch": 1.3350449293966624, "grad_norm": 5620.90576171875, "learning_rate": 9.941002949852508e-05, "loss": 0.1212, "step": 520 }, { "epoch": 1.3607188703465982, "grad_norm": 8320.6953125, "learning_rate": 9.911504424778762e-05, "loss": 0.1145, "step": 530 }, { "epoch": 1.386392811296534, "grad_norm": 6414.8359375, "learning_rate": 9.882005899705014e-05, "loss": 0.1248, "step": 540 }, { "epoch": 1.4120667522464698, "grad_norm": 6748.93017578125, "learning_rate": 9.85250737463127e-05, "loss": 0.1167, "step": 550 }, { "epoch": 1.4377406931964056, "grad_norm": 6198.98779296875, "learning_rate": 9.823008849557522e-05, "loss": 0.126, "step": 560 }, { "epoch": 1.4634146341463414, "grad_norm": 6716.78515625, "learning_rate": 9.793510324483777e-05, "loss": 0.1173, "step": 570 }, { "epoch": 1.4890885750962772, "grad_norm": 5211.44921875, "learning_rate": 9.76401179941003e-05, "loss": 0.1117, "step": 580 }, { "epoch": 1.514762516046213, "grad_norm": 5593.43798828125, "learning_rate": 9.734513274336283e-05, "loss": 0.1113, "step": 590 }, { "epoch": 1.540436456996149, "grad_norm": 6742.3544921875, "learning_rate": 9.705014749262537e-05, "loss": 0.1184, "step": 600 }, { "epoch": 1.5661103979460846, "grad_norm": 5100.9228515625, "learning_rate": 9.675516224188791e-05, "loss": 0.1122, "step": 610 }, { "epoch": 1.5917843388960207, "grad_norm": 6717.302734375, "learning_rate": 9.646017699115044e-05, "loss": 0.1211, "step": 620 }, { "epoch": 1.6174582798459562, "grad_norm": 8748.5849609375, "learning_rate": 9.616519174041299e-05, "loss": 0.1198, "step": 630 }, { "epoch": 1.6431322207958923, "grad_norm": 6425.04248046875, "learning_rate": 9.587020648967551e-05, "loss": 0.1146, "step": 640 }, { "epoch": 1.6688061617458279, "grad_norm": 5612.4580078125, "learning_rate": 9.557522123893806e-05, "loss": 0.1113, "step": 650 }, { "epoch": 1.6944801026957639, "grad_norm": 6821.66259765625, "learning_rate": 9.528023598820059e-05, "loss": 0.1144, "step": 660 }, { "epoch": 1.7201540436456995, "grad_norm": 7305.1376953125, "learning_rate": 9.498525073746313e-05, "loss": 0.1139, "step": 670 }, { "epoch": 1.7458279845956355, "grad_norm": 6363.8701171875, "learning_rate": 9.469026548672566e-05, "loss": 0.1121, "step": 680 }, { "epoch": 1.7715019255455713, "grad_norm": 5113.75537109375, "learning_rate": 9.43952802359882e-05, "loss": 0.115, "step": 690 }, { "epoch": 1.797175866495507, "grad_norm": 6163.470703125, "learning_rate": 9.410029498525074e-05, "loss": 0.1244, "step": 700 }, { "epoch": 1.822849807445443, "grad_norm": 6645.03466796875, "learning_rate": 9.380530973451328e-05, "loss": 0.1168, "step": 710 }, { "epoch": 1.8485237483953787, "grad_norm": 9238.5634765625, "learning_rate": 9.351032448377582e-05, "loss": 0.1198, "step": 720 }, { "epoch": 1.8741976893453145, "grad_norm": 5459.36962890625, "learning_rate": 9.321533923303836e-05, "loss": 0.1175, "step": 730 }, { "epoch": 1.8998716302952503, "grad_norm": 5480.2998046875, "learning_rate": 9.29203539823009e-05, "loss": 0.1115, "step": 740 }, { "epoch": 1.925545571245186, "grad_norm": 5369.232421875, "learning_rate": 9.262536873156342e-05, "loss": 0.1126, "step": 750 }, { "epoch": 1.951219512195122, "grad_norm": 5776.08154296875, "learning_rate": 9.233038348082597e-05, "loss": 0.1184, "step": 760 }, { "epoch": 1.976893453145058, "grad_norm": 4656.33203125, "learning_rate": 9.20353982300885e-05, "loss": 0.1145, "step": 770 }, { "epoch": 2.0025673940949935, "grad_norm": 6602.3056640625, "learning_rate": 9.174041297935103e-05, "loss": 0.126, "step": 780 }, { "epoch": 2.0282413350449295, "grad_norm": 5742.654296875, "learning_rate": 9.144542772861357e-05, "loss": 0.1036, "step": 790 }, { "epoch": 2.053915275994865, "grad_norm": 5546.1728515625, "learning_rate": 9.115044247787611e-05, "loss": 0.1077, "step": 800 }, { "epoch": 2.079589216944801, "grad_norm": 6285.85693359375, "learning_rate": 9.085545722713865e-05, "loss": 0.1156, "step": 810 }, { "epoch": 2.1052631578947367, "grad_norm": 5655.2763671875, "learning_rate": 9.056047197640119e-05, "loss": 0.1136, "step": 820 }, { "epoch": 2.1309370988446728, "grad_norm": 5984.7080078125, "learning_rate": 9.026548672566371e-05, "loss": 0.1076, "step": 830 }, { "epoch": 2.1566110397946083, "grad_norm": 5037.02490234375, "learning_rate": 8.997050147492626e-05, "loss": 0.1181, "step": 840 }, { "epoch": 2.1822849807445444, "grad_norm": 5577.619140625, "learning_rate": 8.96755162241888e-05, "loss": 0.1147, "step": 850 }, { "epoch": 2.20795892169448, "grad_norm": 4631.0732421875, "learning_rate": 8.938053097345133e-05, "loss": 0.1106, "step": 860 }, { "epoch": 2.233632862644416, "grad_norm": 4876.57177734375, "learning_rate": 8.908554572271388e-05, "loss": 0.1099, "step": 870 }, { "epoch": 2.2593068035943515, "grad_norm": 4877.7177734375, "learning_rate": 8.87905604719764e-05, "loss": 0.1134, "step": 880 }, { "epoch": 2.2849807445442876, "grad_norm": 5603.54150390625, "learning_rate": 8.849557522123895e-05, "loss": 0.1091, "step": 890 }, { "epoch": 2.3106546854942236, "grad_norm": 4900.8056640625, "learning_rate": 8.820058997050148e-05, "loss": 0.111, "step": 900 }, { "epoch": 2.336328626444159, "grad_norm": 5891.17919921875, "learning_rate": 8.790560471976402e-05, "loss": 0.113, "step": 910 }, { "epoch": 2.3620025673940948, "grad_norm": 6097.0400390625, "learning_rate": 8.761061946902655e-05, "loss": 0.1131, "step": 920 }, { "epoch": 2.387676508344031, "grad_norm": 5553.201171875, "learning_rate": 8.731563421828909e-05, "loss": 0.105, "step": 930 }, { "epoch": 2.413350449293967, "grad_norm": 4939.27392578125, "learning_rate": 8.702064896755162e-05, "loss": 0.115, "step": 940 }, { "epoch": 2.4390243902439024, "grad_norm": 5012.8740234375, "learning_rate": 8.672566371681417e-05, "loss": 0.1095, "step": 950 }, { "epoch": 2.4646983311938384, "grad_norm": 6639.900390625, "learning_rate": 8.64306784660767e-05, "loss": 0.1096, "step": 960 }, { "epoch": 2.490372272143774, "grad_norm": 6572.72705078125, "learning_rate": 8.613569321533924e-05, "loss": 0.1091, "step": 970 }, { "epoch": 2.51604621309371, "grad_norm": 4813.23291015625, "learning_rate": 8.584070796460177e-05, "loss": 0.1114, "step": 980 }, { "epoch": 2.5417201540436456, "grad_norm": 6162.396484375, "learning_rate": 8.554572271386431e-05, "loss": 0.1101, "step": 990 }, { "epoch": 2.5673940949935816, "grad_norm": 6070.53857421875, "learning_rate": 8.525073746312685e-05, "loss": 0.1103, "step": 1000 }, { "epoch": 2.5673940949935816, "eval_gen_len": 38.946, "eval_loss": 0.0969744473695755, "eval_rouge1": 0.5903, "eval_rouge2": 0.3078, "eval_rougeL": 0.556, "eval_runtime": 65.5669, "eval_samples_per_second": 7.626, "eval_steps_per_second": 0.488, "step": 1000 }, { "epoch": 2.593068035943517, "grad_norm": 6648.20751953125, "learning_rate": 8.495575221238938e-05, "loss": 0.109, "step": 1010 }, { "epoch": 2.6187419768934532, "grad_norm": 4565.7236328125, "learning_rate": 8.466076696165192e-05, "loss": 0.1062, "step": 1020 }, { "epoch": 2.644415917843389, "grad_norm": 5601.0908203125, "learning_rate": 8.436578171091446e-05, "loss": 0.1099, "step": 1030 }, { "epoch": 2.670089858793325, "grad_norm": 5187.435546875, "learning_rate": 8.4070796460177e-05, "loss": 0.1127, "step": 1040 }, { "epoch": 2.6957637997432604, "grad_norm": 5046.6767578125, "learning_rate": 8.377581120943954e-05, "loss": 0.122, "step": 1050 }, { "epoch": 2.7214377406931964, "grad_norm": 4885.3818359375, "learning_rate": 8.348082595870208e-05, "loss": 0.105, "step": 1060 }, { "epoch": 2.7471116816431325, "grad_norm": 5473.66015625, "learning_rate": 8.31858407079646e-05, "loss": 0.1081, "step": 1070 }, { "epoch": 2.772785622593068, "grad_norm": 6577.04931640625, "learning_rate": 8.289085545722715e-05, "loss": 0.1161, "step": 1080 }, { "epoch": 2.7984595635430036, "grad_norm": 5628.3349609375, "learning_rate": 8.259587020648968e-05, "loss": 0.1057, "step": 1090 }, { "epoch": 2.8241335044929397, "grad_norm": 5690.630859375, "learning_rate": 8.230088495575221e-05, "loss": 0.1116, "step": 1100 }, { "epoch": 2.8498074454428757, "grad_norm": 4804.5458984375, "learning_rate": 8.200589970501475e-05, "loss": 0.103, "step": 1110 }, { "epoch": 2.8754813863928113, "grad_norm": 4892.4580078125, "learning_rate": 8.171091445427729e-05, "loss": 0.1116, "step": 1120 }, { "epoch": 2.901155327342747, "grad_norm": 5333.4130859375, "learning_rate": 8.141592920353983e-05, "loss": 0.106, "step": 1130 }, { "epoch": 2.926829268292683, "grad_norm": 5062.69580078125, "learning_rate": 8.112094395280237e-05, "loss": 0.1086, "step": 1140 }, { "epoch": 2.952503209242619, "grad_norm": 4429.697265625, "learning_rate": 8.082595870206489e-05, "loss": 0.1079, "step": 1150 }, { "epoch": 2.9781771501925545, "grad_norm": 5827.998046875, "learning_rate": 8.053097345132744e-05, "loss": 0.1151, "step": 1160 }, { "epoch": 3.0038510911424905, "grad_norm": 5520.6826171875, "learning_rate": 8.023598820058997e-05, "loss": 0.1063, "step": 1170 }, { "epoch": 3.029525032092426, "grad_norm": 5321.1328125, "learning_rate": 7.99410029498525e-05, "loss": 0.1028, "step": 1180 }, { "epoch": 3.055198973042362, "grad_norm": 6147.1103515625, "learning_rate": 7.964601769911504e-05, "loss": 0.1027, "step": 1190 }, { "epoch": 3.0808729139922977, "grad_norm": 5837.15673828125, "learning_rate": 7.935103244837758e-05, "loss": 0.1129, "step": 1200 }, { "epoch": 3.1065468549422337, "grad_norm": 5890.27490234375, "learning_rate": 7.905604719764012e-05, "loss": 0.105, "step": 1210 }, { "epoch": 3.1322207958921693, "grad_norm": 5193.16259765625, "learning_rate": 7.876106194690266e-05, "loss": 0.1025, "step": 1220 }, { "epoch": 3.1578947368421053, "grad_norm": 5144.00048828125, "learning_rate": 7.84660766961652e-05, "loss": 0.1053, "step": 1230 }, { "epoch": 3.183568677792041, "grad_norm": 4484.66552734375, "learning_rate": 7.817109144542774e-05, "loss": 0.1111, "step": 1240 }, { "epoch": 3.209242618741977, "grad_norm": 5307.7431640625, "learning_rate": 7.787610619469027e-05, "loss": 0.1088, "step": 1250 }, { "epoch": 3.2349165596919125, "grad_norm": 5560.54736328125, "learning_rate": 7.75811209439528e-05, "loss": 0.1056, "step": 1260 }, { "epoch": 3.2605905006418485, "grad_norm": 5743.2255859375, "learning_rate": 7.728613569321535e-05, "loss": 0.1057, "step": 1270 }, { "epoch": 3.2862644415917845, "grad_norm": 5270.234375, "learning_rate": 7.699115044247787e-05, "loss": 0.1072, "step": 1280 }, { "epoch": 3.31193838254172, "grad_norm": 5415.4580078125, "learning_rate": 7.669616519174043e-05, "loss": 0.1044, "step": 1290 }, { "epoch": 3.337612323491656, "grad_norm": 5077.26123046875, "learning_rate": 7.640117994100295e-05, "loss": 0.1063, "step": 1300 }, { "epoch": 3.3632862644415917, "grad_norm": 5815.587890625, "learning_rate": 7.610619469026549e-05, "loss": 0.104, "step": 1310 }, { "epoch": 3.3889602053915278, "grad_norm": 5168.4140625, "learning_rate": 7.581120943952803e-05, "loss": 0.1057, "step": 1320 }, { "epoch": 3.4146341463414633, "grad_norm": 6175.1962890625, "learning_rate": 7.551622418879057e-05, "loss": 0.1085, "step": 1330 }, { "epoch": 3.4403080872913994, "grad_norm": 4866.1806640625, "learning_rate": 7.522123893805309e-05, "loss": 0.1088, "step": 1340 }, { "epoch": 3.465982028241335, "grad_norm": 4987.28662109375, "learning_rate": 7.492625368731564e-05, "loss": 0.1144, "step": 1350 }, { "epoch": 3.491655969191271, "grad_norm": 6046.6435546875, "learning_rate": 7.463126843657817e-05, "loss": 0.1066, "step": 1360 }, { "epoch": 3.5173299101412066, "grad_norm": 6097.72802734375, "learning_rate": 7.433628318584072e-05, "loss": 0.1109, "step": 1370 }, { "epoch": 3.5430038510911426, "grad_norm": 6237.98388671875, "learning_rate": 7.404129793510324e-05, "loss": 0.1061, "step": 1380 }, { "epoch": 3.568677792041078, "grad_norm": 5106.3720703125, "learning_rate": 7.374631268436578e-05, "loss": 0.1052, "step": 1390 }, { "epoch": 3.594351732991014, "grad_norm": 5083.40234375, "learning_rate": 7.345132743362832e-05, "loss": 0.1012, "step": 1400 }, { "epoch": 3.62002567394095, "grad_norm": 5191.7353515625, "learning_rate": 7.315634218289086e-05, "loss": 0.1081, "step": 1410 }, { "epoch": 3.645699614890886, "grad_norm": 6808.04638671875, "learning_rate": 7.28613569321534e-05, "loss": 0.1046, "step": 1420 }, { "epoch": 3.6713735558408214, "grad_norm": 5069.12353515625, "learning_rate": 7.256637168141593e-05, "loss": 0.1018, "step": 1430 }, { "epoch": 3.6970474967907574, "grad_norm": 5624.8330078125, "learning_rate": 7.227138643067847e-05, "loss": 0.1087, "step": 1440 }, { "epoch": 3.7227214377406934, "grad_norm": 8308.7177734375, "learning_rate": 7.197640117994101e-05, "loss": 0.1045, "step": 1450 }, { "epoch": 3.748395378690629, "grad_norm": 5380.990234375, "learning_rate": 7.168141592920355e-05, "loss": 0.1083, "step": 1460 }, { "epoch": 3.7740693196405646, "grad_norm": 4756.90576171875, "learning_rate": 7.138643067846607e-05, "loss": 0.1058, "step": 1470 }, { "epoch": 3.7997432605905006, "grad_norm": 5780.90625, "learning_rate": 7.109144542772862e-05, "loss": 0.1029, "step": 1480 }, { "epoch": 3.8254172015404366, "grad_norm": 5286.49560546875, "learning_rate": 7.079646017699115e-05, "loss": 0.1029, "step": 1490 }, { "epoch": 3.851091142490372, "grad_norm": 5398.28369140625, "learning_rate": 7.050147492625369e-05, "loss": 0.1086, "step": 1500 }, { "epoch": 3.851091142490372, "eval_gen_len": 38.946, "eval_loss": 0.09477131813764572, "eval_rouge1": 0.5988, "eval_rouge2": 0.3178, "eval_rougeL": 0.5661, "eval_runtime": 65.5594, "eval_samples_per_second": 7.627, "eval_steps_per_second": 0.488, "step": 1500 }, { "epoch": 3.8767650834403082, "grad_norm": 4950.18212890625, "learning_rate": 7.020648967551623e-05, "loss": 0.1025, "step": 1510 }, { "epoch": 3.902439024390244, "grad_norm": 4885.29248046875, "learning_rate": 6.991150442477876e-05, "loss": 0.1067, "step": 1520 }, { "epoch": 3.92811296534018, "grad_norm": 6418.5791015625, "learning_rate": 6.96165191740413e-05, "loss": 0.1009, "step": 1530 }, { "epoch": 3.9537869062901154, "grad_norm": 6914.34375, "learning_rate": 6.932153392330384e-05, "loss": 0.1085, "step": 1540 }, { "epoch": 3.9794608472400514, "grad_norm": 5611.89306640625, "learning_rate": 6.902654867256638e-05, "loss": 0.1125, "step": 1550 }, { "epoch": 4.005134788189987, "grad_norm": 4575.3046875, "learning_rate": 6.873156342182892e-05, "loss": 0.1037, "step": 1560 }, { "epoch": 4.030808729139923, "grad_norm": 5809.431640625, "learning_rate": 6.843657817109145e-05, "loss": 0.0999, "step": 1570 }, { "epoch": 4.056482670089859, "grad_norm": 6907.01025390625, "learning_rate": 6.814159292035398e-05, "loss": 0.1007, "step": 1580 }, { "epoch": 4.082156611039794, "grad_norm": 6448.38330078125, "learning_rate": 6.784660766961653e-05, "loss": 0.11, "step": 1590 }, { "epoch": 4.10783055198973, "grad_norm": 15915.1982421875, "learning_rate": 6.755162241887906e-05, "loss": 0.1106, "step": 1600 }, { "epoch": 4.133504492939666, "grad_norm": 5690.85107421875, "learning_rate": 6.725663716814161e-05, "loss": 0.1037, "step": 1610 }, { "epoch": 4.159178433889602, "grad_norm": 4913.6220703125, "learning_rate": 6.696165191740413e-05, "loss": 0.1052, "step": 1620 }, { "epoch": 4.184852374839538, "grad_norm": 5320.2470703125, "learning_rate": 6.666666666666667e-05, "loss": 0.1018, "step": 1630 }, { "epoch": 4.2105263157894735, "grad_norm": 6042.61376953125, "learning_rate": 6.637168141592921e-05, "loss": 0.1013, "step": 1640 }, { "epoch": 4.2362002567394095, "grad_norm": 5034.08203125, "learning_rate": 6.607669616519175e-05, "loss": 0.1085, "step": 1650 }, { "epoch": 4.2618741976893455, "grad_norm": 6053.23876953125, "learning_rate": 6.578171091445427e-05, "loss": 0.0986, "step": 1660 }, { "epoch": 4.2875481386392815, "grad_norm": 5543.45556640625, "learning_rate": 6.548672566371682e-05, "loss": 0.0986, "step": 1670 }, { "epoch": 4.313222079589217, "grad_norm": 6083.2236328125, "learning_rate": 6.519174041297935e-05, "loss": 0.1028, "step": 1680 }, { "epoch": 4.338896020539153, "grad_norm": 5847.65087890625, "learning_rate": 6.48967551622419e-05, "loss": 0.1001, "step": 1690 }, { "epoch": 4.364569961489089, "grad_norm": 5046.88623046875, "learning_rate": 6.460176991150442e-05, "loss": 0.102, "step": 1700 }, { "epoch": 4.390243902439025, "grad_norm": 7761.01611328125, "learning_rate": 6.430678466076696e-05, "loss": 0.1017, "step": 1710 }, { "epoch": 4.41591784338896, "grad_norm": 5590.93505859375, "learning_rate": 6.40117994100295e-05, "loss": 0.1058, "step": 1720 }, { "epoch": 4.441591784338896, "grad_norm": 4478.46484375, "learning_rate": 6.371681415929204e-05, "loss": 0.0995, "step": 1730 }, { "epoch": 4.467265725288832, "grad_norm": 6958.63720703125, "learning_rate": 6.342182890855458e-05, "loss": 0.1028, "step": 1740 }, { "epoch": 4.492939666238768, "grad_norm": 5210.4853515625, "learning_rate": 6.312684365781711e-05, "loss": 0.1078, "step": 1750 }, { "epoch": 4.518613607188703, "grad_norm": 4667.54345703125, "learning_rate": 6.283185840707965e-05, "loss": 0.1025, "step": 1760 }, { "epoch": 4.544287548138639, "grad_norm": 5578.6943359375, "learning_rate": 6.253687315634219e-05, "loss": 0.1029, "step": 1770 }, { "epoch": 4.569961489088575, "grad_norm": 6289.7841796875, "learning_rate": 6.224188790560473e-05, "loss": 0.1062, "step": 1780 }, { "epoch": 4.595635430038511, "grad_norm": 5193.00244140625, "learning_rate": 6.194690265486725e-05, "loss": 0.1104, "step": 1790 }, { "epoch": 4.621309370988447, "grad_norm": 5092.68408203125, "learning_rate": 6.16519174041298e-05, "loss": 0.0996, "step": 1800 }, { "epoch": 4.646983311938382, "grad_norm": 5535.3857421875, "learning_rate": 6.135693215339233e-05, "loss": 0.1066, "step": 1810 }, { "epoch": 4.672657252888318, "grad_norm": 6088.28515625, "learning_rate": 6.106194690265487e-05, "loss": 0.1031, "step": 1820 }, { "epoch": 4.698331193838254, "grad_norm": 5986.71240234375, "learning_rate": 6.0766961651917406e-05, "loss": 0.1043, "step": 1830 }, { "epoch": 4.7240051347881895, "grad_norm": 5196.69140625, "learning_rate": 6.0471976401179945e-05, "loss": 0.1035, "step": 1840 }, { "epoch": 4.7496790757381255, "grad_norm": 5394.7138671875, "learning_rate": 6.017699115044248e-05, "loss": 0.1017, "step": 1850 }, { "epoch": 4.775353016688062, "grad_norm": 5689.53173828125, "learning_rate": 5.988200589970502e-05, "loss": 0.107, "step": 1860 }, { "epoch": 4.801026957637998, "grad_norm": 5098.4541015625, "learning_rate": 5.958702064896755e-05, "loss": 0.1032, "step": 1870 }, { "epoch": 4.826700898587934, "grad_norm": 4243.0087890625, "learning_rate": 5.92920353982301e-05, "loss": 0.1017, "step": 1880 }, { "epoch": 4.852374839537869, "grad_norm": 5340.123046875, "learning_rate": 5.899705014749263e-05, "loss": 0.0986, "step": 1890 }, { "epoch": 4.878048780487805, "grad_norm": 5436.1259765625, "learning_rate": 5.870206489675516e-05, "loss": 0.1, "step": 1900 }, { "epoch": 4.903722721437741, "grad_norm": 5866.4375, "learning_rate": 5.8407079646017705e-05, "loss": 0.104, "step": 1910 }, { "epoch": 4.929396662387677, "grad_norm": 5687.0595703125, "learning_rate": 5.8112094395280236e-05, "loss": 0.1003, "step": 1920 }, { "epoch": 4.955070603337612, "grad_norm": 5049.65869140625, "learning_rate": 5.781710914454278e-05, "loss": 0.1051, "step": 1930 }, { "epoch": 4.980744544287548, "grad_norm": 4348.83251953125, "learning_rate": 5.752212389380531e-05, "loss": 0.1067, "step": 1940 }, { "epoch": 5.006418485237484, "grad_norm": 5278.10498046875, "learning_rate": 5.7227138643067844e-05, "loss": 0.1028, "step": 1950 }, { "epoch": 5.03209242618742, "grad_norm": 5227.31689453125, "learning_rate": 5.693215339233039e-05, "loss": 0.1015, "step": 1960 }, { "epoch": 5.057766367137355, "grad_norm": 5626.7041015625, "learning_rate": 5.663716814159292e-05, "loss": 0.0953, "step": 1970 }, { "epoch": 5.083440308087291, "grad_norm": 4941.787109375, "learning_rate": 5.634218289085545e-05, "loss": 0.1, "step": 1980 }, { "epoch": 5.109114249037227, "grad_norm": 5543.74365234375, "learning_rate": 5.6047197640118e-05, "loss": 0.0975, "step": 1990 }, { "epoch": 5.134788189987163, "grad_norm": 6526.22509765625, "learning_rate": 5.575221238938053e-05, "loss": 0.1046, "step": 2000 }, { "epoch": 5.134788189987163, "eval_gen_len": 38.946, "eval_loss": 0.09348437190055847, "eval_rouge1": 0.6042, "eval_rouge2": 0.3248, "eval_rougeL": 0.5712, "eval_runtime": 65.0847, "eval_samples_per_second": 7.682, "eval_steps_per_second": 0.492, "step": 2000 }, { "epoch": 5.160462130937099, "grad_norm": 5070.046875, "learning_rate": 5.545722713864307e-05, "loss": 0.0981, "step": 2010 }, { "epoch": 5.186136071887034, "grad_norm": 5264.22509765625, "learning_rate": 5.5162241887905605e-05, "loss": 0.1023, "step": 2020 }, { "epoch": 5.21181001283697, "grad_norm": 10262.3994140625, "learning_rate": 5.486725663716814e-05, "loss": 0.1007, "step": 2030 }, { "epoch": 5.2374839537869065, "grad_norm": 4638.310546875, "learning_rate": 5.457227138643069e-05, "loss": 0.1019, "step": 2040 }, { "epoch": 5.2631578947368425, "grad_norm": 5691.34033203125, "learning_rate": 5.427728613569322e-05, "loss": 0.1048, "step": 2050 }, { "epoch": 5.288831835686778, "grad_norm": 5892.60986328125, "learning_rate": 5.398230088495575e-05, "loss": 0.1002, "step": 2060 }, { "epoch": 5.314505776636714, "grad_norm": 5043.25, "learning_rate": 5.3687315634218295e-05, "loss": 0.1026, "step": 2070 }, { "epoch": 5.34017971758665, "grad_norm": 5076.90283203125, "learning_rate": 5.339233038348083e-05, "loss": 0.103, "step": 2080 }, { "epoch": 5.365853658536586, "grad_norm": 5730.2998046875, "learning_rate": 5.309734513274337e-05, "loss": 0.0995, "step": 2090 }, { "epoch": 5.391527599486521, "grad_norm": 5071.3759765625, "learning_rate": 5.28023598820059e-05, "loss": 0.1006, "step": 2100 }, { "epoch": 5.417201540436457, "grad_norm": 4912.38134765625, "learning_rate": 5.2507374631268435e-05, "loss": 0.0965, "step": 2110 }, { "epoch": 5.442875481386393, "grad_norm": 5349.1376953125, "learning_rate": 5.221238938053098e-05, "loss": 0.1016, "step": 2120 }, { "epoch": 5.468549422336329, "grad_norm": 6012.4912109375, "learning_rate": 5.191740412979351e-05, "loss": 0.0985, "step": 2130 }, { "epoch": 5.494223363286264, "grad_norm": 6078.17333984375, "learning_rate": 5.162241887905604e-05, "loss": 0.1001, "step": 2140 }, { "epoch": 5.5198973042362, "grad_norm": 6352.015625, "learning_rate": 5.132743362831859e-05, "loss": 0.0995, "step": 2150 }, { "epoch": 5.545571245186136, "grad_norm": 10780.03125, "learning_rate": 5.103244837758112e-05, "loss": 0.1037, "step": 2160 }, { "epoch": 5.571245186136072, "grad_norm": 4540.59326171875, "learning_rate": 5.0737463126843664e-05, "loss": 0.1003, "step": 2170 }, { "epoch": 5.596919127086007, "grad_norm": 5141.4697265625, "learning_rate": 5.0442477876106195e-05, "loss": 0.0993, "step": 2180 }, { "epoch": 5.622593068035943, "grad_norm": 8023.310546875, "learning_rate": 5.014749262536873e-05, "loss": 0.0923, "step": 2190 }, { "epoch": 5.648267008985879, "grad_norm": 6443.39404296875, "learning_rate": 4.985250737463127e-05, "loss": 0.1059, "step": 2200 }, { "epoch": 5.673940949935815, "grad_norm": 4546.185546875, "learning_rate": 4.955752212389381e-05, "loss": 0.1027, "step": 2210 }, { "epoch": 5.699614890885751, "grad_norm": 5331.25634765625, "learning_rate": 4.926253687315635e-05, "loss": 0.0994, "step": 2220 }, { "epoch": 5.7252888318356865, "grad_norm": 5486.52587890625, "learning_rate": 4.8967551622418886e-05, "loss": 0.1097, "step": 2230 }, { "epoch": 5.7509627727856225, "grad_norm": 5083.9794921875, "learning_rate": 4.867256637168142e-05, "loss": 0.0971, "step": 2240 }, { "epoch": 5.7766367137355585, "grad_norm": 5799.4931640625, "learning_rate": 4.8377581120943956e-05, "loss": 0.103, "step": 2250 }, { "epoch": 5.802310654685495, "grad_norm": 5407.1708984375, "learning_rate": 4.8082595870206494e-05, "loss": 0.1091, "step": 2260 }, { "epoch": 5.82798459563543, "grad_norm": 4890.9697265625, "learning_rate": 4.778761061946903e-05, "loss": 0.1013, "step": 2270 }, { "epoch": 5.853658536585366, "grad_norm": 5403.1416015625, "learning_rate": 4.749262536873156e-05, "loss": 0.1076, "step": 2280 }, { "epoch": 5.879332477535302, "grad_norm": 5159.65234375, "learning_rate": 4.71976401179941e-05, "loss": 0.0994, "step": 2290 }, { "epoch": 5.905006418485238, "grad_norm": 6055.45458984375, "learning_rate": 4.690265486725664e-05, "loss": 0.0998, "step": 2300 }, { "epoch": 5.930680359435174, "grad_norm": 5306.44677734375, "learning_rate": 4.660766961651918e-05, "loss": 0.0995, "step": 2310 }, { "epoch": 5.956354300385109, "grad_norm": 5193.0009765625, "learning_rate": 4.631268436578171e-05, "loss": 0.1011, "step": 2320 }, { "epoch": 5.982028241335045, "grad_norm": 6859.47509765625, "learning_rate": 4.601769911504425e-05, "loss": 0.1043, "step": 2330 }, { "epoch": 6.007702182284981, "grad_norm": 4973.0458984375, "learning_rate": 4.5722713864306786e-05, "loss": 0.1002, "step": 2340 }, { "epoch": 6.033376123234916, "grad_norm": 5652.50439453125, "learning_rate": 4.5427728613569324e-05, "loss": 0.0994, "step": 2350 }, { "epoch": 6.059050064184852, "grad_norm": 6935.865234375, "learning_rate": 4.5132743362831855e-05, "loss": 0.0998, "step": 2360 }, { "epoch": 6.084724005134788, "grad_norm": 4675.81982421875, "learning_rate": 4.48377581120944e-05, "loss": 0.1014, "step": 2370 }, { "epoch": 6.110397946084724, "grad_norm": 4515.3134765625, "learning_rate": 4.454277286135694e-05, "loss": 0.0968, "step": 2380 }, { "epoch": 6.13607188703466, "grad_norm": 5213.7578125, "learning_rate": 4.4247787610619477e-05, "loss": 0.0987, "step": 2390 }, { "epoch": 6.161745827984595, "grad_norm": 5425.05615234375, "learning_rate": 4.395280235988201e-05, "loss": 0.102, "step": 2400 }, { "epoch": 6.187419768934531, "grad_norm": 4345.66552734375, "learning_rate": 4.3657817109144546e-05, "loss": 0.0978, "step": 2410 }, { "epoch": 6.213093709884467, "grad_norm": 5057.90087890625, "learning_rate": 4.3362831858407084e-05, "loss": 0.1011, "step": 2420 }, { "epoch": 6.238767650834403, "grad_norm": 6916.2607421875, "learning_rate": 4.306784660766962e-05, "loss": 0.1023, "step": 2430 }, { "epoch": 6.264441591784339, "grad_norm": 6013.05126953125, "learning_rate": 4.2772861356932154e-05, "loss": 0.0995, "step": 2440 }, { "epoch": 6.290115532734275, "grad_norm": 4742.91357421875, "learning_rate": 4.247787610619469e-05, "loss": 0.0974, "step": 2450 }, { "epoch": 6.315789473684211, "grad_norm": 4979.93115234375, "learning_rate": 4.218289085545723e-05, "loss": 0.1019, "step": 2460 }, { "epoch": 6.341463414634147, "grad_norm": 5349.9130859375, "learning_rate": 4.188790560471977e-05, "loss": 0.1027, "step": 2470 }, { "epoch": 6.367137355584082, "grad_norm": 5003.3203125, "learning_rate": 4.15929203539823e-05, "loss": 0.1038, "step": 2480 }, { "epoch": 6.392811296534018, "grad_norm": 5897.6796875, "learning_rate": 4.129793510324484e-05, "loss": 0.0998, "step": 2490 }, { "epoch": 6.418485237483954, "grad_norm": 5018.42138671875, "learning_rate": 4.1002949852507376e-05, "loss": 0.0962, "step": 2500 }, { "epoch": 6.418485237483954, "eval_gen_len": 38.946, "eval_loss": 0.09220927208662033, "eval_rouge1": 0.6077, "eval_rouge2": 0.3279, "eval_rougeL": 0.5755, "eval_runtime": 65.0093, "eval_samples_per_second": 7.691, "eval_steps_per_second": 0.492, "step": 2500 }, { "epoch": 6.44415917843389, "grad_norm": 6092.09375, "learning_rate": 4.0707964601769914e-05, "loss": 0.0929, "step": 2510 }, { "epoch": 6.469833119383825, "grad_norm": 6269.76171875, "learning_rate": 4.0412979351032446e-05, "loss": 0.0972, "step": 2520 }, { "epoch": 6.495507060333761, "grad_norm": 4338.68896484375, "learning_rate": 4.0117994100294984e-05, "loss": 0.1009, "step": 2530 }, { "epoch": 6.521181001283697, "grad_norm": 4670.00537109375, "learning_rate": 3.982300884955752e-05, "loss": 0.1032, "step": 2540 }, { "epoch": 6.546854942233633, "grad_norm": 5199.564453125, "learning_rate": 3.952802359882006e-05, "loss": 0.0977, "step": 2550 }, { "epoch": 6.572528883183569, "grad_norm": 6262.904296875, "learning_rate": 3.92330383480826e-05, "loss": 0.0966, "step": 2560 }, { "epoch": 6.598202824133504, "grad_norm": 7214.66748046875, "learning_rate": 3.893805309734514e-05, "loss": 0.0942, "step": 2570 }, { "epoch": 6.62387676508344, "grad_norm": 5746.22705078125, "learning_rate": 3.8643067846607675e-05, "loss": 0.0946, "step": 2580 }, { "epoch": 6.649550706033376, "grad_norm": 6876.60986328125, "learning_rate": 3.834808259587021e-05, "loss": 0.1031, "step": 2590 }, { "epoch": 6.675224646983312, "grad_norm": 5216.8642578125, "learning_rate": 3.8053097345132744e-05, "loss": 0.0984, "step": 2600 }, { "epoch": 6.700898587933247, "grad_norm": 5965.8583984375, "learning_rate": 3.775811209439528e-05, "loss": 0.099, "step": 2610 }, { "epoch": 6.7265725288831835, "grad_norm": 7099.044921875, "learning_rate": 3.746312684365782e-05, "loss": 0.1052, "step": 2620 }, { "epoch": 6.7522464698331195, "grad_norm": 4748.5703125, "learning_rate": 3.716814159292036e-05, "loss": 0.1045, "step": 2630 }, { "epoch": 6.7779204107830555, "grad_norm": 5743.19921875, "learning_rate": 3.687315634218289e-05, "loss": 0.0937, "step": 2640 }, { "epoch": 6.803594351732991, "grad_norm": 5680.45068359375, "learning_rate": 3.657817109144543e-05, "loss": 0.0965, "step": 2650 }, { "epoch": 6.829268292682927, "grad_norm": 7245.03564453125, "learning_rate": 3.628318584070797e-05, "loss": 0.0909, "step": 2660 }, { "epoch": 6.854942233632863, "grad_norm": 5226.4365234375, "learning_rate": 3.5988200589970505e-05, "loss": 0.0987, "step": 2670 }, { "epoch": 6.880616174582799, "grad_norm": 5511.99853515625, "learning_rate": 3.5693215339233036e-05, "loss": 0.1066, "step": 2680 }, { "epoch": 6.906290115532734, "grad_norm": 5711.359375, "learning_rate": 3.5398230088495574e-05, "loss": 0.0995, "step": 2690 }, { "epoch": 6.93196405648267, "grad_norm": 5092.283203125, "learning_rate": 3.510324483775811e-05, "loss": 0.0971, "step": 2700 }, { "epoch": 6.957637997432606, "grad_norm": 6100.78271484375, "learning_rate": 3.480825958702065e-05, "loss": 0.1009, "step": 2710 }, { "epoch": 6.983311938382542, "grad_norm": 5600.61181640625, "learning_rate": 3.451327433628319e-05, "loss": 0.1026, "step": 2720 }, { "epoch": 7.008985879332478, "grad_norm": 5000.9541015625, "learning_rate": 3.421828908554573e-05, "loss": 0.101, "step": 2730 }, { "epoch": 7.034659820282413, "grad_norm": 5288.25048828125, "learning_rate": 3.3923303834808265e-05, "loss": 0.0961, "step": 2740 }, { "epoch": 7.060333761232349, "grad_norm": 5404.33837890625, "learning_rate": 3.3628318584070804e-05, "loss": 0.1074, "step": 2750 }, { "epoch": 7.086007702182285, "grad_norm": 4586.51708984375, "learning_rate": 3.3333333333333335e-05, "loss": 0.0978, "step": 2760 }, { "epoch": 7.111681643132221, "grad_norm": 5383.466796875, "learning_rate": 3.303834808259587e-05, "loss": 0.0983, "step": 2770 }, { "epoch": 7.137355584082156, "grad_norm": 5845.02294921875, "learning_rate": 3.274336283185841e-05, "loss": 0.0922, "step": 2780 }, { "epoch": 7.163029525032092, "grad_norm": 5654.388671875, "learning_rate": 3.244837758112095e-05, "loss": 0.0941, "step": 2790 }, { "epoch": 7.188703465982028, "grad_norm": 5832.42724609375, "learning_rate": 3.215339233038348e-05, "loss": 0.0949, "step": 2800 }, { "epoch": 7.214377406931964, "grad_norm": 5299.91015625, "learning_rate": 3.185840707964602e-05, "loss": 0.0988, "step": 2810 }, { "epoch": 7.2400513478818995, "grad_norm": 5652.5087890625, "learning_rate": 3.156342182890856e-05, "loss": 0.0982, "step": 2820 }, { "epoch": 7.2657252888318355, "grad_norm": 6181.0361328125, "learning_rate": 3.1268436578171095e-05, "loss": 0.098, "step": 2830 }, { "epoch": 7.291399229781772, "grad_norm": 13162.5078125, "learning_rate": 3.097345132743363e-05, "loss": 0.0951, "step": 2840 }, { "epoch": 7.317073170731708, "grad_norm": 5318.009765625, "learning_rate": 3.0678466076696165e-05, "loss": 0.0988, "step": 2850 }, { "epoch": 7.342747111681643, "grad_norm": 5820.310546875, "learning_rate": 3.0383480825958703e-05, "loss": 0.0983, "step": 2860 }, { "epoch": 7.368421052631579, "grad_norm": 5990.56640625, "learning_rate": 3.008849557522124e-05, "loss": 0.0982, "step": 2870 }, { "epoch": 7.394094993581515, "grad_norm": 5594.0703125, "learning_rate": 2.9793510324483776e-05, "loss": 0.0974, "step": 2880 }, { "epoch": 7.419768934531451, "grad_norm": 6317.15234375, "learning_rate": 2.9498525073746314e-05, "loss": 0.0932, "step": 2890 }, { "epoch": 7.445442875481387, "grad_norm": 8022.15185546875, "learning_rate": 2.9203539823008852e-05, "loss": 0.1041, "step": 2900 }, { "epoch": 7.471116816431322, "grad_norm": 5091.68310546875, "learning_rate": 2.890855457227139e-05, "loss": 0.0995, "step": 2910 }, { "epoch": 7.496790757381258, "grad_norm": 6386.40869140625, "learning_rate": 2.8613569321533922e-05, "loss": 0.0964, "step": 2920 }, { "epoch": 7.522464698331194, "grad_norm": 4850.58203125, "learning_rate": 2.831858407079646e-05, "loss": 0.1063, "step": 2930 }, { "epoch": 7.548138639281129, "grad_norm": 6846.75146484375, "learning_rate": 2.8023598820059e-05, "loss": 0.102, "step": 2940 }, { "epoch": 7.573812580231065, "grad_norm": 5613.95166015625, "learning_rate": 2.7728613569321537e-05, "loss": 0.0977, "step": 2950 }, { "epoch": 7.599486521181001, "grad_norm": 5055.47705078125, "learning_rate": 2.743362831858407e-05, "loss": 0.0937, "step": 2960 }, { "epoch": 7.625160462130937, "grad_norm": 5020.2568359375, "learning_rate": 2.713864306784661e-05, "loss": 0.0978, "step": 2970 }, { "epoch": 7.650834403080873, "grad_norm": 5974.265625, "learning_rate": 2.6843657817109148e-05, "loss": 0.098, "step": 2980 }, { "epoch": 7.676508344030808, "grad_norm": 6458.8662109375, "learning_rate": 2.6548672566371686e-05, "loss": 0.0964, "step": 2990 }, { "epoch": 7.702182284980744, "grad_norm": 5247.0791015625, "learning_rate": 2.6253687315634217e-05, "loss": 0.1029, "step": 3000 }, { "epoch": 7.702182284980744, "eval_gen_len": 38.946, "eval_loss": 0.09166006743907928, "eval_rouge1": 0.6133, "eval_rouge2": 0.3322, "eval_rougeL": 0.5794, "eval_runtime": 65.3195, "eval_samples_per_second": 7.655, "eval_steps_per_second": 0.49, "step": 3000 }, { "epoch": 7.7278562259306804, "grad_norm": 9114.529296875, "learning_rate": 2.5958702064896756e-05, "loss": 0.0958, "step": 3010 }, { "epoch": 7.7535301668806165, "grad_norm": 4675.69384765625, "learning_rate": 2.5663716814159294e-05, "loss": 0.0967, "step": 3020 }, { "epoch": 7.779204107830552, "grad_norm": 5986.85546875, "learning_rate": 2.5368731563421832e-05, "loss": 0.0963, "step": 3030 }, { "epoch": 7.804878048780488, "grad_norm": 5686.59716796875, "learning_rate": 2.5073746312684367e-05, "loss": 0.1, "step": 3040 }, { "epoch": 7.830551989730424, "grad_norm": 4628.58447265625, "learning_rate": 2.4778761061946905e-05, "loss": 0.0971, "step": 3050 }, { "epoch": 7.85622593068036, "grad_norm": 4568.95068359375, "learning_rate": 2.4483775811209443e-05, "loss": 0.0995, "step": 3060 }, { "epoch": 7.881899871630296, "grad_norm": 5026.5517578125, "learning_rate": 2.4188790560471978e-05, "loss": 0.0997, "step": 3070 }, { "epoch": 7.907573812580231, "grad_norm": 5142.33544921875, "learning_rate": 2.3893805309734516e-05, "loss": 0.0989, "step": 3080 }, { "epoch": 7.933247753530167, "grad_norm": 4715.99169921875, "learning_rate": 2.359882005899705e-05, "loss": 0.0982, "step": 3090 }, { "epoch": 7.958921694480103, "grad_norm": 7074.0263671875, "learning_rate": 2.330383480825959e-05, "loss": 0.0936, "step": 3100 }, { "epoch": 7.984595635430038, "grad_norm": 5483.7958984375, "learning_rate": 2.3008849557522124e-05, "loss": 0.1055, "step": 3110 }, { "epoch": 8.010269576379974, "grad_norm": 21462.302734375, "learning_rate": 2.2713864306784662e-05, "loss": 0.097, "step": 3120 }, { "epoch": 8.03594351732991, "grad_norm": 5375.9345703125, "learning_rate": 2.24188790560472e-05, "loss": 0.0945, "step": 3130 }, { "epoch": 8.061617458279846, "grad_norm": 5927.3203125, "learning_rate": 2.2123893805309738e-05, "loss": 0.0919, "step": 3140 }, { "epoch": 8.087291399229782, "grad_norm": 4952.16064453125, "learning_rate": 2.1828908554572273e-05, "loss": 0.1023, "step": 3150 }, { "epoch": 8.112965340179718, "grad_norm": 4753.6865234375, "learning_rate": 2.153392330383481e-05, "loss": 0.1038, "step": 3160 }, { "epoch": 8.138639281129654, "grad_norm": 4751.951171875, "learning_rate": 2.1238938053097346e-05, "loss": 0.0953, "step": 3170 }, { "epoch": 8.164313222079588, "grad_norm": 5049.07470703125, "learning_rate": 2.0943952802359884e-05, "loss": 0.0995, "step": 3180 }, { "epoch": 8.189987163029524, "grad_norm": 5914.5595703125, "learning_rate": 2.064896755162242e-05, "loss": 0.0994, "step": 3190 }, { "epoch": 8.21566110397946, "grad_norm": 5663.07568359375, "learning_rate": 2.0353982300884957e-05, "loss": 0.099, "step": 3200 }, { "epoch": 8.241335044929397, "grad_norm": 5172.39208984375, "learning_rate": 2.0058997050147492e-05, "loss": 0.1056, "step": 3210 }, { "epoch": 8.267008985879333, "grad_norm": 4296.75732421875, "learning_rate": 1.976401179941003e-05, "loss": 0.0917, "step": 3220 }, { "epoch": 8.292682926829269, "grad_norm": 5991.23046875, "learning_rate": 1.946902654867257e-05, "loss": 0.1005, "step": 3230 }, { "epoch": 8.318356867779205, "grad_norm": 4786.93017578125, "learning_rate": 1.9174041297935107e-05, "loss": 0.1013, "step": 3240 }, { "epoch": 8.34403080872914, "grad_norm": 5587.01416015625, "learning_rate": 1.887905604719764e-05, "loss": 0.1061, "step": 3250 }, { "epoch": 8.369704749679077, "grad_norm": 5002.3935546875, "learning_rate": 1.858407079646018e-05, "loss": 0.0981, "step": 3260 }, { "epoch": 8.39537869062901, "grad_norm": 5792.34814453125, "learning_rate": 1.8289085545722714e-05, "loss": 0.0982, "step": 3270 }, { "epoch": 8.421052631578947, "grad_norm": 5482.14501953125, "learning_rate": 1.7994100294985252e-05, "loss": 0.0977, "step": 3280 }, { "epoch": 8.446726572528883, "grad_norm": 5414.59326171875, "learning_rate": 1.7699115044247787e-05, "loss": 0.0959, "step": 3290 }, { "epoch": 8.472400513478819, "grad_norm": 6676.62548828125, "learning_rate": 1.7404129793510325e-05, "loss": 0.0964, "step": 3300 }, { "epoch": 8.498074454428755, "grad_norm": 5211.7705078125, "learning_rate": 1.7109144542772864e-05, "loss": 0.0936, "step": 3310 }, { "epoch": 8.523748395378691, "grad_norm": 5187.91015625, "learning_rate": 1.6814159292035402e-05, "loss": 0.092, "step": 3320 }, { "epoch": 8.549422336328627, "grad_norm": 16135.931640625, "learning_rate": 1.6519174041297937e-05, "loss": 0.093, "step": 3330 }, { "epoch": 8.575096277278563, "grad_norm": 5429.2236328125, "learning_rate": 1.6224188790560475e-05, "loss": 0.0957, "step": 3340 }, { "epoch": 8.600770218228497, "grad_norm": 5034.25732421875, "learning_rate": 1.592920353982301e-05, "loss": 0.0997, "step": 3350 }, { "epoch": 8.626444159178433, "grad_norm": 6611.349609375, "learning_rate": 1.5634218289085548e-05, "loss": 0.0963, "step": 3360 }, { "epoch": 8.65211810012837, "grad_norm": 5671.7568359375, "learning_rate": 1.5339233038348082e-05, "loss": 0.1065, "step": 3370 }, { "epoch": 8.677792041078305, "grad_norm": 8826.3564453125, "learning_rate": 1.504424778761062e-05, "loss": 0.0972, "step": 3380 }, { "epoch": 8.703465982028241, "grad_norm": 5669.00439453125, "learning_rate": 1.4749262536873157e-05, "loss": 0.0995, "step": 3390 }, { "epoch": 8.729139922978177, "grad_norm": 7719.87353515625, "learning_rate": 1.4454277286135695e-05, "loss": 0.0898, "step": 3400 }, { "epoch": 8.754813863928113, "grad_norm": 5668.51953125, "learning_rate": 1.415929203539823e-05, "loss": 0.1028, "step": 3410 }, { "epoch": 8.78048780487805, "grad_norm": 5719.044921875, "learning_rate": 1.3864306784660768e-05, "loss": 0.094, "step": 3420 }, { "epoch": 8.806161745827985, "grad_norm": 6085.166015625, "learning_rate": 1.3569321533923305e-05, "loss": 0.0938, "step": 3430 }, { "epoch": 8.83183568677792, "grad_norm": 5559.7431640625, "learning_rate": 1.3274336283185843e-05, "loss": 0.0962, "step": 3440 }, { "epoch": 8.857509627727856, "grad_norm": 7504.08349609375, "learning_rate": 1.2979351032448378e-05, "loss": 0.0964, "step": 3450 }, { "epoch": 8.883183568677792, "grad_norm": 6102.48486328125, "learning_rate": 1.2684365781710916e-05, "loss": 0.0967, "step": 3460 }, { "epoch": 8.908857509627728, "grad_norm": 5322.08251953125, "learning_rate": 1.2389380530973452e-05, "loss": 0.0952, "step": 3470 }, { "epoch": 8.934531450577664, "grad_norm": 5769.94091796875, "learning_rate": 1.2094395280235989e-05, "loss": 0.0957, "step": 3480 }, { "epoch": 8.9602053915276, "grad_norm": 4727.4755859375, "learning_rate": 1.1799410029498525e-05, "loss": 0.0942, "step": 3490 }, { "epoch": 8.985879332477536, "grad_norm": 4858.51416015625, "learning_rate": 1.1504424778761062e-05, "loss": 0.0977, "step": 3500 }, { "epoch": 8.985879332477536, "eval_gen_len": 38.946, "eval_loss": 0.09172539412975311, "eval_rouge1": 0.6126, "eval_rouge2": 0.3339, "eval_rougeL": 0.5795, "eval_runtime": 65.0875, "eval_samples_per_second": 7.682, "eval_steps_per_second": 0.492, "step": 3500 } ], "logging_steps": 10, "max_steps": 3890, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.517722961707008e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }