{ "best_metric": 0.8659195109526235, "best_model_checkpoint": "output/fine_tuned/t5-base/MNLI/checkpoint-61360", "epoch": 5.0, "eval_steps": 500, "global_step": 61360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04074315514993481, "grad_norm": 6.468524932861328, "learning_rate": 4.9796284224250325e-05, "loss": 0.8428, "step": 500 }, { "epoch": 0.08148631029986962, "grad_norm": 4.806485652923584, "learning_rate": 4.9592568448500655e-05, "loss": 0.59, "step": 1000 }, { "epoch": 0.12222946544980444, "grad_norm": 5.188954830169678, "learning_rate": 4.938885267275098e-05, "loss": 0.5275, "step": 1500 }, { "epoch": 0.16297262059973924, "grad_norm": 3.7646172046661377, "learning_rate": 4.918513689700131e-05, "loss": 0.508, "step": 2000 }, { "epoch": 0.20371577574967406, "grad_norm": 5.518546104431152, "learning_rate": 4.898142112125163e-05, "loss": 0.4712, "step": 2500 }, { "epoch": 0.24445893089960888, "grad_norm": 4.250612258911133, "learning_rate": 4.877770534550195e-05, "loss": 0.4501, "step": 3000 }, { "epoch": 0.28520208604954367, "grad_norm": 5.0553812980651855, "learning_rate": 4.857398956975228e-05, "loss": 0.446, "step": 3500 }, { "epoch": 0.3259452411994785, "grad_norm": 3.5348777770996094, "learning_rate": 4.8370273794002606e-05, "loss": 0.429, "step": 4000 }, { "epoch": 0.3666883963494133, "grad_norm": 2.972283363342285, "learning_rate": 4.8166558018252936e-05, "loss": 0.4305, "step": 4500 }, { "epoch": 0.4074315514993481, "grad_norm": 2.9549307823181152, "learning_rate": 4.7962842242503265e-05, "loss": 0.4194, "step": 5000 }, { "epoch": 0.44817470664928294, "grad_norm": 3.8171303272247314, "learning_rate": 4.775912646675359e-05, "loss": 0.4122, "step": 5500 }, { "epoch": 0.48891786179921776, "grad_norm": 1.5783547163009644, "learning_rate": 4.755541069100392e-05, "loss": 0.4059, "step": 6000 }, { "epoch": 0.5296610169491526, "grad_norm": 3.6074378490448, "learning_rate": 4.735169491525424e-05, "loss": 0.4043, "step": 6500 }, { "epoch": 0.5704041720990873, "grad_norm": 2.160903215408325, "learning_rate": 4.7147979139504564e-05, "loss": 0.3909, "step": 7000 }, { "epoch": 0.6111473272490222, "grad_norm": 3.1185765266418457, "learning_rate": 4.694426336375489e-05, "loss": 0.3941, "step": 7500 }, { "epoch": 0.651890482398957, "grad_norm": 2.998476982116699, "learning_rate": 4.6740547588005216e-05, "loss": 0.3964, "step": 8000 }, { "epoch": 0.6926336375488917, "grad_norm": 5.319198131561279, "learning_rate": 4.6536831812255546e-05, "loss": 0.3886, "step": 8500 }, { "epoch": 0.7333767926988266, "grad_norm": 3.9432480335235596, "learning_rate": 4.633311603650587e-05, "loss": 0.3969, "step": 9000 }, { "epoch": 0.7741199478487614, "grad_norm": 5.180667400360107, "learning_rate": 4.612940026075619e-05, "loss": 0.3833, "step": 9500 }, { "epoch": 0.8148631029986962, "grad_norm": 3.2222518920898438, "learning_rate": 4.592568448500652e-05, "loss": 0.3714, "step": 10000 }, { "epoch": 0.855606258148631, "grad_norm": 3.999013900756836, "learning_rate": 4.5721968709256844e-05, "loss": 0.3853, "step": 10500 }, { "epoch": 0.8963494132985659, "grad_norm": 3.423118829727173, "learning_rate": 4.5518252933507174e-05, "loss": 0.3772, "step": 11000 }, { "epoch": 0.9370925684485006, "grad_norm": 3.744582176208496, "learning_rate": 4.53145371577575e-05, "loss": 0.3753, "step": 11500 }, { "epoch": 0.9778357235984355, "grad_norm": 4.6033806800842285, "learning_rate": 4.511082138200782e-05, "loss": 0.3694, "step": 12000 }, { "epoch": 1.0, "eval_accuracy": 0.8565461029037188, "eval_loss": 0.38699737191200256, "eval_runtime": 64.7816, "eval_samples_per_second": 151.509, "eval_steps_per_second": 18.941, "step": 12272 }, { "epoch": 1.0185788787483703, "grad_norm": 4.516230583190918, "learning_rate": 4.490710560625815e-05, "loss": 0.3444, "step": 12500 }, { "epoch": 1.0593220338983051, "grad_norm": 4.428933620452881, "learning_rate": 4.470338983050847e-05, "loss": 0.3135, "step": 13000 }, { "epoch": 1.1000651890482398, "grad_norm": 4.436412334442139, "learning_rate": 4.44996740547588e-05, "loss": 0.3225, "step": 13500 }, { "epoch": 1.1408083441981747, "grad_norm": 4.193423271179199, "learning_rate": 4.429595827900913e-05, "loss": 0.3146, "step": 14000 }, { "epoch": 1.1815514993481095, "grad_norm": 4.420684814453125, "learning_rate": 4.4092242503259455e-05, "loss": 0.3277, "step": 14500 }, { "epoch": 1.2222946544980444, "grad_norm": 2.8461813926696777, "learning_rate": 4.3888526727509784e-05, "loss": 0.3134, "step": 15000 }, { "epoch": 1.263037809647979, "grad_norm": 2.6924009323120117, "learning_rate": 4.368481095176011e-05, "loss": 0.3185, "step": 15500 }, { "epoch": 1.303780964797914, "grad_norm": 3.6353909969329834, "learning_rate": 4.348109517601043e-05, "loss": 0.3166, "step": 16000 }, { "epoch": 1.3445241199478488, "grad_norm": 4.928586959838867, "learning_rate": 4.327737940026076e-05, "loss": 0.3165, "step": 16500 }, { "epoch": 1.3852672750977835, "grad_norm": 5.045453071594238, "learning_rate": 4.307366362451108e-05, "loss": 0.3196, "step": 17000 }, { "epoch": 1.4260104302477183, "grad_norm": 2.8752036094665527, "learning_rate": 4.286994784876141e-05, "loss": 0.3232, "step": 17500 }, { "epoch": 1.4667535853976532, "grad_norm": 2.486584424972534, "learning_rate": 4.2666232073011735e-05, "loss": 0.3153, "step": 18000 }, { "epoch": 1.5074967405475879, "grad_norm": 3.719909429550171, "learning_rate": 4.2462516297262065e-05, "loss": 0.3149, "step": 18500 }, { "epoch": 1.548239895697523, "grad_norm": 2.8027002811431885, "learning_rate": 4.225880052151239e-05, "loss": 0.3176, "step": 19000 }, { "epoch": 1.5889830508474576, "grad_norm": 5.058602333068848, "learning_rate": 4.205508474576271e-05, "loss": 0.3148, "step": 19500 }, { "epoch": 1.6297262059973925, "grad_norm": 3.070186138153076, "learning_rate": 4.185136897001304e-05, "loss": 0.3154, "step": 20000 }, { "epoch": 1.6704693611473274, "grad_norm": 3.288327932357788, "learning_rate": 4.164765319426336e-05, "loss": 0.3175, "step": 20500 }, { "epoch": 1.711212516297262, "grad_norm": 3.934622049331665, "learning_rate": 4.144393741851369e-05, "loss": 0.3127, "step": 21000 }, { "epoch": 1.7519556714471969, "grad_norm": 4.4335737228393555, "learning_rate": 4.1240221642764016e-05, "loss": 0.312, "step": 21500 }, { "epoch": 1.7926988265971318, "grad_norm": 2.0218756198883057, "learning_rate": 4.103650586701434e-05, "loss": 0.31, "step": 22000 }, { "epoch": 1.8334419817470664, "grad_norm": 3.1556270122528076, "learning_rate": 4.0832790091264675e-05, "loss": 0.3203, "step": 22500 }, { "epoch": 1.8741851368970013, "grad_norm": 4.388118267059326, "learning_rate": 4.0629074315515e-05, "loss": 0.3099, "step": 23000 }, { "epoch": 1.9149282920469362, "grad_norm": 1.511881709098816, "learning_rate": 4.042535853976532e-05, "loss": 0.3148, "step": 23500 }, { "epoch": 1.9556714471968708, "grad_norm": 1.853653073310852, "learning_rate": 4.022164276401565e-05, "loss": 0.3095, "step": 24000 }, { "epoch": 1.996414602346806, "grad_norm": 1.6253551244735718, "learning_rate": 4.0017926988265974e-05, "loss": 0.303, "step": 24500 }, { "epoch": 2.0, "eval_accuracy": 0.8651044319918492, "eval_loss": 0.37888792157173157, "eval_runtime": 27.9558, "eval_samples_per_second": 351.09, "eval_steps_per_second": 43.891, "step": 24544 }, { "epoch": 2.0371577574967406, "grad_norm": 3.7835922241210938, "learning_rate": 3.98142112125163e-05, "loss": 0.2552, "step": 25000 }, { "epoch": 2.077900912646675, "grad_norm": 3.147528886795044, "learning_rate": 3.9610495436766626e-05, "loss": 0.2401, "step": 25500 }, { "epoch": 2.1186440677966103, "grad_norm": 4.904528617858887, "learning_rate": 3.940677966101695e-05, "loss": 0.253, "step": 26000 }, { "epoch": 2.159387222946545, "grad_norm": 5.313526630401611, "learning_rate": 3.920306388526728e-05, "loss": 0.256, "step": 26500 }, { "epoch": 2.2001303780964796, "grad_norm": 1.7792415618896484, "learning_rate": 3.89993481095176e-05, "loss": 0.2486, "step": 27000 }, { "epoch": 2.2408735332464147, "grad_norm": 5.110089302062988, "learning_rate": 3.879563233376793e-05, "loss": 0.2411, "step": 27500 }, { "epoch": 2.2816166883963493, "grad_norm": 2.6529006958007812, "learning_rate": 3.8591916558018254e-05, "loss": 0.2588, "step": 28000 }, { "epoch": 2.322359843546284, "grad_norm": 5.450090408325195, "learning_rate": 3.838820078226858e-05, "loss": 0.2424, "step": 28500 }, { "epoch": 2.363102998696219, "grad_norm": 3.4213919639587402, "learning_rate": 3.818448500651891e-05, "loss": 0.2546, "step": 29000 }, { "epoch": 2.4038461538461537, "grad_norm": 4.965076923370361, "learning_rate": 3.798076923076923e-05, "loss": 0.2519, "step": 29500 }, { "epoch": 2.444589308996089, "grad_norm": 3.764460802078247, "learning_rate": 3.777705345501956e-05, "loss": 0.2529, "step": 30000 }, { "epoch": 2.4853324641460235, "grad_norm": 3.033212184906006, "learning_rate": 3.757333767926988e-05, "loss": 0.2559, "step": 30500 }, { "epoch": 2.526075619295958, "grad_norm": 2.535740613937378, "learning_rate": 3.7369621903520205e-05, "loss": 0.2587, "step": 31000 }, { "epoch": 2.5668187744458932, "grad_norm": 3.642535448074341, "learning_rate": 3.716590612777054e-05, "loss": 0.2509, "step": 31500 }, { "epoch": 2.607561929595828, "grad_norm": 5.233695030212402, "learning_rate": 3.6962190352020865e-05, "loss": 0.2471, "step": 32000 }, { "epoch": 2.648305084745763, "grad_norm": 4.331786155700684, "learning_rate": 3.675847457627119e-05, "loss": 0.2569, "step": 32500 }, { "epoch": 2.6890482398956976, "grad_norm": 3.5510385036468506, "learning_rate": 3.655475880052152e-05, "loss": 0.2518, "step": 33000 }, { "epoch": 2.7297913950456323, "grad_norm": 4.476876258850098, "learning_rate": 3.635104302477184e-05, "loss": 0.2596, "step": 33500 }, { "epoch": 2.770534550195567, "grad_norm": 1.7541571855545044, "learning_rate": 3.614732724902217e-05, "loss": 0.2674, "step": 34000 }, { "epoch": 2.811277705345502, "grad_norm": 5.406097412109375, "learning_rate": 3.594361147327249e-05, "loss": 0.2611, "step": 34500 }, { "epoch": 2.8520208604954367, "grad_norm": 3.6985819339752197, "learning_rate": 3.5739895697522816e-05, "loss": 0.2485, "step": 35000 }, { "epoch": 2.8927640156453718, "grad_norm": 5.607602596282959, "learning_rate": 3.5536179921773145e-05, "loss": 0.2529, "step": 35500 }, { "epoch": 2.9335071707953064, "grad_norm": 2.735818862915039, "learning_rate": 3.533246414602347e-05, "loss": 0.2545, "step": 36000 }, { "epoch": 2.974250325945241, "grad_norm": 4.402085781097412, "learning_rate": 3.51287483702738e-05, "loss": 0.2549, "step": 36500 }, { "epoch": 3.0, "eval_accuracy": 0.8649006622516556, "eval_loss": 0.4213252663612366, "eval_runtime": 27.4712, "eval_samples_per_second": 357.283, "eval_steps_per_second": 44.665, "step": 36816 }, { "epoch": 3.014993481095176, "grad_norm": 2.6076323986053467, "learning_rate": 3.492503259452412e-05, "loss": 0.2432, "step": 37000 }, { "epoch": 3.055736636245111, "grad_norm": 5.72471284866333, "learning_rate": 3.4721316818774444e-05, "loss": 0.2004, "step": 37500 }, { "epoch": 3.0964797913950455, "grad_norm": 2.282419443130493, "learning_rate": 3.451760104302477e-05, "loss": 0.1961, "step": 38000 }, { "epoch": 3.1372229465449806, "grad_norm": 6.373399257659912, "learning_rate": 3.4313885267275096e-05, "loss": 0.2047, "step": 38500 }, { "epoch": 3.1779661016949152, "grad_norm": 3.3030478954315186, "learning_rate": 3.4110169491525426e-05, "loss": 0.2051, "step": 39000 }, { "epoch": 3.21870925684485, "grad_norm": 3.459233283996582, "learning_rate": 3.390645371577575e-05, "loss": 0.2026, "step": 39500 }, { "epoch": 3.259452411994785, "grad_norm": 6.500429153442383, "learning_rate": 3.370273794002607e-05, "loss": 0.1989, "step": 40000 }, { "epoch": 3.3001955671447196, "grad_norm": 2.9848804473876953, "learning_rate": 3.349902216427641e-05, "loss": 0.2092, "step": 40500 }, { "epoch": 3.3409387222946547, "grad_norm": 6.0977091789245605, "learning_rate": 3.329530638852673e-05, "loss": 0.2062, "step": 41000 }, { "epoch": 3.3816818774445894, "grad_norm": 6.147849082946777, "learning_rate": 3.3091590612777054e-05, "loss": 0.2065, "step": 41500 }, { "epoch": 3.422425032594524, "grad_norm": 4.928431034088135, "learning_rate": 3.2887874837027384e-05, "loss": 0.2044, "step": 42000 }, { "epoch": 3.463168187744459, "grad_norm": 5.093155860900879, "learning_rate": 3.2684159061277707e-05, "loss": 0.2134, "step": 42500 }, { "epoch": 3.5039113428943938, "grad_norm": 3.0888853073120117, "learning_rate": 3.2480443285528036e-05, "loss": 0.2114, "step": 43000 }, { "epoch": 3.5446544980443284, "grad_norm": 4.273815155029297, "learning_rate": 3.227672750977836e-05, "loss": 0.2122, "step": 43500 }, { "epoch": 3.5853976531942635, "grad_norm": 5.846683025360107, "learning_rate": 3.207301173402868e-05, "loss": 0.2053, "step": 44000 }, { "epoch": 3.626140808344198, "grad_norm": 6.744229316711426, "learning_rate": 3.186929595827901e-05, "loss": 0.2141, "step": 44500 }, { "epoch": 3.666883963494133, "grad_norm": 6.070234775543213, "learning_rate": 3.1665580182529335e-05, "loss": 0.2108, "step": 45000 }, { "epoch": 3.707627118644068, "grad_norm": 4.892887115478516, "learning_rate": 3.1461864406779664e-05, "loss": 0.2067, "step": 45500 }, { "epoch": 3.7483702737940026, "grad_norm": 2.861110210418701, "learning_rate": 3.125814863102999e-05, "loss": 0.2022, "step": 46000 }, { "epoch": 3.7891134289439377, "grad_norm": 5.521021842956543, "learning_rate": 3.105443285528031e-05, "loss": 0.2119, "step": 46500 }, { "epoch": 3.8298565840938723, "grad_norm": 4.308606147766113, "learning_rate": 3.085071707953064e-05, "loss": 0.2101, "step": 47000 }, { "epoch": 3.870599739243807, "grad_norm": 5.6051025390625, "learning_rate": 3.064700130378096e-05, "loss": 0.2179, "step": 47500 }, { "epoch": 3.9113428943937416, "grad_norm": 2.6445212364196777, "learning_rate": 3.044328552803129e-05, "loss": 0.2174, "step": 48000 }, { "epoch": 3.9520860495436767, "grad_norm": 2.7666218280792236, "learning_rate": 3.0239569752281615e-05, "loss": 0.2021, "step": 48500 }, { "epoch": 3.9928292046936114, "grad_norm": 4.770074844360352, "learning_rate": 3.003585397653194e-05, "loss": 0.2118, "step": 49000 }, { "epoch": 4.0, "eval_accuracy": 0.86571574121243, "eval_loss": 0.44608649611473083, "eval_runtime": 27.5189, "eval_samples_per_second": 356.663, "eval_steps_per_second": 44.587, "step": 49088 }, { "epoch": 4.0335723598435465, "grad_norm": 1.6207386255264282, "learning_rate": 2.983213820078227e-05, "loss": 0.1697, "step": 49500 }, { "epoch": 4.074315514993481, "grad_norm": 3.2992472648620605, "learning_rate": 2.9628422425032598e-05, "loss": 0.1552, "step": 50000 }, { "epoch": 4.115058670143416, "grad_norm": 5.406311511993408, "learning_rate": 2.9424706649282924e-05, "loss": 0.1705, "step": 50500 }, { "epoch": 4.15580182529335, "grad_norm": 3.330251455307007, "learning_rate": 2.922099087353325e-05, "loss": 0.1669, "step": 51000 }, { "epoch": 4.196544980443286, "grad_norm": 9.146599769592285, "learning_rate": 2.9017275097783576e-05, "loss": 0.1614, "step": 51500 }, { "epoch": 4.237288135593221, "grad_norm": 3.782498598098755, "learning_rate": 2.88135593220339e-05, "loss": 0.167, "step": 52000 }, { "epoch": 4.278031290743155, "grad_norm": 3.704604387283325, "learning_rate": 2.8609843546284226e-05, "loss": 0.171, "step": 52500 }, { "epoch": 4.31877444589309, "grad_norm": 4.229913234710693, "learning_rate": 2.8406127770534552e-05, "loss": 0.1752, "step": 53000 }, { "epoch": 4.3595176010430245, "grad_norm": 5.513256549835205, "learning_rate": 2.8202411994784878e-05, "loss": 0.1688, "step": 53500 }, { "epoch": 4.400260756192959, "grad_norm": 3.369919776916504, "learning_rate": 2.7998696219035204e-05, "loss": 0.17, "step": 54000 }, { "epoch": 4.441003911342895, "grad_norm": 6.391170978546143, "learning_rate": 2.7794980443285527e-05, "loss": 0.1663, "step": 54500 }, { "epoch": 4.481747066492829, "grad_norm": 3.2502150535583496, "learning_rate": 2.7591264667535854e-05, "loss": 0.1729, "step": 55000 }, { "epoch": 4.522490221642764, "grad_norm": 2.424835443496704, "learning_rate": 2.738754889178618e-05, "loss": 0.1733, "step": 55500 }, { "epoch": 4.563233376792699, "grad_norm": 2.813887119293213, "learning_rate": 2.7183833116036506e-05, "loss": 0.1653, "step": 56000 }, { "epoch": 4.603976531942633, "grad_norm": 9.048973083496094, "learning_rate": 2.6980117340286833e-05, "loss": 0.1755, "step": 56500 }, { "epoch": 4.644719687092568, "grad_norm": 4.4565558433532715, "learning_rate": 2.6776401564537155e-05, "loss": 0.1717, "step": 57000 }, { "epoch": 4.6854628422425035, "grad_norm": 1.3312770128250122, "learning_rate": 2.6572685788787482e-05, "loss": 0.1664, "step": 57500 }, { "epoch": 4.726205997392438, "grad_norm": 7.608250617980957, "learning_rate": 2.6368970013037815e-05, "loss": 0.1687, "step": 58000 }, { "epoch": 4.766949152542373, "grad_norm": 4.287994861602783, "learning_rate": 2.6165254237288138e-05, "loss": 0.1757, "step": 58500 }, { "epoch": 4.8076923076923075, "grad_norm": 3.2377748489379883, "learning_rate": 2.5961538461538464e-05, "loss": 0.1746, "step": 59000 }, { "epoch": 4.848435462842242, "grad_norm": 6.194840431213379, "learning_rate": 2.575782268578879e-05, "loss": 0.1655, "step": 59500 }, { "epoch": 4.889178617992178, "grad_norm": 3.546142339706421, "learning_rate": 2.5554106910039117e-05, "loss": 0.1799, "step": 60000 }, { "epoch": 4.929921773142112, "grad_norm": 7.121952533721924, "learning_rate": 2.5350391134289443e-05, "loss": 0.1683, "step": 60500 }, { "epoch": 4.970664928292047, "grad_norm": 4.6492838859558105, "learning_rate": 2.5146675358539766e-05, "loss": 0.1733, "step": 61000 }, { "epoch": 5.0, "eval_accuracy": 0.8659195109526235, "eval_loss": 0.46997779607772827, "eval_runtime": 27.6271, "eval_samples_per_second": 355.267, "eval_steps_per_second": 44.413, "step": 61360 }, { "epoch": 5.0, "step": 61360, "total_flos": 2.998178041960627e+17, "train_loss": 0.0, "train_runtime": 0.2558, "train_samples_per_second": 7675471.049, "train_steps_per_second": 239859.692 } ], "logging_steps": 500, "max_steps": 61360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.998178041960627e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }