{ "best_metric": 2.2994935512542725, "best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/longt5_xl_sfd_20/checkpoint-28", "epoch": 19.47826086956522, "eval_steps": 500, "global_step": 280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14, "grad_norm": 8.068708419799805, "learning_rate": 0.001, "loss": 3.274, "step": 2 }, { "epoch": 0.28, "grad_norm": 1.4994572401046753, "learning_rate": 0.001, "loss": 3.2963, "step": 4 }, { "epoch": 0.42, "grad_norm": 1.0570803880691528, "learning_rate": 0.001, "loss": 3.3164, "step": 6 }, { "epoch": 0.56, "grad_norm": 1.2446849346160889, "learning_rate": 0.001, "loss": 3.0866, "step": 8 }, { "epoch": 0.7, "grad_norm": 0.721084713935852, "learning_rate": 0.001, "loss": 2.8976, "step": 10 }, { "epoch": 0.83, "grad_norm": 1.2132383584976196, "learning_rate": 0.001, "loss": 2.8298, "step": 12 }, { "epoch": 0.97, "grad_norm": 0.4689762592315674, "learning_rate": 0.001, "loss": 2.9377, "step": 14 }, { "epoch": 0.97, "eval_loss": 2.7965147495269775, "eval_runtime": 81.4763, "eval_samples_per_second": 4.148, "eval_steps_per_second": 0.528, "step": 14 }, { "epoch": 1.11, "grad_norm": 0.42892181873321533, "learning_rate": 0.001, "loss": 2.741, "step": 16 }, { "epoch": 1.25, "grad_norm": 0.4487678110599518, "learning_rate": 0.001, "loss": 2.4441, "step": 18 }, { "epoch": 1.39, "grad_norm": 0.4653552770614624, "learning_rate": 0.001, "loss": 2.432, "step": 20 }, { "epoch": 1.53, "grad_norm": 0.35275548696517944, "learning_rate": 0.001, "loss": 2.4016, "step": 22 }, { "epoch": 1.67, "grad_norm": 0.43277695775032043, "learning_rate": 0.001, "loss": 2.391, "step": 24 }, { "epoch": 1.81, "grad_norm": 0.3408297300338745, "learning_rate": 0.001, "loss": 2.3911, "step": 26 }, { "epoch": 1.95, "grad_norm": 0.3205319344997406, "learning_rate": 0.001, "loss": 2.3247, "step": 28 }, { "epoch": 1.95, "eval_loss": 2.2994935512542725, "eval_runtime": 81.4693, "eval_samples_per_second": 4.149, "eval_steps_per_second": 0.528, "step": 28 }, { "epoch": 2.09, "grad_norm": 0.4033512771129608, "learning_rate": 0.001, "loss": 2.0701, "step": 30 }, { "epoch": 2.23, "grad_norm": 0.36825311183929443, "learning_rate": 0.001, "loss": 2.0968, "step": 32 }, { "epoch": 2.37, "grad_norm": 0.5080482363700867, "learning_rate": 0.001, "loss": 2.0681, "step": 34 }, { "epoch": 2.5, "grad_norm": 0.4196927845478058, "learning_rate": 0.001, "loss": 2.0914, "step": 36 }, { "epoch": 2.64, "grad_norm": 0.3230506479740143, "learning_rate": 0.001, "loss": 2.0317, "step": 38 }, { "epoch": 2.78, "grad_norm": 0.2733004689216614, "learning_rate": 0.001, "loss": 1.9723, "step": 40 }, { "epoch": 2.92, "grad_norm": 0.2709517776966095, "learning_rate": 0.001, "loss": 1.9943, "step": 42 }, { "epoch": 2.99, "eval_loss": 2.3308048248291016, "eval_runtime": 81.5083, "eval_samples_per_second": 4.147, "eval_steps_per_second": 0.528, "step": 43 }, { "epoch": 3.06, "grad_norm": 0.3230663537979126, "learning_rate": 0.001, "loss": 1.9093, "step": 44 }, { "epoch": 3.2, "grad_norm": 0.3976946175098419, "learning_rate": 0.001, "loss": 1.7682, "step": 46 }, { "epoch": 3.34, "grad_norm": 0.42008209228515625, "learning_rate": 0.001, "loss": 1.7119, "step": 48 }, { "epoch": 3.48, "grad_norm": 0.31828513741493225, "learning_rate": 0.001, "loss": 1.7283, "step": 50 }, { "epoch": 3.62, "grad_norm": 0.2448839396238327, "learning_rate": 0.001, "loss": 1.6905, "step": 52 }, { "epoch": 3.76, "grad_norm": 0.25552132725715637, "learning_rate": 0.001, "loss": 1.6645, "step": 54 }, { "epoch": 3.9, "grad_norm": 15.679224014282227, "learning_rate": 0.001, "loss": 1.7056, "step": 56 }, { "epoch": 3.97, "eval_loss": 2.3368992805480957, "eval_runtime": 81.4742, "eval_samples_per_second": 4.149, "eval_steps_per_second": 0.528, "step": 57 }, { "epoch": 4.03, "grad_norm": 0.29547178745269775, "learning_rate": 0.001, "loss": 1.564, "step": 58 }, { "epoch": 4.17, "grad_norm": 0.31610924005508423, "learning_rate": 0.001, "loss": 1.3607, "step": 60 }, { "epoch": 4.31, "grad_norm": 0.32351407408714294, "learning_rate": 0.001, "loss": 1.4158, "step": 62 }, { "epoch": 4.45, "grad_norm": 0.5101042985916138, "learning_rate": 0.001, "loss": 1.4694, "step": 64 }, { "epoch": 4.59, "grad_norm": 0.41575145721435547, "learning_rate": 0.001, "loss": 1.4755, "step": 66 }, { "epoch": 4.73, "grad_norm": 0.3269899785518646, "learning_rate": 0.001, "loss": 1.4268, "step": 68 }, { "epoch": 4.87, "grad_norm": 0.4077276587486267, "learning_rate": 0.001, "loss": 1.4471, "step": 70 }, { "epoch": 4.94, "eval_loss": 2.553175926208496, "eval_runtime": 81.5149, "eval_samples_per_second": 4.146, "eval_steps_per_second": 0.528, "step": 71 }, { "epoch": 5.01, "grad_norm": 0.37493908405303955, "learning_rate": 0.001, "loss": 1.4436, "step": 72 }, { "epoch": 5.15, "grad_norm": 0.8398223519325256, "learning_rate": 0.001, "loss": 1.1776, "step": 74 }, { "epoch": 5.29, "grad_norm": 0.621316134929657, "learning_rate": 0.001, "loss": 1.192, "step": 76 }, { "epoch": 5.43, "grad_norm": 0.5988876819610596, "learning_rate": 0.001, "loss": 1.1561, "step": 78 }, { "epoch": 5.57, "grad_norm": 0.561390221118927, "learning_rate": 0.001, "loss": 1.2129, "step": 80 }, { "epoch": 5.7, "grad_norm": 0.32573097944259644, "learning_rate": 0.001, "loss": 1.19, "step": 82 }, { "epoch": 5.84, "grad_norm": 0.3272527754306793, "learning_rate": 0.001, "loss": 1.1933, "step": 84 }, { "epoch": 5.98, "grad_norm": 0.36107558012008667, "learning_rate": 0.001, "loss": 1.1932, "step": 86 }, { "epoch": 5.98, "eval_loss": 2.696089744567871, "eval_runtime": 81.5294, "eval_samples_per_second": 4.146, "eval_steps_per_second": 0.527, "step": 86 }, { "epoch": 6.12, "grad_norm": 0.4167131781578064, "learning_rate": 0.001, "loss": 0.9285, "step": 88 }, { "epoch": 6.26, "grad_norm": 0.38736867904663086, "learning_rate": 0.001, "loss": 0.9568, "step": 90 }, { "epoch": 6.4, "grad_norm": 0.3212537169456482, "learning_rate": 0.001, "loss": 0.9538, "step": 92 }, { "epoch": 6.54, "grad_norm": 0.2966512143611908, "learning_rate": 0.001, "loss": 0.9133, "step": 94 }, { "epoch": 6.68, "grad_norm": 0.3149372935295105, "learning_rate": 0.001, "loss": 0.9374, "step": 96 }, { "epoch": 6.82, "grad_norm": 0.3140605092048645, "learning_rate": 0.001, "loss": 0.9585, "step": 98 }, { "epoch": 6.96, "grad_norm": 0.33559679985046387, "learning_rate": 0.001, "loss": 0.9199, "step": 100 }, { "epoch": 6.96, "eval_loss": 2.645321846008301, "eval_runtime": 81.5044, "eval_samples_per_second": 4.147, "eval_steps_per_second": 0.528, "step": 100 }, { "epoch": 7.1, "grad_norm": 0.3616858720779419, "learning_rate": 0.001, "loss": 0.7517, "step": 102 }, { "epoch": 7.23, "grad_norm": 0.4970415234565735, "learning_rate": 0.001, "loss": 0.7378, "step": 104 }, { "epoch": 7.37, "grad_norm": 0.6654688119888306, "learning_rate": 0.001, "loss": 0.7864, "step": 106 }, { "epoch": 7.51, "grad_norm": 0.51229327917099, "learning_rate": 0.001, "loss": 0.762, "step": 108 }, { "epoch": 7.65, "grad_norm": 0.4524416923522949, "learning_rate": 0.001, "loss": 0.7342, "step": 110 }, { "epoch": 7.79, "grad_norm": 0.48206427693367004, "learning_rate": 0.001, "loss": 0.7706, "step": 112 }, { "epoch": 7.93, "grad_norm": 0.4534417688846588, "learning_rate": 0.001, "loss": 0.7571, "step": 114 }, { "epoch": 8.0, "eval_loss": 3.0977730751037598, "eval_runtime": 81.5778, "eval_samples_per_second": 4.143, "eval_steps_per_second": 0.527, "step": 115 }, { "epoch": 8.07, "grad_norm": 0.306815505027771, "learning_rate": 0.001, "loss": 0.6809, "step": 116 }, { "epoch": 8.21, "grad_norm": 0.34183812141418457, "learning_rate": 0.001, "loss": 0.5853, "step": 118 }, { "epoch": 8.35, "grad_norm": 0.3781261444091797, "learning_rate": 0.001, "loss": 0.5819, "step": 120 }, { "epoch": 8.49, "grad_norm": 0.36344149708747864, "learning_rate": 0.001, "loss": 0.6059, "step": 122 }, { "epoch": 8.63, "grad_norm": 0.38990476727485657, "learning_rate": 0.001, "loss": 0.5929, "step": 124 }, { "epoch": 8.77, "grad_norm": 0.34000781178474426, "learning_rate": 0.001, "loss": 0.5887, "step": 126 }, { "epoch": 8.9, "grad_norm": 0.32895970344543457, "learning_rate": 0.001, "loss": 0.6287, "step": 128 }, { "epoch": 8.97, "eval_loss": 3.145782709121704, "eval_runtime": 81.5735, "eval_samples_per_second": 4.144, "eval_steps_per_second": 0.527, "step": 129 }, { "epoch": 9.04, "grad_norm": 0.36275872588157654, "learning_rate": 0.001, "loss": 0.5983, "step": 130 }, { "epoch": 9.18, "grad_norm": 0.3596336245536804, "learning_rate": 0.001, "loss": 0.4615, "step": 132 }, { "epoch": 9.32, "grad_norm": 0.37557095289230347, "learning_rate": 0.001, "loss": 0.4756, "step": 134 }, { "epoch": 9.46, "grad_norm": 0.39249515533447266, "learning_rate": 0.001, "loss": 0.4546, "step": 136 }, { "epoch": 9.6, "grad_norm": 0.3760348856449127, "learning_rate": 0.001, "loss": 0.4792, "step": 138 }, { "epoch": 9.74, "grad_norm": 0.3137217164039612, "learning_rate": 0.001, "loss": 0.4674, "step": 140 }, { "epoch": 9.88, "grad_norm": 0.40549594163894653, "learning_rate": 0.001, "loss": 0.4939, "step": 142 }, { "epoch": 9.95, "eval_loss": 3.5685999393463135, "eval_runtime": 81.5958, "eval_samples_per_second": 4.142, "eval_steps_per_second": 0.527, "step": 143 }, { "epoch": 10.02, "grad_norm": 0.4173819422721863, "learning_rate": 0.001, "loss": 0.5055, "step": 144 }, { "epoch": 10.16, "grad_norm": 0.280066579580307, "learning_rate": 0.001, "loss": 0.3353, "step": 146 }, { "epoch": 10.3, "grad_norm": 0.30166783928871155, "learning_rate": 0.001, "loss": 0.351, "step": 148 }, { "epoch": 10.43, "grad_norm": 0.28606531023979187, "learning_rate": 0.001, "loss": 0.3834, "step": 150 }, { "epoch": 10.57, "grad_norm": 0.2835221588611603, "learning_rate": 0.001, "loss": 0.3718, "step": 152 }, { "epoch": 10.71, "grad_norm": 0.3148328959941864, "learning_rate": 0.001, "loss": 0.3692, "step": 154 }, { "epoch": 10.85, "grad_norm": 0.3502219021320343, "learning_rate": 0.001, "loss": 0.38, "step": 156 }, { "epoch": 10.99, "grad_norm": 0.3344653844833374, "learning_rate": 0.001, "loss": 0.376, "step": 158 }, { "epoch": 10.99, "eval_loss": 3.425977945327759, "eval_runtime": 81.532, "eval_samples_per_second": 4.146, "eval_steps_per_second": 0.527, "step": 158 }, { "epoch": 11.13, "grad_norm": 0.32332998514175415, "learning_rate": 0.001, "loss": 0.2827, "step": 160 }, { "epoch": 11.27, "grad_norm": 0.35432103276252747, "learning_rate": 0.001, "loss": 0.2966, "step": 162 }, { "epoch": 11.41, "grad_norm": 0.29032111167907715, "learning_rate": 0.001, "loss": 0.2954, "step": 164 }, { "epoch": 11.55, "grad_norm": 0.3170696198940277, "learning_rate": 0.001, "loss": 0.2738, "step": 166 }, { "epoch": 11.69, "grad_norm": 0.3339516520500183, "learning_rate": 0.001, "loss": 0.2786, "step": 168 }, { "epoch": 11.83, "grad_norm": 0.3187398910522461, "learning_rate": 0.001, "loss": 0.315, "step": 170 }, { "epoch": 11.97, "grad_norm": 0.2842791974544525, "learning_rate": 0.001, "loss": 0.313, "step": 172 }, { "epoch": 11.97, "eval_loss": 3.9301607608795166, "eval_runtime": 81.5908, "eval_samples_per_second": 4.143, "eval_steps_per_second": 0.527, "step": 172 }, { "epoch": 12.1, "grad_norm": 0.2522130012512207, "learning_rate": 0.001, "loss": 0.2504, "step": 174 }, { "epoch": 12.24, "grad_norm": 0.23560765385627747, "learning_rate": 0.001, "loss": 0.212, "step": 176 }, { "epoch": 12.38, "grad_norm": 0.24140460789203644, "learning_rate": 0.001, "loss": 0.2156, "step": 178 }, { "epoch": 12.52, "grad_norm": 0.2790488302707672, "learning_rate": 0.001, "loss": 0.2474, "step": 180 }, { "epoch": 12.66, "grad_norm": 0.2879179120063782, "learning_rate": 0.001, "loss": 0.2486, "step": 182 }, { "epoch": 12.8, "grad_norm": 0.3126004934310913, "learning_rate": 0.001, "loss": 0.2499, "step": 184 }, { "epoch": 12.94, "grad_norm": 0.3011338412761688, "learning_rate": 0.001, "loss": 0.2562, "step": 186 }, { "epoch": 12.94, "eval_loss": 3.743312120437622, "eval_runtime": 81.5885, "eval_samples_per_second": 4.143, "eval_steps_per_second": 0.527, "step": 186 }, { "epoch": 13.08, "grad_norm": 0.24417123198509216, "learning_rate": 0.001, "loss": 0.2166, "step": 188 }, { "epoch": 13.22, "grad_norm": 0.21955759823322296, "learning_rate": 0.001, "loss": 0.1767, "step": 190 }, { "epoch": 13.36, "grad_norm": 0.20537225902080536, "learning_rate": 0.001, "loss": 0.1715, "step": 192 }, { "epoch": 13.5, "grad_norm": 0.21406413614749908, "learning_rate": 0.001, "loss": 0.1857, "step": 194 }, { "epoch": 13.63, "grad_norm": 0.21677067875862122, "learning_rate": 0.001, "loss": 0.1881, "step": 196 }, { "epoch": 13.77, "grad_norm": 0.2592070996761322, "learning_rate": 0.001, "loss": 0.2022, "step": 198 }, { "epoch": 13.91, "grad_norm": 0.23913638293743134, "learning_rate": 0.001, "loss": 0.2051, "step": 200 }, { "epoch": 13.98, "eval_loss": 3.911346197128296, "eval_runtime": 81.5425, "eval_samples_per_second": 4.145, "eval_steps_per_second": 0.527, "step": 201 }, { "epoch": 14.05, "grad_norm": 0.19888806343078613, "learning_rate": 0.001, "loss": 0.1774, "step": 202 }, { "epoch": 14.19, "grad_norm": 0.17841410636901855, "learning_rate": 0.001, "loss": 0.1409, "step": 204 }, { "epoch": 14.33, "grad_norm": 0.22502601146697998, "learning_rate": 0.001, "loss": 0.1432, "step": 206 }, { "epoch": 14.47, "grad_norm": 0.21947847306728363, "learning_rate": 0.001, "loss": 0.1487, "step": 208 }, { "epoch": 14.61, "grad_norm": 0.20319664478302002, "learning_rate": 0.001, "loss": 0.1753, "step": 210 }, { "epoch": 14.75, "grad_norm": 0.20484566688537598, "learning_rate": 0.001, "loss": 0.1627, "step": 212 }, { "epoch": 14.89, "grad_norm": 0.24411869049072266, "learning_rate": 0.001, "loss": 0.1802, "step": 214 }, { "epoch": 14.96, "eval_loss": 4.0449538230896, "eval_runtime": 81.5583, "eval_samples_per_second": 4.144, "eval_steps_per_second": 0.527, "step": 215 }, { "epoch": 15.03, "grad_norm": 0.23610645532608032, "learning_rate": 0.001, "loss": 0.1881, "step": 216 }, { "epoch": 15.17, "grad_norm": 0.17829175293445587, "learning_rate": 0.001, "loss": 0.123, "step": 218 }, { "epoch": 15.3, "grad_norm": 0.178519606590271, "learning_rate": 0.001, "loss": 0.1166, "step": 220 }, { "epoch": 15.44, "grad_norm": 0.19595706462860107, "learning_rate": 0.001, "loss": 0.135, "step": 222 }, { "epoch": 15.58, "grad_norm": 0.20790521800518036, "learning_rate": 0.001, "loss": 0.1494, "step": 224 }, { "epoch": 15.72, "grad_norm": 0.1832074671983719, "learning_rate": 0.001, "loss": 0.1488, "step": 226 }, { "epoch": 15.86, "grad_norm": 0.17795896530151367, "learning_rate": 0.001, "loss": 0.1448, "step": 228 }, { "epoch": 16.0, "grad_norm": 0.20039702951908112, "learning_rate": 0.001, "loss": 0.1378, "step": 230 }, { "epoch": 16.0, "eval_loss": 3.939739227294922, "eval_runtime": 81.6032, "eval_samples_per_second": 4.142, "eval_steps_per_second": 0.527, "step": 230 }, { "epoch": 16.14, "grad_norm": 0.19622142612934113, "learning_rate": 0.001, "loss": 0.3001, "step": 232 }, { "epoch": 16.28, "grad_norm": 19.05455207824707, "learning_rate": 0.001, "loss": 0.2708, "step": 234 }, { "epoch": 16.42, "grad_norm": 29.798582077026367, "learning_rate": 0.001, "loss": 0.2154, "step": 236 }, { "epoch": 16.56, "grad_norm": 8.835821151733398, "learning_rate": 0.001, "loss": 0.1348, "step": 238 }, { "epoch": 16.7, "grad_norm": 0.3760863244533539, "learning_rate": 0.001, "loss": 0.6235, "step": 240 }, { "epoch": 16.83, "grad_norm": 0.3473583459854126, "learning_rate": 0.001, "loss": 0.1445, "step": 242 }, { "epoch": 16.97, "grad_norm": 0.4041793942451477, "learning_rate": 0.001, "loss": 0.1546, "step": 244 }, { "epoch": 16.97, "eval_loss": 4.307888984680176, "eval_runtime": 81.6566, "eval_samples_per_second": 4.139, "eval_steps_per_second": 0.527, "step": 244 }, { "epoch": 17.11, "grad_norm": 0.2586219906806946, "learning_rate": 0.001, "loss": 0.1188, "step": 246 }, { "epoch": 17.25, "grad_norm": 0.4334220886230469, "learning_rate": 0.001, "loss": 0.1041, "step": 248 }, { "epoch": 17.39, "grad_norm": 17.520734786987305, "learning_rate": 0.001, "loss": 0.1108, "step": 250 }, { "epoch": 17.53, "grad_norm": 0.5943770408630371, "learning_rate": 0.001, "loss": 0.1146, "step": 252 }, { "epoch": 17.67, "grad_norm": 0.4325353503227234, "learning_rate": 0.001, "loss": 0.1325, "step": 254 }, { "epoch": 17.81, "grad_norm": 0.41412413120269775, "learning_rate": 0.001, "loss": 0.1491, "step": 256 }, { "epoch": 17.95, "grad_norm": 0.19986829161643982, "learning_rate": 0.001, "loss": 0.1375, "step": 258 }, { "epoch": 17.95, "eval_loss": 4.552526950836182, "eval_runtime": 81.6054, "eval_samples_per_second": 4.142, "eval_steps_per_second": 0.527, "step": 258 }, { "epoch": 18.09, "grad_norm": 0.7999384999275208, "learning_rate": 0.001, "loss": 0.1155, "step": 260 }, { "epoch": 18.23, "grad_norm": 0.17563021183013916, "learning_rate": 0.001, "loss": 0.1006, "step": 262 }, { "epoch": 18.37, "grad_norm": 0.17661228775978088, "learning_rate": 0.001, "loss": 0.1062, "step": 264 }, { "epoch": 18.5, "grad_norm": 0.17768113315105438, "learning_rate": 0.001, "loss": 0.1059, "step": 266 }, { "epoch": 18.64, "grad_norm": 0.15412819385528564, "learning_rate": 0.001, "loss": 0.0981, "step": 268 }, { "epoch": 18.78, "grad_norm": 0.1754271388053894, "learning_rate": 0.001, "loss": 0.0988, "step": 270 }, { "epoch": 18.92, "grad_norm": 0.15736614167690277, "learning_rate": 0.001, "loss": 0.1005, "step": 272 }, { "epoch": 18.99, "eval_loss": 4.900540828704834, "eval_runtime": 81.5789, "eval_samples_per_second": 4.143, "eval_steps_per_second": 0.527, "step": 273 }, { "epoch": 19.06, "grad_norm": 0.1531495302915573, "learning_rate": 0.001, "loss": 0.0844, "step": 274 }, { "epoch": 19.2, "grad_norm": 0.15237411856651306, "learning_rate": 0.001, "loss": 0.0752, "step": 276 }, { "epoch": 19.34, "grad_norm": 0.1433786153793335, "learning_rate": 0.001, "loss": 0.0782, "step": 278 }, { "epoch": 19.48, "grad_norm": 0.1296713650226593, "learning_rate": 0.001, "loss": 0.0808, "step": 280 }, { "epoch": 19.48, "eval_loss": 4.81671667098999, "eval_runtime": 81.4692, "eval_samples_per_second": 4.149, "eval_steps_per_second": 0.528, "step": 280 }, { "epoch": 19.48, "step": 280, "total_flos": 4.895208054457934e+18, "train_loss": 0.8494854368801628, "train_runtime": 68771.7044, "train_samples_per_second": 1.068, "train_steps_per_second": 0.004 } ], "logging_steps": 2, "max_steps": 280, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 4.895208054457934e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }