{ "best_metric": 0.8066528066528067, "best_model_checkpoint": "test-hasy-5/checkpoint-18935", "epoch": 100.0, "eval_steps": 500, "global_step": 54100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.92, "grad_norm": 4.6960930824279785, "learning_rate": 1.9815157116451017e-05, "loss": 3.9645, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.3970893970893971, "eval_loss": 3.429507255554199, "eval_runtime": 2.0488, "eval_samples_per_second": 234.77, "eval_steps_per_second": 29.773, "step": 541 }, { "epoch": 1.85, "grad_norm": 3.364806652069092, "learning_rate": 1.9630314232902035e-05, "loss": 3.4258, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.4781704781704782, "eval_loss": 2.879011392593384, "eval_runtime": 1.9708, "eval_samples_per_second": 244.059, "eval_steps_per_second": 30.951, "step": 1082 }, { "epoch": 2.77, "grad_norm": 3.35432767868042, "learning_rate": 1.944547134935305e-05, "loss": 3.04, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.5467775467775468, "eval_loss": 2.4893012046813965, "eval_runtime": 2.0175, "eval_samples_per_second": 238.411, "eval_steps_per_second": 30.235, "step": 1623 }, { "epoch": 3.7, "grad_norm": 4.459615230560303, "learning_rate": 1.9260628465804068e-05, "loss": 2.793, "step": 2000 }, { "epoch": 4.0, "eval_accuracy": 0.5738045738045738, "eval_loss": 2.2005958557128906, "eval_runtime": 1.962, "eval_samples_per_second": 245.161, "eval_steps_per_second": 31.091, "step": 2164 }, { "epoch": 4.62, "grad_norm": 4.481846332550049, "learning_rate": 1.9075785582255083e-05, "loss": 2.5551, "step": 2500 }, { "epoch": 5.0, "eval_accuracy": 0.6340956340956341, "eval_loss": 1.9055824279785156, "eval_runtime": 2.0221, "eval_samples_per_second": 237.875, "eval_steps_per_second": 30.167, "step": 2705 }, { "epoch": 5.55, "grad_norm": 4.1784281730651855, "learning_rate": 1.88909426987061e-05, "loss": 2.3662, "step": 3000 }, { "epoch": 6.0, "eval_accuracy": 0.6632016632016632, "eval_loss": 1.7023240327835083, "eval_runtime": 1.9666, "eval_samples_per_second": 244.588, "eval_steps_per_second": 31.018, "step": 3246 }, { "epoch": 6.47, "grad_norm": 6.06294059753418, "learning_rate": 1.8706099815157116e-05, "loss": 2.1965, "step": 3500 }, { "epoch": 7.0, "eval_accuracy": 0.6798336798336798, "eval_loss": 1.5739575624465942, "eval_runtime": 1.9941, "eval_samples_per_second": 241.216, "eval_steps_per_second": 30.591, "step": 3787 }, { "epoch": 7.39, "grad_norm": 4.912960052490234, "learning_rate": 1.8521256931608135e-05, "loss": 2.1397, "step": 4000 }, { "epoch": 8.0, "eval_accuracy": 0.6943866943866944, "eval_loss": 1.4560521841049194, "eval_runtime": 1.9835, "eval_samples_per_second": 242.507, "eval_steps_per_second": 30.754, "step": 4328 }, { "epoch": 8.32, "grad_norm": 5.236889362335205, "learning_rate": 1.833641404805915e-05, "loss": 1.9955, "step": 4500 }, { "epoch": 9.0, "eval_accuracy": 0.7234927234927235, "eval_loss": 1.3202540874481201, "eval_runtime": 2.0536, "eval_samples_per_second": 234.218, "eval_steps_per_second": 29.703, "step": 4869 }, { "epoch": 9.24, "grad_norm": 5.675503253936768, "learning_rate": 1.8151571164510168e-05, "loss": 1.9282, "step": 5000 }, { "epoch": 10.0, "eval_accuracy": 0.738045738045738, "eval_loss": 1.2246184349060059, "eval_runtime": 2.0017, "eval_samples_per_second": 240.293, "eval_steps_per_second": 30.474, "step": 5410 }, { "epoch": 10.17, "grad_norm": 4.67825174331665, "learning_rate": 1.7966728280961186e-05, "loss": 1.8368, "step": 5500 }, { "epoch": 11.0, "eval_accuracy": 0.738045738045738, "eval_loss": 1.1823257207870483, "eval_runtime": 1.9774, "eval_samples_per_second": 243.246, "eval_steps_per_second": 30.848, "step": 5951 }, { "epoch": 11.09, "grad_norm": 4.809859275817871, "learning_rate": 1.77818853974122e-05, "loss": 1.812, "step": 6000 }, { "epoch": 12.0, "eval_accuracy": 0.7214137214137214, "eval_loss": 1.1297953128814697, "eval_runtime": 2.0307, "eval_samples_per_second": 236.864, "eval_steps_per_second": 30.039, "step": 6492 }, { "epoch": 12.01, "grad_norm": 5.255190849304199, "learning_rate": 1.759704251386322e-05, "loss": 1.7353, "step": 6500 }, { "epoch": 12.94, "grad_norm": 8.597217559814453, "learning_rate": 1.7412199630314234e-05, "loss": 1.7195, "step": 7000 }, { "epoch": 13.0, "eval_accuracy": 0.7484407484407485, "eval_loss": 1.0423070192337036, "eval_runtime": 2.0193, "eval_samples_per_second": 238.201, "eval_steps_per_second": 30.208, "step": 7033 }, { "epoch": 13.86, "grad_norm": 6.453842639923096, "learning_rate": 1.7227356746765253e-05, "loss": 1.6314, "step": 7500 }, { "epoch": 14.0, "eval_accuracy": 0.7422037422037422, "eval_loss": 1.0077309608459473, "eval_runtime": 2.0783, "eval_samples_per_second": 231.439, "eval_steps_per_second": 29.351, "step": 7574 }, { "epoch": 14.79, "grad_norm": 8.70645523071289, "learning_rate": 1.7042513863216268e-05, "loss": 1.5979, "step": 8000 }, { "epoch": 15.0, "eval_accuracy": 0.7463617463617463, "eval_loss": 1.00509512424469, "eval_runtime": 1.9889, "eval_samples_per_second": 241.847, "eval_steps_per_second": 30.671, "step": 8115 }, { "epoch": 15.71, "grad_norm": 7.348147392272949, "learning_rate": 1.6857670979667286e-05, "loss": 1.5656, "step": 8500 }, { "epoch": 16.0, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.9325113296508789, "eval_runtime": 1.9923, "eval_samples_per_second": 241.43, "eval_steps_per_second": 30.618, "step": 8656 }, { "epoch": 16.64, "grad_norm": 6.420931816101074, "learning_rate": 1.66728280961183e-05, "loss": 1.5414, "step": 9000 }, { "epoch": 17.0, "eval_accuracy": 0.7733887733887734, "eval_loss": 0.8889437913894653, "eval_runtime": 1.9727, "eval_samples_per_second": 243.822, "eval_steps_per_second": 30.921, "step": 9197 }, { "epoch": 17.56, "grad_norm": 8.127350807189941, "learning_rate": 1.6487985212569316e-05, "loss": 1.5342, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.7484407484407485, "eval_loss": 0.9072721600532532, "eval_runtime": 1.9933, "eval_samples_per_second": 241.303, "eval_steps_per_second": 30.602, "step": 9738 }, { "epoch": 18.48, "grad_norm": 6.122061252593994, "learning_rate": 1.6303142329020334e-05, "loss": 1.4898, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.7713097713097713, "eval_loss": 0.8425627946853638, "eval_runtime": 1.9868, "eval_samples_per_second": 242.099, "eval_steps_per_second": 30.703, "step": 10279 }, { "epoch": 19.41, "grad_norm": 6.640945911407471, "learning_rate": 1.611829944547135e-05, "loss": 1.4731, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.7442827442827443, "eval_loss": 0.862506091594696, "eval_runtime": 1.9786, "eval_samples_per_second": 243.096, "eval_steps_per_second": 30.829, "step": 10820 }, { "epoch": 20.33, "grad_norm": 6.019400119781494, "learning_rate": 1.5933456561922367e-05, "loss": 1.451, "step": 11000 }, { "epoch": 21.0, "eval_accuracy": 0.762993762993763, "eval_loss": 0.8015209436416626, "eval_runtime": 1.9644, "eval_samples_per_second": 244.864, "eval_steps_per_second": 31.053, "step": 11361 }, { "epoch": 21.26, "grad_norm": 5.140503406524658, "learning_rate": 1.5748613678373382e-05, "loss": 1.4578, "step": 11500 }, { "epoch": 22.0, "eval_accuracy": 0.7588357588357588, "eval_loss": 0.8520306944847107, "eval_runtime": 2.0001, "eval_samples_per_second": 240.484, "eval_steps_per_second": 30.498, "step": 11902 }, { "epoch": 22.18, "grad_norm": 15.190984725952148, "learning_rate": 1.55637707948244e-05, "loss": 1.4126, "step": 12000 }, { "epoch": 23.0, "eval_accuracy": 0.7713097713097713, "eval_loss": 0.7928301692008972, "eval_runtime": 1.9822, "eval_samples_per_second": 242.66, "eval_steps_per_second": 30.774, "step": 12443 }, { "epoch": 23.11, "grad_norm": 11.220525741577148, "learning_rate": 1.5378927911275416e-05, "loss": 1.3626, "step": 12500 }, { "epoch": 24.0, "eval_accuracy": 0.7837837837837838, "eval_loss": 0.754388689994812, "eval_runtime": 1.975, "eval_samples_per_second": 243.545, "eval_steps_per_second": 30.886, "step": 12984 }, { "epoch": 24.03, "grad_norm": 3.5185582637786865, "learning_rate": 1.5194085027726432e-05, "loss": 1.3905, "step": 13000 }, { "epoch": 24.95, "grad_norm": 8.19352912902832, "learning_rate": 1.5009242144177449e-05, "loss": 1.3694, "step": 13500 }, { "epoch": 25.0, "eval_accuracy": 0.7775467775467776, "eval_loss": 0.7698755860328674, "eval_runtime": 2.0179, "eval_samples_per_second": 238.368, "eval_steps_per_second": 30.23, "step": 13525 }, { "epoch": 25.88, "grad_norm": 6.003907680511475, "learning_rate": 1.4824399260628467e-05, "loss": 1.3612, "step": 14000 }, { "epoch": 26.0, "eval_accuracy": 0.7775467775467776, "eval_loss": 0.7602183818817139, "eval_runtime": 1.9833, "eval_samples_per_second": 242.521, "eval_steps_per_second": 30.756, "step": 14066 }, { "epoch": 26.8, "grad_norm": 6.613931655883789, "learning_rate": 1.4639556377079484e-05, "loss": 1.2963, "step": 14500 }, { "epoch": 27.0, "eval_accuracy": 0.7713097713097713, "eval_loss": 0.7532169818878174, "eval_runtime": 2.0706, "eval_samples_per_second": 232.305, "eval_steps_per_second": 29.461, "step": 14607 }, { "epoch": 27.73, "grad_norm": 7.66683292388916, "learning_rate": 1.44547134935305e-05, "loss": 1.3009, "step": 15000 }, { "epoch": 28.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.7012535929679871, "eval_runtime": 1.9606, "eval_samples_per_second": 245.335, "eval_steps_per_second": 31.113, "step": 15148 }, { "epoch": 28.65, "grad_norm": 7.342077255249023, "learning_rate": 1.4269870609981517e-05, "loss": 1.2598, "step": 15500 }, { "epoch": 29.0, "eval_accuracy": 0.7796257796257796, "eval_loss": 0.7084705233573914, "eval_runtime": 1.9824, "eval_samples_per_second": 242.632, "eval_steps_per_second": 30.77, "step": 15689 }, { "epoch": 29.57, "grad_norm": 5.679790019989014, "learning_rate": 1.4085027726432534e-05, "loss": 1.2565, "step": 16000 }, { "epoch": 30.0, "eval_accuracy": 0.7775467775467776, "eval_loss": 0.7023281455039978, "eval_runtime": 1.9659, "eval_samples_per_second": 244.668, "eval_steps_per_second": 31.029, "step": 16230 }, { "epoch": 30.5, "grad_norm": 5.493412971496582, "learning_rate": 1.390018484288355e-05, "loss": 1.2735, "step": 16500 }, { "epoch": 31.0, "eval_accuracy": 0.7775467775467776, "eval_loss": 0.7047860026359558, "eval_runtime": 1.9718, "eval_samples_per_second": 243.937, "eval_steps_per_second": 30.936, "step": 16771 }, { "epoch": 31.42, "grad_norm": 6.2688093185424805, "learning_rate": 1.3715341959334567e-05, "loss": 1.2743, "step": 17000 }, { "epoch": 32.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6794067621231079, "eval_runtime": 1.9764, "eval_samples_per_second": 243.372, "eval_steps_per_second": 30.864, "step": 17312 }, { "epoch": 32.35, "grad_norm": 10.169917106628418, "learning_rate": 1.3530499075785584e-05, "loss": 1.2441, "step": 17500 }, { "epoch": 33.0, "eval_accuracy": 0.7858627858627859, "eval_loss": 0.693196713924408, "eval_runtime": 1.972, "eval_samples_per_second": 243.92, "eval_steps_per_second": 30.934, "step": 17853 }, { "epoch": 33.27, "grad_norm": 8.05045223236084, "learning_rate": 1.33456561922366e-05, "loss": 1.2282, "step": 18000 }, { "epoch": 34.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.7038751840591431, "eval_runtime": 1.992, "eval_samples_per_second": 241.466, "eval_steps_per_second": 30.623, "step": 18394 }, { "epoch": 34.2, "grad_norm": 5.410665035247803, "learning_rate": 1.3160813308687617e-05, "loss": 1.2204, "step": 18500 }, { "epoch": 35.0, "eval_accuracy": 0.8066528066528067, "eval_loss": 0.6860660910606384, "eval_runtime": 1.9773, "eval_samples_per_second": 243.258, "eval_steps_per_second": 30.85, "step": 18935 }, { "epoch": 35.12, "grad_norm": 11.123208045959473, "learning_rate": 1.2975970425138634e-05, "loss": 1.1808, "step": 19000 }, { "epoch": 36.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.6589930057525635, "eval_runtime": 1.9769, "eval_samples_per_second": 243.305, "eval_steps_per_second": 30.856, "step": 19476 }, { "epoch": 36.04, "grad_norm": 6.165465354919434, "learning_rate": 1.279112754158965e-05, "loss": 1.1933, "step": 19500 }, { "epoch": 36.97, "grad_norm": 6.407803535461426, "learning_rate": 1.2606284658040667e-05, "loss": 1.1928, "step": 20000 }, { "epoch": 37.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.678415834903717, "eval_runtime": 1.9844, "eval_samples_per_second": 242.388, "eval_steps_per_second": 30.739, "step": 20017 }, { "epoch": 37.89, "grad_norm": 4.849668979644775, "learning_rate": 1.2421441774491683e-05, "loss": 1.1914, "step": 20500 }, { "epoch": 38.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.6559053659439087, "eval_runtime": 1.9912, "eval_samples_per_second": 241.56, "eval_steps_per_second": 30.634, "step": 20558 }, { "epoch": 38.82, "grad_norm": 9.1309232711792, "learning_rate": 1.2236598890942698e-05, "loss": 1.1856, "step": 21000 }, { "epoch": 39.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.6769025325775146, "eval_runtime": 2.0066, "eval_samples_per_second": 239.713, "eval_steps_per_second": 30.4, "step": 21099 }, { "epoch": 39.74, "grad_norm": 5.001546382904053, "learning_rate": 1.2051756007393715e-05, "loss": 1.1585, "step": 21500 }, { "epoch": 40.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.64976966381073, "eval_runtime": 1.9804, "eval_samples_per_second": 242.874, "eval_steps_per_second": 30.801, "step": 21640 }, { "epoch": 40.67, "grad_norm": 14.044866561889648, "learning_rate": 1.1866913123844732e-05, "loss": 1.1713, "step": 22000 }, { "epoch": 41.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6447434425354004, "eval_runtime": 1.9973, "eval_samples_per_second": 240.829, "eval_steps_per_second": 30.542, "step": 22181 }, { "epoch": 41.59, "grad_norm": 10.289350509643555, "learning_rate": 1.1682070240295748e-05, "loss": 1.1183, "step": 22500 }, { "epoch": 42.0, "eval_accuracy": 0.7713097713097713, "eval_loss": 0.6748064756393433, "eval_runtime": 1.9672, "eval_samples_per_second": 244.509, "eval_steps_per_second": 31.008, "step": 22722 }, { "epoch": 42.51, "grad_norm": 12.7116117477417, "learning_rate": 1.1497227356746765e-05, "loss": 1.1564, "step": 23000 }, { "epoch": 43.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6545261740684509, "eval_runtime": 1.9659, "eval_samples_per_second": 244.669, "eval_steps_per_second": 31.029, "step": 23263 }, { "epoch": 43.44, "grad_norm": 3.0720624923706055, "learning_rate": 1.1312384473197783e-05, "loss": 1.1215, "step": 23500 }, { "epoch": 44.0, "eval_accuracy": 0.7879417879417879, "eval_loss": 0.6690270900726318, "eval_runtime": 1.9635, "eval_samples_per_second": 244.971, "eval_steps_per_second": 31.067, "step": 23804 }, { "epoch": 44.36, "grad_norm": 7.927094459533691, "learning_rate": 1.11275415896488e-05, "loss": 1.1008, "step": 24000 }, { "epoch": 45.0, "eval_accuracy": 0.7879417879417879, "eval_loss": 0.659792423248291, "eval_runtime": 1.9747, "eval_samples_per_second": 243.578, "eval_steps_per_second": 30.89, "step": 24345 }, { "epoch": 45.29, "grad_norm": 8.912357330322266, "learning_rate": 1.0942698706099817e-05, "loss": 1.1344, "step": 24500 }, { "epoch": 46.0, "eval_accuracy": 0.8024948024948025, "eval_loss": 0.6550182104110718, "eval_runtime": 2.0112, "eval_samples_per_second": 239.156, "eval_steps_per_second": 30.33, "step": 24886 }, { "epoch": 46.21, "grad_norm": 9.598004341125488, "learning_rate": 1.0757855822550833e-05, "loss": 1.126, "step": 25000 }, { "epoch": 47.0, "eval_accuracy": 0.7858627858627859, "eval_loss": 0.6521425247192383, "eval_runtime": 1.9713, "eval_samples_per_second": 244.004, "eval_steps_per_second": 30.944, "step": 25427 }, { "epoch": 47.13, "grad_norm": 4.670881271362305, "learning_rate": 1.057301293900185e-05, "loss": 1.125, "step": 25500 }, { "epoch": 48.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.6812848448753357, "eval_runtime": 2.016, "eval_samples_per_second": 238.588, "eval_steps_per_second": 30.258, "step": 25968 }, { "epoch": 48.06, "grad_norm": 8.11451244354248, "learning_rate": 1.0388170055452866e-05, "loss": 1.0682, "step": 26000 }, { "epoch": 48.98, "grad_norm": 8.960821151733398, "learning_rate": 1.0203327171903883e-05, "loss": 1.0855, "step": 26500 }, { "epoch": 49.0, "eval_accuracy": 0.7858627858627859, "eval_loss": 0.6419298052787781, "eval_runtime": 1.974, "eval_samples_per_second": 243.673, "eval_steps_per_second": 30.902, "step": 26509 }, { "epoch": 49.91, "grad_norm": 3.053118944168091, "learning_rate": 1.00184842883549e-05, "loss": 1.0452, "step": 27000 }, { "epoch": 50.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.6550863981246948, "eval_runtime": 2.0504, "eval_samples_per_second": 234.587, "eval_steps_per_second": 29.75, "step": 27050 }, { "epoch": 50.83, "grad_norm": 5.4594340324401855, "learning_rate": 9.833641404805916e-06, "loss": 1.0626, "step": 27500 }, { "epoch": 51.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6675499081611633, "eval_runtime": 1.9991, "eval_samples_per_second": 240.605, "eval_steps_per_second": 30.513, "step": 27591 }, { "epoch": 51.76, "grad_norm": 8.158236503601074, "learning_rate": 9.648798521256933e-06, "loss": 1.0155, "step": 28000 }, { "epoch": 52.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6945971846580505, "eval_runtime": 1.9873, "eval_samples_per_second": 242.042, "eval_steps_per_second": 30.696, "step": 28132 }, { "epoch": 52.68, "grad_norm": 5.626604080200195, "learning_rate": 9.46395563770795e-06, "loss": 1.0319, "step": 28500 }, { "epoch": 53.0, "eval_accuracy": 0.7796257796257796, "eval_loss": 0.6942130923271179, "eval_runtime": 1.966, "eval_samples_per_second": 244.665, "eval_steps_per_second": 31.028, "step": 28673 }, { "epoch": 53.6, "grad_norm": 6.82182502746582, "learning_rate": 9.279112754158966e-06, "loss": 1.0488, "step": 29000 }, { "epoch": 54.0, "eval_accuracy": 0.7983367983367984, "eval_loss": 0.6496003866195679, "eval_runtime": 2.0069, "eval_samples_per_second": 239.673, "eval_steps_per_second": 30.395, "step": 29214 }, { "epoch": 54.53, "grad_norm": 7.865675926208496, "learning_rate": 9.094269870609981e-06, "loss": 1.0558, "step": 29500 }, { "epoch": 55.0, "eval_accuracy": 0.8045738045738046, "eval_loss": 0.6465332508087158, "eval_runtime": 1.9938, "eval_samples_per_second": 241.25, "eval_steps_per_second": 30.595, "step": 29755 }, { "epoch": 55.45, "grad_norm": 7.172035217285156, "learning_rate": 8.909426987060998e-06, "loss": 0.9913, "step": 30000 }, { "epoch": 56.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6654109954833984, "eval_runtime": 1.9693, "eval_samples_per_second": 244.248, "eval_steps_per_second": 30.975, "step": 30296 }, { "epoch": 56.38, "grad_norm": 6.30518102645874, "learning_rate": 8.724584103512016e-06, "loss": 1.0555, "step": 30500 }, { "epoch": 57.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.656141996383667, "eval_runtime": 1.9741, "eval_samples_per_second": 243.66, "eval_steps_per_second": 30.901, "step": 30837 }, { "epoch": 57.3, "grad_norm": 3.0917370319366455, "learning_rate": 8.539741219963033e-06, "loss": 0.9803, "step": 31000 }, { "epoch": 58.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.673220157623291, "eval_runtime": 1.993, "eval_samples_per_second": 241.346, "eval_steps_per_second": 30.607, "step": 31378 }, { "epoch": 58.23, "grad_norm": 8.285308837890625, "learning_rate": 8.35489833641405e-06, "loss": 1.0393, "step": 31500 }, { "epoch": 59.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.6892696619033813, "eval_runtime": 1.9798, "eval_samples_per_second": 242.949, "eval_steps_per_second": 30.811, "step": 31919 }, { "epoch": 59.15, "grad_norm": 3.1396327018737793, "learning_rate": 8.170055452865066e-06, "loss": 0.9677, "step": 32000 }, { "epoch": 60.0, "eval_accuracy": 0.8045738045738046, "eval_loss": 0.6823599934577942, "eval_runtime": 2.0127, "eval_samples_per_second": 238.985, "eval_steps_per_second": 30.308, "step": 32460 }, { "epoch": 60.07, "grad_norm": 12.875879287719727, "learning_rate": 7.985212569316083e-06, "loss": 1.0366, "step": 32500 }, { "epoch": 61.0, "grad_norm": 8.770364761352539, "learning_rate": 7.8003696857671e-06, "loss": 1.0082, "step": 33000 }, { "epoch": 61.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.6618274450302124, "eval_runtime": 2.0008, "eval_samples_per_second": 240.403, "eval_steps_per_second": 30.488, "step": 33001 }, { "epoch": 61.92, "grad_norm": 6.0600972175598145, "learning_rate": 7.615526802218115e-06, "loss": 1.0096, "step": 33500 }, { "epoch": 62.0, "eval_accuracy": 0.7837837837837838, "eval_loss": 0.6691136360168457, "eval_runtime": 1.9817, "eval_samples_per_second": 242.719, "eval_steps_per_second": 30.781, "step": 33542 }, { "epoch": 62.85, "grad_norm": 10.777630805969238, "learning_rate": 7.430683918669132e-06, "loss": 0.9685, "step": 34000 }, { "epoch": 63.0, "eval_accuracy": 0.8024948024948025, "eval_loss": 0.6792653203010559, "eval_runtime": 2.0645, "eval_samples_per_second": 232.985, "eval_steps_per_second": 29.547, "step": 34083 }, { "epoch": 63.77, "grad_norm": 3.9615447521209717, "learning_rate": 7.245841035120148e-06, "loss": 0.9847, "step": 34500 }, { "epoch": 64.0, "eval_accuracy": 0.7837837837837838, "eval_loss": 0.6894533634185791, "eval_runtime": 2.0054, "eval_samples_per_second": 239.847, "eval_steps_per_second": 30.417, "step": 34624 }, { "epoch": 64.7, "grad_norm": 9.38687801361084, "learning_rate": 7.060998151571166e-06, "loss": 0.9639, "step": 35000 }, { "epoch": 65.0, "eval_accuracy": 0.7733887733887734, "eval_loss": 0.7297117114067078, "eval_runtime": 2.0147, "eval_samples_per_second": 238.744, "eval_steps_per_second": 30.277, "step": 35165 }, { "epoch": 65.62, "grad_norm": 12.292973518371582, "learning_rate": 6.876155268022182e-06, "loss": 0.9776, "step": 35500 }, { "epoch": 66.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6561179757118225, "eval_runtime": 1.9845, "eval_samples_per_second": 242.381, "eval_steps_per_second": 30.739, "step": 35706 }, { "epoch": 66.54, "grad_norm": 14.023015022277832, "learning_rate": 6.691312384473199e-06, "loss": 1.0074, "step": 36000 }, { "epoch": 67.0, "eval_accuracy": 0.7775467775467776, "eval_loss": 0.6998913884162903, "eval_runtime": 1.9686, "eval_samples_per_second": 244.338, "eval_steps_per_second": 30.987, "step": 36247 }, { "epoch": 67.47, "grad_norm": 13.870222091674805, "learning_rate": 6.506469500924215e-06, "loss": 0.9466, "step": 36500 }, { "epoch": 68.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.6880961656570435, "eval_runtime": 1.9686, "eval_samples_per_second": 244.34, "eval_steps_per_second": 30.987, "step": 36788 }, { "epoch": 68.39, "grad_norm": 6.1949639320373535, "learning_rate": 6.321626617375231e-06, "loss": 0.9425, "step": 37000 }, { "epoch": 69.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.6805587410926819, "eval_runtime": 1.9709, "eval_samples_per_second": 244.052, "eval_steps_per_second": 30.95, "step": 37329 }, { "epoch": 69.32, "grad_norm": 7.145143508911133, "learning_rate": 6.136783733826248e-06, "loss": 0.9594, "step": 37500 }, { "epoch": 70.0, "eval_accuracy": 0.7900207900207901, "eval_loss": 0.7202461361885071, "eval_runtime": 2.0125, "eval_samples_per_second": 239.001, "eval_steps_per_second": 30.31, "step": 37870 }, { "epoch": 70.24, "grad_norm": 9.215810775756836, "learning_rate": 5.951940850277265e-06, "loss": 0.9311, "step": 38000 }, { "epoch": 71.0, "eval_accuracy": 0.7754677754677755, "eval_loss": 0.7161967754364014, "eval_runtime": 1.977, "eval_samples_per_second": 243.297, "eval_steps_per_second": 30.855, "step": 38411 }, { "epoch": 71.16, "grad_norm": 6.461187362670898, "learning_rate": 5.767097966728281e-06, "loss": 0.9429, "step": 38500 }, { "epoch": 72.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.7284368276596069, "eval_runtime": 2.0663, "eval_samples_per_second": 232.783, "eval_steps_per_second": 29.521, "step": 38952 }, { "epoch": 72.09, "grad_norm": 11.850204467773438, "learning_rate": 5.582255083179298e-06, "loss": 0.9666, "step": 39000 }, { "epoch": 73.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.6871474981307983, "eval_runtime": 1.97, "eval_samples_per_second": 244.162, "eval_steps_per_second": 30.964, "step": 39493 }, { "epoch": 73.01, "grad_norm": 8.0579252243042, "learning_rate": 5.3974121996303146e-06, "loss": 0.932, "step": 39500 }, { "epoch": 73.94, "grad_norm": 1.1972132921218872, "learning_rate": 5.212569316081332e-06, "loss": 0.945, "step": 40000 }, { "epoch": 74.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.6778899431228638, "eval_runtime": 2.0376, "eval_samples_per_second": 236.067, "eval_steps_per_second": 29.938, "step": 40034 }, { "epoch": 74.86, "grad_norm": 5.484439849853516, "learning_rate": 5.027726432532349e-06, "loss": 0.9387, "step": 40500 }, { "epoch": 75.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.735752522945404, "eval_runtime": 1.9762, "eval_samples_per_second": 243.395, "eval_steps_per_second": 30.867, "step": 40575 }, { "epoch": 75.79, "grad_norm": 2.0908420085906982, "learning_rate": 4.8428835489833645e-06, "loss": 0.9132, "step": 41000 }, { "epoch": 76.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.7043733596801758, "eval_runtime": 1.9764, "eval_samples_per_second": 243.367, "eval_steps_per_second": 30.864, "step": 41116 }, { "epoch": 76.71, "grad_norm": 10.380330085754395, "learning_rate": 4.658040665434381e-06, "loss": 0.9181, "step": 41500 }, { "epoch": 77.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.7041053771972656, "eval_runtime": 2.0006, "eval_samples_per_second": 240.43, "eval_steps_per_second": 30.491, "step": 41657 }, { "epoch": 77.63, "grad_norm": 9.135781288146973, "learning_rate": 4.473197781885398e-06, "loss": 0.9218, "step": 42000 }, { "epoch": 78.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.6986111998558044, "eval_runtime": 1.9688, "eval_samples_per_second": 244.309, "eval_steps_per_second": 30.983, "step": 42198 }, { "epoch": 78.56, "grad_norm": 17.338001251220703, "learning_rate": 4.288354898336414e-06, "loss": 0.8621, "step": 42500 }, { "epoch": 79.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.6909247040748596, "eval_runtime": 1.997, "eval_samples_per_second": 240.86, "eval_steps_per_second": 30.546, "step": 42739 }, { "epoch": 79.48, "grad_norm": 6.793923854827881, "learning_rate": 4.103512014787431e-06, "loss": 0.9236, "step": 43000 }, { "epoch": 80.0, "eval_accuracy": 0.7983367983367984, "eval_loss": 0.7135599851608276, "eval_runtime": 1.9949, "eval_samples_per_second": 241.121, "eval_steps_per_second": 30.579, "step": 43280 }, { "epoch": 80.41, "grad_norm": 3.9345781803131104, "learning_rate": 3.918669131238448e-06, "loss": 0.8667, "step": 43500 }, { "epoch": 81.0, "eval_accuracy": 0.8024948024948025, "eval_loss": 0.7008742094039917, "eval_runtime": 1.992, "eval_samples_per_second": 241.461, "eval_steps_per_second": 30.622, "step": 43821 }, { "epoch": 81.33, "grad_norm": 16.883420944213867, "learning_rate": 3.7338262476894642e-06, "loss": 0.8856, "step": 44000 }, { "epoch": 82.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.7127683162689209, "eval_runtime": 2.0451, "eval_samples_per_second": 235.201, "eval_steps_per_second": 29.828, "step": 44362 }, { "epoch": 82.26, "grad_norm": 7.969069480895996, "learning_rate": 3.548983364140481e-06, "loss": 0.917, "step": 44500 }, { "epoch": 83.0, "eval_accuracy": 0.7983367983367984, "eval_loss": 0.7134777307510376, "eval_runtime": 2.0529, "eval_samples_per_second": 234.298, "eval_steps_per_second": 29.714, "step": 44903 }, { "epoch": 83.18, "grad_norm": 1.545163631439209, "learning_rate": 3.3641404805914975e-06, "loss": 0.8835, "step": 45000 }, { "epoch": 84.0, "eval_accuracy": 0.7900207900207901, "eval_loss": 0.7295302748680115, "eval_runtime": 1.9747, "eval_samples_per_second": 243.583, "eval_steps_per_second": 30.891, "step": 45444 }, { "epoch": 84.1, "grad_norm": 5.072544097900391, "learning_rate": 3.1792975970425146e-06, "loss": 0.8879, "step": 45500 }, { "epoch": 85.0, "eval_accuracy": 0.7900207900207901, "eval_loss": 0.7449509501457214, "eval_runtime": 2.0759, "eval_samples_per_second": 231.71, "eval_steps_per_second": 29.385, "step": 45985 }, { "epoch": 85.03, "grad_norm": 5.645694732666016, "learning_rate": 2.9944547134935308e-06, "loss": 0.9114, "step": 46000 }, { "epoch": 85.95, "grad_norm": 5.065194129943848, "learning_rate": 2.8096118299445474e-06, "loss": 0.8764, "step": 46500 }, { "epoch": 86.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.7362204194068909, "eval_runtime": 1.9869, "eval_samples_per_second": 242.081, "eval_steps_per_second": 30.7, "step": 46526 }, { "epoch": 86.88, "grad_norm": 5.654088020324707, "learning_rate": 2.624768946395564e-06, "loss": 0.8674, "step": 47000 }, { "epoch": 87.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.7232093811035156, "eval_runtime": 2.0232, "eval_samples_per_second": 237.746, "eval_steps_per_second": 30.151, "step": 47067 }, { "epoch": 87.8, "grad_norm": 12.72859001159668, "learning_rate": 2.4399260628465807e-06, "loss": 0.8583, "step": 47500 }, { "epoch": 88.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.7407870888710022, "eval_runtime": 1.9934, "eval_samples_per_second": 241.296, "eval_steps_per_second": 30.601, "step": 47608 }, { "epoch": 88.72, "grad_norm": 6.526777744293213, "learning_rate": 2.2550831792975973e-06, "loss": 0.881, "step": 48000 }, { "epoch": 89.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.7377821803092957, "eval_runtime": 1.9802, "eval_samples_per_second": 242.901, "eval_steps_per_second": 30.804, "step": 48149 }, { "epoch": 89.65, "grad_norm": 8.497318267822266, "learning_rate": 2.070240295748614e-06, "loss": 0.8668, "step": 48500 }, { "epoch": 90.0, "eval_accuracy": 0.7900207900207901, "eval_loss": 0.7473007440567017, "eval_runtime": 2.0137, "eval_samples_per_second": 238.867, "eval_steps_per_second": 30.293, "step": 48690 }, { "epoch": 90.57, "grad_norm": 6.455136775970459, "learning_rate": 1.8853974121996305e-06, "loss": 0.8779, "step": 49000 }, { "epoch": 91.0, "eval_accuracy": 0.7983367983367984, "eval_loss": 0.7438368201255798, "eval_runtime": 1.9731, "eval_samples_per_second": 243.774, "eval_steps_per_second": 30.915, "step": 49231 }, { "epoch": 91.5, "grad_norm": 5.713993072509766, "learning_rate": 1.700554528650647e-06, "loss": 0.8717, "step": 49500 }, { "epoch": 92.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.7389739751815796, "eval_runtime": 1.9686, "eval_samples_per_second": 244.34, "eval_steps_per_second": 30.987, "step": 49772 }, { "epoch": 92.42, "grad_norm": 5.342690467834473, "learning_rate": 1.5157116451016638e-06, "loss": 0.8781, "step": 50000 }, { "epoch": 93.0, "eval_accuracy": 0.7983367983367984, "eval_loss": 0.7473535537719727, "eval_runtime": 1.98, "eval_samples_per_second": 242.926, "eval_steps_per_second": 30.808, "step": 50313 }, { "epoch": 93.35, "grad_norm": 9.870634078979492, "learning_rate": 1.3308687615526802e-06, "loss": 0.8845, "step": 50500 }, { "epoch": 94.0, "eval_accuracy": 0.7900207900207901, "eval_loss": 0.7445840835571289, "eval_runtime": 1.9776, "eval_samples_per_second": 243.222, "eval_steps_per_second": 30.845, "step": 50854 }, { "epoch": 94.27, "grad_norm": 8.909347534179688, "learning_rate": 1.1460258780036969e-06, "loss": 0.8623, "step": 51000 }, { "epoch": 95.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.7315581440925598, "eval_runtime": 1.9728, "eval_samples_per_second": 243.814, "eval_steps_per_second": 30.92, "step": 51395 }, { "epoch": 95.19, "grad_norm": 10.748625755310059, "learning_rate": 9.611829944547135e-07, "loss": 0.8341, "step": 51500 }, { "epoch": 96.0, "eval_accuracy": 0.7879417879417879, "eval_loss": 0.7457364201545715, "eval_runtime": 2.0017, "eval_samples_per_second": 240.29, "eval_steps_per_second": 30.473, "step": 51936 }, { "epoch": 96.12, "grad_norm": 3.179774761199951, "learning_rate": 7.763401109057302e-07, "loss": 0.8766, "step": 52000 }, { "epoch": 97.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.743617832660675, "eval_runtime": 1.999, "eval_samples_per_second": 240.625, "eval_steps_per_second": 30.516, "step": 52477 }, { "epoch": 97.04, "grad_norm": 12.243720054626465, "learning_rate": 5.914972273567468e-07, "loss": 0.8101, "step": 52500 }, { "epoch": 97.97, "grad_norm": 18.670886993408203, "learning_rate": 4.066543438077634e-07, "loss": 0.8681, "step": 53000 }, { "epoch": 98.0, "eval_accuracy": 0.7900207900207901, "eval_loss": 0.7483807802200317, "eval_runtime": 2.0039, "eval_samples_per_second": 240.035, "eval_steps_per_second": 30.441, "step": 53018 }, { "epoch": 98.89, "grad_norm": 8.483085632324219, "learning_rate": 2.2181146025878005e-07, "loss": 0.8635, "step": 53500 }, { "epoch": 99.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.7391884922981262, "eval_runtime": 1.9875, "eval_samples_per_second": 242.013, "eval_steps_per_second": 30.692, "step": 53559 }, { "epoch": 99.82, "grad_norm": 10.068202018737793, "learning_rate": 3.696857670979668e-08, "loss": 0.8091, "step": 54000 }, { "epoch": 100.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.7390549182891846, "eval_runtime": 2.0448, "eval_samples_per_second": 235.228, "eval_steps_per_second": 29.831, "step": 54100 }, { "epoch": 100.0, "step": 54100, "total_flos": 3.355193271048192e+19, "train_loss": 1.2517024893769495, "train_runtime": 5380.7558, "train_samples_per_second": 80.379, "train_steps_per_second": 10.054 } ], "logging_steps": 500, "max_steps": 54100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 3.355193271048192e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }