{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23143277929728592, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004207868714496108, "grad_norm": 10.497783660888672, "learning_rate": 0.0004909090909090909, "loss": 0.7938, "step": 10 }, { "epoch": 0.004207868714496108, "eval_accuracy": 0.5304018259048462, "eval_loss": 0.7331877946853638, "eval_runtime": 568.2697, "eval_samples_per_second": 8.364, "eval_steps_per_second": 2.092, "step": 10 }, { "epoch": 0.008415737428992216, "grad_norm": 2.557694435119629, "learning_rate": 0.00048181818181818184, "loss": 0.7504, "step": 20 }, { "epoch": 0.008415737428992216, "eval_accuracy": 0.5634336471557617, "eval_loss": 0.7359612584114075, "eval_runtime": 559.3032, "eval_samples_per_second": 8.498, "eval_steps_per_second": 2.126, "step": 20 }, { "epoch": 0.012623606143488323, "grad_norm": 3.9564919471740723, "learning_rate": 0.0004727272727272727, "loss": 0.7866, "step": 30 }, { "epoch": 0.012623606143488323, "eval_accuracy": 0.4085840582847595, "eval_loss": 0.7127184271812439, "eval_runtime": 567.0848, "eval_samples_per_second": 8.381, "eval_steps_per_second": 2.097, "step": 30 }, { "epoch": 0.016831474857984433, "grad_norm": 29.89333724975586, "learning_rate": 0.00046363636363636366, "loss": 0.678, "step": 40 }, { "epoch": 0.016831474857984433, "eval_accuracy": 0.6875657439231873, "eval_loss": 0.6557931303977966, "eval_runtime": 563.9541, "eval_samples_per_second": 8.428, "eval_steps_per_second": 2.108, "step": 40 }, { "epoch": 0.021039343572480537, "grad_norm": 1.9482946395874023, "learning_rate": 0.00045454545454545455, "loss": 0.8634, "step": 50 }, { "epoch": 0.021039343572480537, "eval_accuracy": 0.6572691202163696, "eval_loss": 0.613047182559967, "eval_runtime": 565.9198, "eval_samples_per_second": 8.399, "eval_steps_per_second": 2.101, "step": 50 }, { "epoch": 0.025247212286976645, "grad_norm": 1.0520250797271729, "learning_rate": 0.00044545454545454543, "loss": 0.5117, "step": 60 }, { "epoch": 0.025247212286976645, "eval_accuracy": 0.7416368722915649, "eval_loss": 0.5154983997344971, "eval_runtime": 566.7191, "eval_samples_per_second": 8.387, "eval_steps_per_second": 2.098, "step": 60 }, { "epoch": 0.029455081001472753, "grad_norm": 3.958573341369629, "learning_rate": 0.00043636363636363637, "loss": 0.6066, "step": 70 }, { "epoch": 0.029455081001472753, "eval_accuracy": 0.7858194708824158, "eval_loss": 0.5255141854286194, "eval_runtime": 561.6163, "eval_samples_per_second": 8.463, "eval_steps_per_second": 2.117, "step": 70 }, { "epoch": 0.033662949715968865, "grad_norm": 7.255526542663574, "learning_rate": 0.00042727272727272726, "loss": 0.5282, "step": 80 }, { "epoch": 0.033662949715968865, "eval_accuracy": 0.8045445084571838, "eval_loss": 0.6335883140563965, "eval_runtime": 565.1826, "eval_samples_per_second": 8.41, "eval_steps_per_second": 2.104, "step": 80 }, { "epoch": 0.03787081843046497, "grad_norm": 4.439815521240234, "learning_rate": 0.00041818181818181814, "loss": 0.3694, "step": 90 }, { "epoch": 0.03787081843046497, "eval_accuracy": 0.8161161541938782, "eval_loss": 0.464495450258255, "eval_runtime": 562.0768, "eval_samples_per_second": 8.456, "eval_steps_per_second": 2.115, "step": 90 }, { "epoch": 0.042078687144961074, "grad_norm": 1.75038743019104, "learning_rate": 0.00040909090909090913, "loss": 0.4786, "step": 100 }, { "epoch": 0.042078687144961074, "eval_accuracy": 0.8079107999801636, "eval_loss": 0.4171961545944214, "eval_runtime": 555.7896, "eval_samples_per_second": 8.552, "eval_steps_per_second": 2.139, "step": 100 }, { "epoch": 0.046286555859457186, "grad_norm": 2.398599624633789, "learning_rate": 0.0004, "loss": 0.3394, "step": 110 }, { "epoch": 0.046286555859457186, "eval_accuracy": 0.8739743232727051, "eval_loss": 0.353726327419281, "eval_runtime": 558.2722, "eval_samples_per_second": 8.514, "eval_steps_per_second": 2.13, "step": 110 }, { "epoch": 0.05049442457395329, "grad_norm": 2.4667985439300537, "learning_rate": 0.00039090909090909096, "loss": 0.4313, "step": 120 }, { "epoch": 0.05049442457395329, "eval_accuracy": 0.8889122605323792, "eval_loss": 0.2820029556751251, "eval_runtime": 563.3341, "eval_samples_per_second": 8.437, "eval_steps_per_second": 2.111, "step": 120 }, { "epoch": 0.0547022932884494, "grad_norm": 1.1293903589248657, "learning_rate": 0.00038181818181818184, "loss": 0.3352, "step": 130 }, { "epoch": 0.0547022932884494, "eval_accuracy": 0.8889122605323792, "eval_loss": 0.28811973333358765, "eval_runtime": 563.2561, "eval_samples_per_second": 8.438, "eval_steps_per_second": 2.111, "step": 130 }, { "epoch": 0.05891016200294551, "grad_norm": 17.52765655517578, "learning_rate": 0.00037272727272727273, "loss": 0.3772, "step": 140 }, { "epoch": 0.05891016200294551, "eval_accuracy": 0.8116979002952576, "eval_loss": 0.4701159596443176, "eval_runtime": 565.5511, "eval_samples_per_second": 8.404, "eval_steps_per_second": 2.102, "step": 140 }, { "epoch": 0.06311803071744161, "grad_norm": 2.1096954345703125, "learning_rate": 0.00036363636363636367, "loss": 0.3699, "step": 150 }, { "epoch": 0.06311803071744161, "eval_accuracy": 0.8693456649780273, "eval_loss": 0.32162848114967346, "eval_runtime": 561.7642, "eval_samples_per_second": 8.461, "eval_steps_per_second": 2.117, "step": 150 }, { "epoch": 0.06732589943193773, "grad_norm": 0.7017167210578918, "learning_rate": 0.00035454545454545455, "loss": 0.4895, "step": 160 }, { "epoch": 0.06732589943193773, "eval_accuracy": 0.8817588686943054, "eval_loss": 0.36469992995262146, "eval_runtime": 563.6361, "eval_samples_per_second": 8.433, "eval_steps_per_second": 2.11, "step": 160 }, { "epoch": 0.07153376814643383, "grad_norm": 0.48568668961524963, "learning_rate": 0.00034545454545454544, "loss": 0.46, "step": 170 }, { "epoch": 0.07153376814643383, "eval_accuracy": 0.8449400663375854, "eval_loss": 0.35531896352767944, "eval_runtime": 566.3467, "eval_samples_per_second": 8.392, "eval_steps_per_second": 2.099, "step": 170 }, { "epoch": 0.07574163686092994, "grad_norm": 0.6846579909324646, "learning_rate": 0.0003363636363636364, "loss": 0.3931, "step": 180 }, { "epoch": 0.07574163686092994, "eval_accuracy": 0.8916473984718323, "eval_loss": 0.2812640070915222, "eval_runtime": 565.8446, "eval_samples_per_second": 8.4, "eval_steps_per_second": 2.101, "step": 180 }, { "epoch": 0.07994950557542604, "grad_norm": 2.391395092010498, "learning_rate": 0.00032727272727272726, "loss": 0.3666, "step": 190 }, { "epoch": 0.07994950557542604, "eval_accuracy": 0.8321060538291931, "eval_loss": 0.4278256595134735, "eval_runtime": 562.6059, "eval_samples_per_second": 8.448, "eval_steps_per_second": 2.113, "step": 190 }, { "epoch": 0.08415737428992215, "grad_norm": 1.708675503730774, "learning_rate": 0.0003181818181818182, "loss": 0.3471, "step": 200 }, { "epoch": 0.08415737428992215, "eval_accuracy": 0.9156322479248047, "eval_loss": 0.23789770901203156, "eval_runtime": 565.4642, "eval_samples_per_second": 8.405, "eval_steps_per_second": 2.103, "step": 200 }, { "epoch": 0.08836524300441827, "grad_norm": 0.4107959270477295, "learning_rate": 0.0003090909090909091, "loss": 0.4351, "step": 210 }, { "epoch": 0.08836524300441827, "eval_accuracy": 0.8889122605323792, "eval_loss": 0.29446446895599365, "eval_runtime": 562.2329, "eval_samples_per_second": 8.454, "eval_steps_per_second": 2.115, "step": 210 }, { "epoch": 0.09257311171891437, "grad_norm": 10.296160697937012, "learning_rate": 0.0003, "loss": 0.262, "step": 220 }, { "epoch": 0.09257311171891437, "eval_accuracy": 0.9017462730407715, "eval_loss": 0.307280033826828, "eval_runtime": 561.3167, "eval_samples_per_second": 8.468, "eval_steps_per_second": 2.118, "step": 220 }, { "epoch": 0.09678098043341048, "grad_norm": 0.6356618404388428, "learning_rate": 0.0002909090909090909, "loss": 0.3291, "step": 230 }, { "epoch": 0.09678098043341048, "eval_accuracy": 0.8752366900444031, "eval_loss": 0.3042367994785309, "eval_runtime": 567.4141, "eval_samples_per_second": 8.377, "eval_steps_per_second": 2.095, "step": 230 }, { "epoch": 0.10098884914790658, "grad_norm": 1.1508342027664185, "learning_rate": 0.0002818181818181818, "loss": 0.239, "step": 240 }, { "epoch": 0.10098884914790658, "eval_accuracy": 0.9150010347366333, "eval_loss": 0.20924389362335205, "eval_runtime": 564.144, "eval_samples_per_second": 8.425, "eval_steps_per_second": 2.108, "step": 240 }, { "epoch": 0.1051967178624027, "grad_norm": 1.0988367795944214, "learning_rate": 0.00027272727272727274, "loss": 0.4306, "step": 250 }, { "epoch": 0.1051967178624027, "eval_accuracy": 0.905954122543335, "eval_loss": 0.24196282029151917, "eval_runtime": 565.5038, "eval_samples_per_second": 8.405, "eval_steps_per_second": 2.103, "step": 250 }, { "epoch": 0.1094045865768988, "grad_norm": 1.5871909856796265, "learning_rate": 0.0002636363636363636, "loss": 0.2757, "step": 260 }, { "epoch": 0.1094045865768988, "eval_accuracy": 0.8415737152099609, "eval_loss": 0.3900565505027771, "eval_runtime": 566.4852, "eval_samples_per_second": 8.39, "eval_steps_per_second": 2.099, "step": 260 }, { "epoch": 0.11361245529139491, "grad_norm": 0.33946001529693604, "learning_rate": 0.0002545454545454545, "loss": 0.1851, "step": 270 }, { "epoch": 0.11361245529139491, "eval_accuracy": 0.9040606021881104, "eval_loss": 0.2729853093624115, "eval_runtime": 563.3502, "eval_samples_per_second": 8.437, "eval_steps_per_second": 2.111, "step": 270 }, { "epoch": 0.11782032400589101, "grad_norm": 0.7806110978126526, "learning_rate": 0.00024545454545454545, "loss": 0.2655, "step": 280 }, { "epoch": 0.11782032400589101, "eval_accuracy": 0.8645066022872925, "eval_loss": 0.3478306829929352, "eval_runtime": 565.4573, "eval_samples_per_second": 8.406, "eval_steps_per_second": 2.103, "step": 280 }, { "epoch": 0.12202819272038712, "grad_norm": 1.8319846391677856, "learning_rate": 0.00023636363636363636, "loss": 0.2119, "step": 290 }, { "epoch": 0.12202819272038712, "eval_accuracy": 0.9112139940261841, "eval_loss": 0.23500923812389374, "eval_runtime": 563.7148, "eval_samples_per_second": 8.432, "eval_steps_per_second": 2.109, "step": 290 }, { "epoch": 0.12623606143488322, "grad_norm": 1.5416840314865112, "learning_rate": 0.00022727272727272727, "loss": 0.1477, "step": 300 }, { "epoch": 0.12623606143488322, "eval_accuracy": 0.9219440221786499, "eval_loss": 0.3015914261341095, "eval_runtime": 561.7407, "eval_samples_per_second": 8.461, "eval_steps_per_second": 2.117, "step": 300 }, { "epoch": 0.13044393014937933, "grad_norm": 3.576409101486206, "learning_rate": 0.00021818181818181818, "loss": 0.2137, "step": 310 }, { "epoch": 0.13044393014937933, "eval_accuracy": 0.9282558560371399, "eval_loss": 0.30129292607307434, "eval_runtime": 563.5899, "eval_samples_per_second": 8.433, "eval_steps_per_second": 2.11, "step": 310 }, { "epoch": 0.13465179886387546, "grad_norm": 9.926487922668457, "learning_rate": 0.00020909090909090907, "loss": 0.1136, "step": 320 }, { "epoch": 0.13465179886387546, "eval_accuracy": 0.9307805299758911, "eval_loss": 0.2483457773923874, "eval_runtime": 564.5525, "eval_samples_per_second": 8.419, "eval_steps_per_second": 2.106, "step": 320 }, { "epoch": 0.13885966757837157, "grad_norm": 1.2627246379852295, "learning_rate": 0.0002, "loss": 0.197, "step": 330 }, { "epoch": 0.13885966757837157, "eval_accuracy": 0.913528323173523, "eval_loss": 0.3319007456302643, "eval_runtime": 557.94, "eval_samples_per_second": 8.519, "eval_steps_per_second": 2.131, "step": 330 }, { "epoch": 0.14306753629286767, "grad_norm": 14.595890998840332, "learning_rate": 0.00019090909090909092, "loss": 0.1492, "step": 340 }, { "epoch": 0.14306753629286767, "eval_accuracy": 0.8773406147956848, "eval_loss": 0.4927380383014679, "eval_runtime": 565.4025, "eval_samples_per_second": 8.406, "eval_steps_per_second": 2.103, "step": 340 }, { "epoch": 0.14727540500736377, "grad_norm": 0.08015768229961395, "learning_rate": 0.00018181818181818183, "loss": 0.0394, "step": 350 }, { "epoch": 0.14727540500736377, "eval_accuracy": 0.8889122605323792, "eval_loss": 0.5241063833236694, "eval_runtime": 567.4727, "eval_samples_per_second": 8.376, "eval_steps_per_second": 2.095, "step": 350 }, { "epoch": 0.15148327372185988, "grad_norm": 1.9829438924789429, "learning_rate": 0.00017272727272727272, "loss": 0.284, "step": 360 }, { "epoch": 0.15148327372185988, "eval_accuracy": 0.924679160118103, "eval_loss": 0.3283034563064575, "eval_runtime": 565.6368, "eval_samples_per_second": 8.403, "eval_steps_per_second": 2.102, "step": 360 }, { "epoch": 0.15569114243635598, "grad_norm": 0.26033639907836914, "learning_rate": 0.00016363636363636363, "loss": 0.1096, "step": 370 }, { "epoch": 0.15569114243635598, "eval_accuracy": 0.9179465770721436, "eval_loss": 0.2719731330871582, "eval_runtime": 568.9149, "eval_samples_per_second": 8.355, "eval_steps_per_second": 2.09, "step": 370 }, { "epoch": 0.1598990111508521, "grad_norm": 0.4041542112827301, "learning_rate": 0.00015454545454545454, "loss": 0.3147, "step": 380 }, { "epoch": 0.1598990111508521, "eval_accuracy": 0.9389858841896057, "eval_loss": 0.20388783514499664, "eval_runtime": 576.8924, "eval_samples_per_second": 8.239, "eval_steps_per_second": 2.061, "step": 380 }, { "epoch": 0.1641068798653482, "grad_norm": 0.7307087182998657, "learning_rate": 0.00014545454545454546, "loss": 0.1, "step": 390 }, { "epoch": 0.1641068798653482, "eval_accuracy": 0.9320428967475891, "eval_loss": 0.2630753219127655, "eval_runtime": 572.9749, "eval_samples_per_second": 8.295, "eval_steps_per_second": 2.075, "step": 390 }, { "epoch": 0.1683147485798443, "grad_norm": 0.9732351303100586, "learning_rate": 0.00013636363636363637, "loss": 0.1954, "step": 400 }, { "epoch": 0.1683147485798443, "eval_accuracy": 0.93309485912323, "eval_loss": 0.265785813331604, "eval_runtime": 571.5753, "eval_samples_per_second": 8.316, "eval_steps_per_second": 2.08, "step": 400 }, { "epoch": 0.17252261729434043, "grad_norm": 3.890988826751709, "learning_rate": 0.00012727272727272725, "loss": 0.1332, "step": 410 }, { "epoch": 0.17252261729434043, "eval_accuracy": 0.9469808340072632, "eval_loss": 0.2041083574295044, "eval_runtime": 572.7877, "eval_samples_per_second": 8.298, "eval_steps_per_second": 2.076, "step": 410 }, { "epoch": 0.17673048600883653, "grad_norm": 0.22084006667137146, "learning_rate": 0.00011818181818181818, "loss": 0.116, "step": 420 }, { "epoch": 0.17673048600883653, "eval_accuracy": 0.9162634015083313, "eval_loss": 0.3249155879020691, "eval_runtime": 572.8138, "eval_samples_per_second": 8.298, "eval_steps_per_second": 2.076, "step": 420 }, { "epoch": 0.18093835472333264, "grad_norm": 0.09070427715778351, "learning_rate": 0.00010909090909090909, "loss": 0.3625, "step": 430 }, { "epoch": 0.18093835472333264, "eval_accuracy": 0.9541342258453369, "eval_loss": 0.16712911427021027, "eval_runtime": 570.8705, "eval_samples_per_second": 8.326, "eval_steps_per_second": 2.083, "step": 430 }, { "epoch": 0.18514622343782874, "grad_norm": 3.719890594482422, "learning_rate": 0.0001, "loss": 0.203, "step": 440 }, { "epoch": 0.18514622343782874, "eval_accuracy": 0.9591836929321289, "eval_loss": 0.11601703613996506, "eval_runtime": 559.6205, "eval_samples_per_second": 8.493, "eval_steps_per_second": 2.125, "step": 440 }, { "epoch": 0.18935409215232485, "grad_norm": 1.8163635730743408, "learning_rate": 9.090909090909092e-05, "loss": 0.1065, "step": 450 }, { "epoch": 0.18935409215232485, "eval_accuracy": 0.9663370251655579, "eval_loss": 0.11517694592475891, "eval_runtime": 564.4747, "eval_samples_per_second": 8.42, "eval_steps_per_second": 2.106, "step": 450 }, { "epoch": 0.19356196086682095, "grad_norm": 0.489663302898407, "learning_rate": 8.181818181818182e-05, "loss": 0.0797, "step": 460 }, { "epoch": 0.19356196086682095, "eval_accuracy": 0.9524511098861694, "eval_loss": 0.17924398183822632, "eval_runtime": 566.5969, "eval_samples_per_second": 8.389, "eval_steps_per_second": 2.098, "step": 460 }, { "epoch": 0.19776982958131706, "grad_norm": 0.12014146894216537, "learning_rate": 7.272727272727273e-05, "loss": 0.158, "step": 470 }, { "epoch": 0.19776982958131706, "eval_accuracy": 0.9545550346374512, "eval_loss": 0.17613738775253296, "eval_runtime": 565.847, "eval_samples_per_second": 8.4, "eval_steps_per_second": 2.101, "step": 470 }, { "epoch": 0.20197769829581316, "grad_norm": 0.30841195583343506, "learning_rate": 6.363636363636363e-05, "loss": 0.1226, "step": 480 }, { "epoch": 0.20197769829581316, "eval_accuracy": 0.954344630241394, "eval_loss": 0.1746922880411148, "eval_runtime": 566.983, "eval_samples_per_second": 8.383, "eval_steps_per_second": 2.097, "step": 480 }, { "epoch": 0.20618556701030927, "grad_norm": 0.17474113404750824, "learning_rate": 5.4545454545454546e-05, "loss": 0.0502, "step": 490 }, { "epoch": 0.20618556701030927, "eval_accuracy": 0.9596044421195984, "eval_loss": 0.16830527782440186, "eval_runtime": 566.0043, "eval_samples_per_second": 8.397, "eval_steps_per_second": 2.101, "step": 490 }, { "epoch": 0.2103934357248054, "grad_norm": 0.19428321719169617, "learning_rate": 4.545454545454546e-05, "loss": 0.0346, "step": 500 }, { "epoch": 0.2103934357248054, "eval_accuracy": 0.9585524797439575, "eval_loss": 0.17505739629268646, "eval_runtime": 564.2226, "eval_samples_per_second": 8.424, "eval_steps_per_second": 2.107, "step": 500 }, { "epoch": 0.2146013044393015, "grad_norm": 0.13528193533420563, "learning_rate": 3.6363636363636364e-05, "loss": 0.0168, "step": 510 }, { "epoch": 0.2146013044393015, "eval_accuracy": 0.958131730556488, "eval_loss": 0.18199646472930908, "eval_runtime": 563.8378, "eval_samples_per_second": 8.43, "eval_steps_per_second": 2.109, "step": 510 }, { "epoch": 0.2188091731537976, "grad_norm": 0.03537715971469879, "learning_rate": 2.7272727272727273e-05, "loss": 0.1428, "step": 520 }, { "epoch": 0.2188091731537976, "eval_accuracy": 0.9562381505966187, "eval_loss": 0.1906786561012268, "eval_runtime": 567.1515, "eval_samples_per_second": 8.38, "eval_steps_per_second": 2.096, "step": 520 }, { "epoch": 0.22301704186829371, "grad_norm": 0.5100229978561401, "learning_rate": 1.8181818181818182e-05, "loss": 0.0336, "step": 530 }, { "epoch": 0.22301704186829371, "eval_accuracy": 0.9503471255302429, "eval_loss": 0.2207876741886139, "eval_runtime": 564.0513, "eval_samples_per_second": 8.427, "eval_steps_per_second": 2.108, "step": 530 }, { "epoch": 0.22722491058278982, "grad_norm": 5.639271259307861, "learning_rate": 9.090909090909091e-06, "loss": 0.1161, "step": 540 }, { "epoch": 0.22722491058278982, "eval_accuracy": 0.9488744139671326, "eval_loss": 0.2279575914144516, "eval_runtime": 564.6939, "eval_samples_per_second": 8.417, "eval_steps_per_second": 2.106, "step": 540 }, { "epoch": 0.23143277929728592, "grad_norm": 0.09282880276441574, "learning_rate": 0.0, "loss": 0.0377, "step": 550 }, { "epoch": 0.23143277929728592, "eval_accuracy": 0.9495055675506592, "eval_loss": 0.22491338849067688, "eval_runtime": 556.6035, "eval_samples_per_second": 8.539, "eval_steps_per_second": 2.136, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.208452205931052e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }