|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.23143277929728592, |
|
"eval_steps": 10, |
|
"global_step": 550, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004207868714496108, |
|
"grad_norm": 10.497783660888672, |
|
"learning_rate": 0.0004909090909090909, |
|
"loss": 0.7938, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004207868714496108, |
|
"eval_accuracy": 0.5304018259048462, |
|
"eval_loss": 0.7331877946853638, |
|
"eval_runtime": 568.2697, |
|
"eval_samples_per_second": 8.364, |
|
"eval_steps_per_second": 2.092, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008415737428992216, |
|
"grad_norm": 2.557694435119629, |
|
"learning_rate": 0.00048181818181818184, |
|
"loss": 0.7504, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008415737428992216, |
|
"eval_accuracy": 0.5634336471557617, |
|
"eval_loss": 0.7359612584114075, |
|
"eval_runtime": 559.3032, |
|
"eval_samples_per_second": 8.498, |
|
"eval_steps_per_second": 2.126, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012623606143488323, |
|
"grad_norm": 3.9564919471740723, |
|
"learning_rate": 0.0004727272727272727, |
|
"loss": 0.7866, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.012623606143488323, |
|
"eval_accuracy": 0.4085840582847595, |
|
"eval_loss": 0.7127184271812439, |
|
"eval_runtime": 567.0848, |
|
"eval_samples_per_second": 8.381, |
|
"eval_steps_per_second": 2.097, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.016831474857984433, |
|
"grad_norm": 29.89333724975586, |
|
"learning_rate": 0.00046363636363636366, |
|
"loss": 0.678, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.016831474857984433, |
|
"eval_accuracy": 0.6875657439231873, |
|
"eval_loss": 0.6557931303977966, |
|
"eval_runtime": 563.9541, |
|
"eval_samples_per_second": 8.428, |
|
"eval_steps_per_second": 2.108, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.021039343572480537, |
|
"grad_norm": 1.9482946395874023, |
|
"learning_rate": 0.00045454545454545455, |
|
"loss": 0.8634, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.021039343572480537, |
|
"eval_accuracy": 0.6572691202163696, |
|
"eval_loss": 0.613047182559967, |
|
"eval_runtime": 565.9198, |
|
"eval_samples_per_second": 8.399, |
|
"eval_steps_per_second": 2.101, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.025247212286976645, |
|
"grad_norm": 1.0520250797271729, |
|
"learning_rate": 0.00044545454545454543, |
|
"loss": 0.5117, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.025247212286976645, |
|
"eval_accuracy": 0.7416368722915649, |
|
"eval_loss": 0.5154983997344971, |
|
"eval_runtime": 566.7191, |
|
"eval_samples_per_second": 8.387, |
|
"eval_steps_per_second": 2.098, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.029455081001472753, |
|
"grad_norm": 3.958573341369629, |
|
"learning_rate": 0.00043636363636363637, |
|
"loss": 0.6066, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.029455081001472753, |
|
"eval_accuracy": 0.7858194708824158, |
|
"eval_loss": 0.5255141854286194, |
|
"eval_runtime": 561.6163, |
|
"eval_samples_per_second": 8.463, |
|
"eval_steps_per_second": 2.117, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.033662949715968865, |
|
"grad_norm": 7.255526542663574, |
|
"learning_rate": 0.00042727272727272726, |
|
"loss": 0.5282, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.033662949715968865, |
|
"eval_accuracy": 0.8045445084571838, |
|
"eval_loss": 0.6335883140563965, |
|
"eval_runtime": 565.1826, |
|
"eval_samples_per_second": 8.41, |
|
"eval_steps_per_second": 2.104, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03787081843046497, |
|
"grad_norm": 4.439815521240234, |
|
"learning_rate": 0.00041818181818181814, |
|
"loss": 0.3694, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03787081843046497, |
|
"eval_accuracy": 0.8161161541938782, |
|
"eval_loss": 0.464495450258255, |
|
"eval_runtime": 562.0768, |
|
"eval_samples_per_second": 8.456, |
|
"eval_steps_per_second": 2.115, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.042078687144961074, |
|
"grad_norm": 1.75038743019104, |
|
"learning_rate": 0.00040909090909090913, |
|
"loss": 0.4786, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.042078687144961074, |
|
"eval_accuracy": 0.8079107999801636, |
|
"eval_loss": 0.4171961545944214, |
|
"eval_runtime": 555.7896, |
|
"eval_samples_per_second": 8.552, |
|
"eval_steps_per_second": 2.139, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.046286555859457186, |
|
"grad_norm": 2.398599624633789, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3394, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.046286555859457186, |
|
"eval_accuracy": 0.8739743232727051, |
|
"eval_loss": 0.353726327419281, |
|
"eval_runtime": 558.2722, |
|
"eval_samples_per_second": 8.514, |
|
"eval_steps_per_second": 2.13, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05049442457395329, |
|
"grad_norm": 2.4667985439300537, |
|
"learning_rate": 0.00039090909090909096, |
|
"loss": 0.4313, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05049442457395329, |
|
"eval_accuracy": 0.8889122605323792, |
|
"eval_loss": 0.2820029556751251, |
|
"eval_runtime": 563.3341, |
|
"eval_samples_per_second": 8.437, |
|
"eval_steps_per_second": 2.111, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0547022932884494, |
|
"grad_norm": 1.1293903589248657, |
|
"learning_rate": 0.00038181818181818184, |
|
"loss": 0.3352, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0547022932884494, |
|
"eval_accuracy": 0.8889122605323792, |
|
"eval_loss": 0.28811973333358765, |
|
"eval_runtime": 563.2561, |
|
"eval_samples_per_second": 8.438, |
|
"eval_steps_per_second": 2.111, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05891016200294551, |
|
"grad_norm": 17.52765655517578, |
|
"learning_rate": 0.00037272727272727273, |
|
"loss": 0.3772, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05891016200294551, |
|
"eval_accuracy": 0.8116979002952576, |
|
"eval_loss": 0.4701159596443176, |
|
"eval_runtime": 565.5511, |
|
"eval_samples_per_second": 8.404, |
|
"eval_steps_per_second": 2.102, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06311803071744161, |
|
"grad_norm": 2.1096954345703125, |
|
"learning_rate": 0.00036363636363636367, |
|
"loss": 0.3699, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06311803071744161, |
|
"eval_accuracy": 0.8693456649780273, |
|
"eval_loss": 0.32162848114967346, |
|
"eval_runtime": 561.7642, |
|
"eval_samples_per_second": 8.461, |
|
"eval_steps_per_second": 2.117, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06732589943193773, |
|
"grad_norm": 0.7017167210578918, |
|
"learning_rate": 0.00035454545454545455, |
|
"loss": 0.4895, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06732589943193773, |
|
"eval_accuracy": 0.8817588686943054, |
|
"eval_loss": 0.36469992995262146, |
|
"eval_runtime": 563.6361, |
|
"eval_samples_per_second": 8.433, |
|
"eval_steps_per_second": 2.11, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07153376814643383, |
|
"grad_norm": 0.48568668961524963, |
|
"learning_rate": 0.00034545454545454544, |
|
"loss": 0.46, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07153376814643383, |
|
"eval_accuracy": 0.8449400663375854, |
|
"eval_loss": 0.35531896352767944, |
|
"eval_runtime": 566.3467, |
|
"eval_samples_per_second": 8.392, |
|
"eval_steps_per_second": 2.099, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07574163686092994, |
|
"grad_norm": 0.6846579909324646, |
|
"learning_rate": 0.0003363636363636364, |
|
"loss": 0.3931, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07574163686092994, |
|
"eval_accuracy": 0.8916473984718323, |
|
"eval_loss": 0.2812640070915222, |
|
"eval_runtime": 565.8446, |
|
"eval_samples_per_second": 8.4, |
|
"eval_steps_per_second": 2.101, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07994950557542604, |
|
"grad_norm": 2.391395092010498, |
|
"learning_rate": 0.00032727272727272726, |
|
"loss": 0.3666, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07994950557542604, |
|
"eval_accuracy": 0.8321060538291931, |
|
"eval_loss": 0.4278256595134735, |
|
"eval_runtime": 562.6059, |
|
"eval_samples_per_second": 8.448, |
|
"eval_steps_per_second": 2.113, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08415737428992215, |
|
"grad_norm": 1.708675503730774, |
|
"learning_rate": 0.0003181818181818182, |
|
"loss": 0.3471, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08415737428992215, |
|
"eval_accuracy": 0.9156322479248047, |
|
"eval_loss": 0.23789770901203156, |
|
"eval_runtime": 565.4642, |
|
"eval_samples_per_second": 8.405, |
|
"eval_steps_per_second": 2.103, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08836524300441827, |
|
"grad_norm": 0.4107959270477295, |
|
"learning_rate": 0.0003090909090909091, |
|
"loss": 0.4351, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08836524300441827, |
|
"eval_accuracy": 0.8889122605323792, |
|
"eval_loss": 0.29446446895599365, |
|
"eval_runtime": 562.2329, |
|
"eval_samples_per_second": 8.454, |
|
"eval_steps_per_second": 2.115, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09257311171891437, |
|
"grad_norm": 10.296160697937012, |
|
"learning_rate": 0.0003, |
|
"loss": 0.262, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09257311171891437, |
|
"eval_accuracy": 0.9017462730407715, |
|
"eval_loss": 0.307280033826828, |
|
"eval_runtime": 561.3167, |
|
"eval_samples_per_second": 8.468, |
|
"eval_steps_per_second": 2.118, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09678098043341048, |
|
"grad_norm": 0.6356618404388428, |
|
"learning_rate": 0.0002909090909090909, |
|
"loss": 0.3291, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.09678098043341048, |
|
"eval_accuracy": 0.8752366900444031, |
|
"eval_loss": 0.3042367994785309, |
|
"eval_runtime": 567.4141, |
|
"eval_samples_per_second": 8.377, |
|
"eval_steps_per_second": 2.095, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.10098884914790658, |
|
"grad_norm": 1.1508342027664185, |
|
"learning_rate": 0.0002818181818181818, |
|
"loss": 0.239, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.10098884914790658, |
|
"eval_accuracy": 0.9150010347366333, |
|
"eval_loss": 0.20924389362335205, |
|
"eval_runtime": 564.144, |
|
"eval_samples_per_second": 8.425, |
|
"eval_steps_per_second": 2.108, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1051967178624027, |
|
"grad_norm": 1.0988367795944214, |
|
"learning_rate": 0.00027272727272727274, |
|
"loss": 0.4306, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1051967178624027, |
|
"eval_accuracy": 0.905954122543335, |
|
"eval_loss": 0.24196282029151917, |
|
"eval_runtime": 565.5038, |
|
"eval_samples_per_second": 8.405, |
|
"eval_steps_per_second": 2.103, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1094045865768988, |
|
"grad_norm": 1.5871909856796265, |
|
"learning_rate": 0.0002636363636363636, |
|
"loss": 0.2757, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1094045865768988, |
|
"eval_accuracy": 0.8415737152099609, |
|
"eval_loss": 0.3900565505027771, |
|
"eval_runtime": 566.4852, |
|
"eval_samples_per_second": 8.39, |
|
"eval_steps_per_second": 2.099, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.11361245529139491, |
|
"grad_norm": 0.33946001529693604, |
|
"learning_rate": 0.0002545454545454545, |
|
"loss": 0.1851, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11361245529139491, |
|
"eval_accuracy": 0.9040606021881104, |
|
"eval_loss": 0.2729853093624115, |
|
"eval_runtime": 563.3502, |
|
"eval_samples_per_second": 8.437, |
|
"eval_steps_per_second": 2.111, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11782032400589101, |
|
"grad_norm": 0.7806110978126526, |
|
"learning_rate": 0.00024545454545454545, |
|
"loss": 0.2655, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.11782032400589101, |
|
"eval_accuracy": 0.8645066022872925, |
|
"eval_loss": 0.3478306829929352, |
|
"eval_runtime": 565.4573, |
|
"eval_samples_per_second": 8.406, |
|
"eval_steps_per_second": 2.103, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.12202819272038712, |
|
"grad_norm": 1.8319846391677856, |
|
"learning_rate": 0.00023636363636363636, |
|
"loss": 0.2119, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12202819272038712, |
|
"eval_accuracy": 0.9112139940261841, |
|
"eval_loss": 0.23500923812389374, |
|
"eval_runtime": 563.7148, |
|
"eval_samples_per_second": 8.432, |
|
"eval_steps_per_second": 2.109, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12623606143488322, |
|
"grad_norm": 1.5416840314865112, |
|
"learning_rate": 0.00022727272727272727, |
|
"loss": 0.1477, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12623606143488322, |
|
"eval_accuracy": 0.9219440221786499, |
|
"eval_loss": 0.3015914261341095, |
|
"eval_runtime": 561.7407, |
|
"eval_samples_per_second": 8.461, |
|
"eval_steps_per_second": 2.117, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13044393014937933, |
|
"grad_norm": 3.576409101486206, |
|
"learning_rate": 0.00021818181818181818, |
|
"loss": 0.2137, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13044393014937933, |
|
"eval_accuracy": 0.9282558560371399, |
|
"eval_loss": 0.30129292607307434, |
|
"eval_runtime": 563.5899, |
|
"eval_samples_per_second": 8.433, |
|
"eval_steps_per_second": 2.11, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13465179886387546, |
|
"grad_norm": 9.926487922668457, |
|
"learning_rate": 0.00020909090909090907, |
|
"loss": 0.1136, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13465179886387546, |
|
"eval_accuracy": 0.9307805299758911, |
|
"eval_loss": 0.2483457773923874, |
|
"eval_runtime": 564.5525, |
|
"eval_samples_per_second": 8.419, |
|
"eval_steps_per_second": 2.106, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13885966757837157, |
|
"grad_norm": 1.2627246379852295, |
|
"learning_rate": 0.0002, |
|
"loss": 0.197, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.13885966757837157, |
|
"eval_accuracy": 0.913528323173523, |
|
"eval_loss": 0.3319007456302643, |
|
"eval_runtime": 557.94, |
|
"eval_samples_per_second": 8.519, |
|
"eval_steps_per_second": 2.131, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.14306753629286767, |
|
"grad_norm": 14.595890998840332, |
|
"learning_rate": 0.00019090909090909092, |
|
"loss": 0.1492, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14306753629286767, |
|
"eval_accuracy": 0.8773406147956848, |
|
"eval_loss": 0.4927380383014679, |
|
"eval_runtime": 565.4025, |
|
"eval_samples_per_second": 8.406, |
|
"eval_steps_per_second": 2.103, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14727540500736377, |
|
"grad_norm": 0.08015768229961395, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.0394, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.14727540500736377, |
|
"eval_accuracy": 0.8889122605323792, |
|
"eval_loss": 0.5241063833236694, |
|
"eval_runtime": 567.4727, |
|
"eval_samples_per_second": 8.376, |
|
"eval_steps_per_second": 2.095, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.15148327372185988, |
|
"grad_norm": 1.9829438924789429, |
|
"learning_rate": 0.00017272727272727272, |
|
"loss": 0.284, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15148327372185988, |
|
"eval_accuracy": 0.924679160118103, |
|
"eval_loss": 0.3283034563064575, |
|
"eval_runtime": 565.6368, |
|
"eval_samples_per_second": 8.403, |
|
"eval_steps_per_second": 2.102, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15569114243635598, |
|
"grad_norm": 0.26033639907836914, |
|
"learning_rate": 0.00016363636363636363, |
|
"loss": 0.1096, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.15569114243635598, |
|
"eval_accuracy": 0.9179465770721436, |
|
"eval_loss": 0.2719731330871582, |
|
"eval_runtime": 568.9149, |
|
"eval_samples_per_second": 8.355, |
|
"eval_steps_per_second": 2.09, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1598990111508521, |
|
"grad_norm": 0.4041542112827301, |
|
"learning_rate": 0.00015454545454545454, |
|
"loss": 0.3147, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1598990111508521, |
|
"eval_accuracy": 0.9389858841896057, |
|
"eval_loss": 0.20388783514499664, |
|
"eval_runtime": 576.8924, |
|
"eval_samples_per_second": 8.239, |
|
"eval_steps_per_second": 2.061, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1641068798653482, |
|
"grad_norm": 0.7307087182998657, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.1, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1641068798653482, |
|
"eval_accuracy": 0.9320428967475891, |
|
"eval_loss": 0.2630753219127655, |
|
"eval_runtime": 572.9749, |
|
"eval_samples_per_second": 8.295, |
|
"eval_steps_per_second": 2.075, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1683147485798443, |
|
"grad_norm": 0.9732351303100586, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 0.1954, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1683147485798443, |
|
"eval_accuracy": 0.93309485912323, |
|
"eval_loss": 0.265785813331604, |
|
"eval_runtime": 571.5753, |
|
"eval_samples_per_second": 8.316, |
|
"eval_steps_per_second": 2.08, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.17252261729434043, |
|
"grad_norm": 3.890988826751709, |
|
"learning_rate": 0.00012727272727272725, |
|
"loss": 0.1332, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17252261729434043, |
|
"eval_accuracy": 0.9469808340072632, |
|
"eval_loss": 0.2041083574295044, |
|
"eval_runtime": 572.7877, |
|
"eval_samples_per_second": 8.298, |
|
"eval_steps_per_second": 2.076, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17673048600883653, |
|
"grad_norm": 0.22084006667137146, |
|
"learning_rate": 0.00011818181818181818, |
|
"loss": 0.116, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.17673048600883653, |
|
"eval_accuracy": 0.9162634015083313, |
|
"eval_loss": 0.3249155879020691, |
|
"eval_runtime": 572.8138, |
|
"eval_samples_per_second": 8.298, |
|
"eval_steps_per_second": 2.076, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.18093835472333264, |
|
"grad_norm": 0.09070427715778351, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.3625, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18093835472333264, |
|
"eval_accuracy": 0.9541342258453369, |
|
"eval_loss": 0.16712911427021027, |
|
"eval_runtime": 570.8705, |
|
"eval_samples_per_second": 8.326, |
|
"eval_steps_per_second": 2.083, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18514622343782874, |
|
"grad_norm": 3.719890594482422, |
|
"learning_rate": 0.0001, |
|
"loss": 0.203, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18514622343782874, |
|
"eval_accuracy": 0.9591836929321289, |
|
"eval_loss": 0.11601703613996506, |
|
"eval_runtime": 559.6205, |
|
"eval_samples_per_second": 8.493, |
|
"eval_steps_per_second": 2.125, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18935409215232485, |
|
"grad_norm": 1.8163635730743408, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.1065, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.18935409215232485, |
|
"eval_accuracy": 0.9663370251655579, |
|
"eval_loss": 0.11517694592475891, |
|
"eval_runtime": 564.4747, |
|
"eval_samples_per_second": 8.42, |
|
"eval_steps_per_second": 2.106, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.19356196086682095, |
|
"grad_norm": 0.489663302898407, |
|
"learning_rate": 8.181818181818182e-05, |
|
"loss": 0.0797, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19356196086682095, |
|
"eval_accuracy": 0.9524511098861694, |
|
"eval_loss": 0.17924398183822632, |
|
"eval_runtime": 566.5969, |
|
"eval_samples_per_second": 8.389, |
|
"eval_steps_per_second": 2.098, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19776982958131706, |
|
"grad_norm": 0.12014146894216537, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.158, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.19776982958131706, |
|
"eval_accuracy": 0.9545550346374512, |
|
"eval_loss": 0.17613738775253296, |
|
"eval_runtime": 565.847, |
|
"eval_samples_per_second": 8.4, |
|
"eval_steps_per_second": 2.101, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.20197769829581316, |
|
"grad_norm": 0.30841195583343506, |
|
"learning_rate": 6.363636363636363e-05, |
|
"loss": 0.1226, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20197769829581316, |
|
"eval_accuracy": 0.954344630241394, |
|
"eval_loss": 0.1746922880411148, |
|
"eval_runtime": 566.983, |
|
"eval_samples_per_second": 8.383, |
|
"eval_steps_per_second": 2.097, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.17474113404750824, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.0502, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"eval_accuracy": 0.9596044421195984, |
|
"eval_loss": 0.16830527782440186, |
|
"eval_runtime": 566.0043, |
|
"eval_samples_per_second": 8.397, |
|
"eval_steps_per_second": 2.101, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2103934357248054, |
|
"grad_norm": 0.19428321719169617, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.0346, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2103934357248054, |
|
"eval_accuracy": 0.9585524797439575, |
|
"eval_loss": 0.17505739629268646, |
|
"eval_runtime": 564.2226, |
|
"eval_samples_per_second": 8.424, |
|
"eval_steps_per_second": 2.107, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2146013044393015, |
|
"grad_norm": 0.13528193533420563, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.0168, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2146013044393015, |
|
"eval_accuracy": 0.958131730556488, |
|
"eval_loss": 0.18199646472930908, |
|
"eval_runtime": 563.8378, |
|
"eval_samples_per_second": 8.43, |
|
"eval_steps_per_second": 2.109, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2188091731537976, |
|
"grad_norm": 0.03537715971469879, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 0.1428, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2188091731537976, |
|
"eval_accuracy": 0.9562381505966187, |
|
"eval_loss": 0.1906786561012268, |
|
"eval_runtime": 567.1515, |
|
"eval_samples_per_second": 8.38, |
|
"eval_steps_per_second": 2.096, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.22301704186829371, |
|
"grad_norm": 0.5100229978561401, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.0336, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22301704186829371, |
|
"eval_accuracy": 0.9503471255302429, |
|
"eval_loss": 0.2207876741886139, |
|
"eval_runtime": 564.0513, |
|
"eval_samples_per_second": 8.427, |
|
"eval_steps_per_second": 2.108, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22722491058278982, |
|
"grad_norm": 5.639271259307861, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 0.1161, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.22722491058278982, |
|
"eval_accuracy": 0.9488744139671326, |
|
"eval_loss": 0.2279575914144516, |
|
"eval_runtime": 564.6939, |
|
"eval_samples_per_second": 8.417, |
|
"eval_steps_per_second": 2.106, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.23143277929728592, |
|
"grad_norm": 0.09282880276441574, |
|
"learning_rate": 0.0, |
|
"loss": 0.0377, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.23143277929728592, |
|
"eval_accuracy": 0.9495055675506592, |
|
"eval_loss": 0.22491338849067688, |
|
"eval_runtime": 556.6035, |
|
"eval_samples_per_second": 8.539, |
|
"eval_steps_per_second": 2.136, |
|
"step": 550 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 550, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.208452205931052e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|