|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.23143277929728592, |
|
"eval_steps": 10, |
|
"global_step": 550, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004207868714496108, |
|
"grad_norm": 3.7386245727539062, |
|
"learning_rate": 0.00019636363636363636, |
|
"loss": 0.6798, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004207868714496108, |
|
"eval_accuracy": 0.6183463335037231, |
|
"eval_loss": 0.6594762802124023, |
|
"eval_runtime": 597.7321, |
|
"eval_samples_per_second": 7.952, |
|
"eval_steps_per_second": 1.989, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008415737428992216, |
|
"grad_norm": 23.72800636291504, |
|
"learning_rate": 0.00019272727272727274, |
|
"loss": 0.734, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008415737428992216, |
|
"eval_accuracy": 0.6175047159194946, |
|
"eval_loss": 0.6504878401756287, |
|
"eval_runtime": 581.0465, |
|
"eval_samples_per_second": 8.18, |
|
"eval_steps_per_second": 2.046, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012623606143488323, |
|
"grad_norm": 9.25478458404541, |
|
"learning_rate": 0.0001890909090909091, |
|
"loss": 0.6702, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.012623606143488323, |
|
"eval_accuracy": 0.6919839978218079, |
|
"eval_loss": 0.5777503252029419, |
|
"eval_runtime": 578.0228, |
|
"eval_samples_per_second": 8.223, |
|
"eval_steps_per_second": 2.057, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.016831474857984433, |
|
"grad_norm": 7.304983615875244, |
|
"learning_rate": 0.00018545454545454545, |
|
"loss": 0.4556, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.016831474857984433, |
|
"eval_accuracy": 0.8396801948547363, |
|
"eval_loss": 0.40889662504196167, |
|
"eval_runtime": 571.8895, |
|
"eval_samples_per_second": 8.311, |
|
"eval_steps_per_second": 2.079, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.021039343572480537, |
|
"grad_norm": 7.751391887664795, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.569, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.021039343572480537, |
|
"eval_accuracy": 0.7944456338882446, |
|
"eval_loss": 0.4470515251159668, |
|
"eval_runtime": 573.7399, |
|
"eval_samples_per_second": 8.284, |
|
"eval_steps_per_second": 2.072, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.025247212286976645, |
|
"grad_norm": 1.6625334024429321, |
|
"learning_rate": 0.0001781818181818182, |
|
"loss": 0.4117, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.025247212286976645, |
|
"eval_accuracy": 0.8788133859634399, |
|
"eval_loss": 0.2980070412158966, |
|
"eval_runtime": 568.2097, |
|
"eval_samples_per_second": 8.365, |
|
"eval_steps_per_second": 2.093, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.029455081001472753, |
|
"grad_norm": 3.0882933139801025, |
|
"learning_rate": 0.00017454545454545454, |
|
"loss": 0.4465, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.029455081001472753, |
|
"eval_accuracy": 0.8990111351013184, |
|
"eval_loss": 0.24757108092308044, |
|
"eval_runtime": 561.4024, |
|
"eval_samples_per_second": 8.466, |
|
"eval_steps_per_second": 2.118, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.033662949715968865, |
|
"grad_norm": 1.2464115619659424, |
|
"learning_rate": 0.0001709090909090909, |
|
"loss": 0.4248, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.033662949715968865, |
|
"eval_accuracy": 0.9202609062194824, |
|
"eval_loss": 0.2152455598115921, |
|
"eval_runtime": 561.7534, |
|
"eval_samples_per_second": 8.461, |
|
"eval_steps_per_second": 2.117, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03787081843046497, |
|
"grad_norm": 3.7030742168426514, |
|
"learning_rate": 0.00016727272727272728, |
|
"loss": 0.2108, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03787081843046497, |
|
"eval_accuracy": 0.9377235174179077, |
|
"eval_loss": 0.18127086758613586, |
|
"eval_runtime": 560.5533, |
|
"eval_samples_per_second": 8.479, |
|
"eval_steps_per_second": 2.121, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.042078687144961074, |
|
"grad_norm": 3.4107439517974854, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 0.3354, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.042078687144961074, |
|
"eval_accuracy": 0.9215232729911804, |
|
"eval_loss": 0.19136299192905426, |
|
"eval_runtime": 554.2596, |
|
"eval_samples_per_second": 8.575, |
|
"eval_steps_per_second": 2.145, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.046286555859457186, |
|
"grad_norm": 9.938206672668457, |
|
"learning_rate": 0.00016, |
|
"loss": 0.2655, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.046286555859457186, |
|
"eval_accuracy": 0.9354091882705688, |
|
"eval_loss": 0.1784314066171646, |
|
"eval_runtime": 556.5406, |
|
"eval_samples_per_second": 8.54, |
|
"eval_steps_per_second": 2.136, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05049442457395329, |
|
"grad_norm": 7.809841156005859, |
|
"learning_rate": 0.00015636363636363637, |
|
"loss": 0.3885, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05049442457395329, |
|
"eval_accuracy": 0.8979591727256775, |
|
"eval_loss": 0.2238394021987915, |
|
"eval_runtime": 558.4451, |
|
"eval_samples_per_second": 8.511, |
|
"eval_steps_per_second": 2.129, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0547022932884494, |
|
"grad_norm": 0.18238992989063263, |
|
"learning_rate": 0.00015272727272727275, |
|
"loss": 0.1268, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0547022932884494, |
|
"eval_accuracy": 0.965285062789917, |
|
"eval_loss": 0.08994818478822708, |
|
"eval_runtime": 564.2804, |
|
"eval_samples_per_second": 8.423, |
|
"eval_steps_per_second": 2.107, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05891016200294551, |
|
"grad_norm": 21.266611099243164, |
|
"learning_rate": 0.0001490909090909091, |
|
"loss": 0.222, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05891016200294551, |
|
"eval_accuracy": 0.9591836929321289, |
|
"eval_loss": 0.2074023336172104, |
|
"eval_runtime": 563.4722, |
|
"eval_samples_per_second": 8.435, |
|
"eval_steps_per_second": 2.11, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06311803071744161, |
|
"grad_norm": 11.509130477905273, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.4098, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06311803071744161, |
|
"eval_accuracy": 0.8276877999305725, |
|
"eval_loss": 0.7424523234367371, |
|
"eval_runtime": 560.1213, |
|
"eval_samples_per_second": 8.486, |
|
"eval_steps_per_second": 2.123, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06732589943193773, |
|
"grad_norm": 7.520476818084717, |
|
"learning_rate": 0.00014181818181818184, |
|
"loss": 0.4331, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06732589943193773, |
|
"eval_accuracy": 0.9326741099357605, |
|
"eval_loss": 0.2065199315547943, |
|
"eval_runtime": 562.1855, |
|
"eval_samples_per_second": 8.455, |
|
"eval_steps_per_second": 2.115, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07153376814643383, |
|
"grad_norm": 12.3318510055542, |
|
"learning_rate": 0.0001381818181818182, |
|
"loss": 0.6336, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07153376814643383, |
|
"eval_accuracy": 0.859036386013031, |
|
"eval_loss": 0.29903730750083923, |
|
"eval_runtime": 564.8753, |
|
"eval_samples_per_second": 8.414, |
|
"eval_steps_per_second": 2.105, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07574163686092994, |
|
"grad_norm": 7.766119956970215, |
|
"learning_rate": 0.00013454545454545455, |
|
"loss": 0.2881, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07574163686092994, |
|
"eval_accuracy": 0.948032796382904, |
|
"eval_loss": 0.11360938847064972, |
|
"eval_runtime": 565.4404, |
|
"eval_samples_per_second": 8.406, |
|
"eval_steps_per_second": 2.103, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07994950557542604, |
|
"grad_norm": 6.830145359039307, |
|
"learning_rate": 0.00013090909090909093, |
|
"loss": 0.277, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07994950557542604, |
|
"eval_accuracy": 0.9400378465652466, |
|
"eval_loss": 0.209593266248703, |
|
"eval_runtime": 562.6503, |
|
"eval_samples_per_second": 8.448, |
|
"eval_steps_per_second": 2.113, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08415737428992215, |
|
"grad_norm": 4.35469913482666, |
|
"learning_rate": 0.00012727272727272728, |
|
"loss": 0.1586, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08415737428992215, |
|
"eval_accuracy": 0.9337260723114014, |
|
"eval_loss": 0.23578013479709625, |
|
"eval_runtime": 562.1129, |
|
"eval_samples_per_second": 8.456, |
|
"eval_steps_per_second": 2.115, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08836524300441827, |
|
"grad_norm": 2.565586566925049, |
|
"learning_rate": 0.00012363636363636364, |
|
"loss": 0.1664, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08836524300441827, |
|
"eval_accuracy": 0.967809796333313, |
|
"eval_loss": 0.11096801608800888, |
|
"eval_runtime": 556.8442, |
|
"eval_samples_per_second": 8.536, |
|
"eval_steps_per_second": 2.135, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09257311171891437, |
|
"grad_norm": 12.48508358001709, |
|
"learning_rate": 0.00012, |
|
"loss": 0.3419, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09257311171891437, |
|
"eval_accuracy": 0.9421418309211731, |
|
"eval_loss": 0.14782342314720154, |
|
"eval_runtime": 560.1835, |
|
"eval_samples_per_second": 8.485, |
|
"eval_steps_per_second": 2.123, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09678098043341048, |
|
"grad_norm": 10.233806610107422, |
|
"learning_rate": 0.00011636363636363636, |
|
"loss": 0.187, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.09678098043341048, |
|
"eval_accuracy": 0.9579213261604309, |
|
"eval_loss": 0.11080725491046906, |
|
"eval_runtime": 563.1087, |
|
"eval_samples_per_second": 8.441, |
|
"eval_steps_per_second": 2.111, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.10098884914790658, |
|
"grad_norm": 0.16775061190128326, |
|
"learning_rate": 0.00011272727272727272, |
|
"loss": 0.1834, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.10098884914790658, |
|
"eval_accuracy": 0.8727119565010071, |
|
"eval_loss": 0.44505125284194946, |
|
"eval_runtime": 564.6564, |
|
"eval_samples_per_second": 8.418, |
|
"eval_steps_per_second": 2.106, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1051967178624027, |
|
"grad_norm": 5.313834190368652, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.2383, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1051967178624027, |
|
"eval_accuracy": 0.9726488590240479, |
|
"eval_loss": 0.07795428484678268, |
|
"eval_runtime": 565.2854, |
|
"eval_samples_per_second": 8.408, |
|
"eval_steps_per_second": 2.103, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1094045865768988, |
|
"grad_norm": 0.07698975503444672, |
|
"learning_rate": 0.00010545454545454545, |
|
"loss": 0.2157, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1094045865768988, |
|
"eval_accuracy": 0.9758047461509705, |
|
"eval_loss": 0.0759287104010582, |
|
"eval_runtime": 564.058, |
|
"eval_samples_per_second": 8.426, |
|
"eval_steps_per_second": 2.108, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.11361245529139491, |
|
"grad_norm": 0.13711199164390564, |
|
"learning_rate": 0.00010181818181818181, |
|
"loss": 0.2665, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11361245529139491, |
|
"eval_accuracy": 0.9787502884864807, |
|
"eval_loss": 0.06089087575674057, |
|
"eval_runtime": 563.7962, |
|
"eval_samples_per_second": 8.43, |
|
"eval_steps_per_second": 2.109, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11782032400589101, |
|
"grad_norm": 0.17997244000434875, |
|
"learning_rate": 9.818181818181818e-05, |
|
"loss": 0.1226, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.11782032400589101, |
|
"eval_accuracy": 0.9808542132377625, |
|
"eval_loss": 0.052085429430007935, |
|
"eval_runtime": 564.5871, |
|
"eval_samples_per_second": 8.419, |
|
"eval_steps_per_second": 2.106, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.12202819272038712, |
|
"grad_norm": 0.1597593128681183, |
|
"learning_rate": 9.454545454545455e-05, |
|
"loss": 0.1287, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12202819272038712, |
|
"eval_accuracy": 0.9558174014091492, |
|
"eval_loss": 0.09927653521299362, |
|
"eval_runtime": 563.2882, |
|
"eval_samples_per_second": 8.438, |
|
"eval_steps_per_second": 2.111, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12623606143488322, |
|
"grad_norm": 14.694509506225586, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.1088, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12623606143488322, |
|
"eval_accuracy": 0.9309909343719482, |
|
"eval_loss": 0.19473901391029358, |
|
"eval_runtime": 565.488, |
|
"eval_samples_per_second": 8.405, |
|
"eval_steps_per_second": 2.103, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13044393014937933, |
|
"grad_norm": 4.671857833862305, |
|
"learning_rate": 8.727272727272727e-05, |
|
"loss": 0.1778, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13044393014937933, |
|
"eval_accuracy": 0.9814853668212891, |
|
"eval_loss": 0.057908281683921814, |
|
"eval_runtime": 564.4128, |
|
"eval_samples_per_second": 8.421, |
|
"eval_steps_per_second": 2.107, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13465179886387546, |
|
"grad_norm": 1.1494044065475464, |
|
"learning_rate": 8.363636363636364e-05, |
|
"loss": 0.1686, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13465179886387546, |
|
"eval_accuracy": 0.9798022508621216, |
|
"eval_loss": 0.06721032410860062, |
|
"eval_runtime": 563.963, |
|
"eval_samples_per_second": 8.428, |
|
"eval_steps_per_second": 2.108, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13885966757837157, |
|
"grad_norm": 20.857423782348633, |
|
"learning_rate": 8e-05, |
|
"loss": 0.19, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.13885966757837157, |
|
"eval_accuracy": 0.9793814420700073, |
|
"eval_loss": 0.07669834792613983, |
|
"eval_runtime": 558.0764, |
|
"eval_samples_per_second": 8.517, |
|
"eval_steps_per_second": 2.131, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.14306753629286767, |
|
"grad_norm": 0.02689620479941368, |
|
"learning_rate": 7.636363636363637e-05, |
|
"loss": 0.0889, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14306753629286767, |
|
"eval_accuracy": 0.9776982665061951, |
|
"eval_loss": 0.0889260396361351, |
|
"eval_runtime": 561.1157, |
|
"eval_samples_per_second": 8.471, |
|
"eval_steps_per_second": 2.119, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14727540500736377, |
|
"grad_norm": 0.01651788130402565, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.0046, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.14727540500736377, |
|
"eval_accuracy": 0.9774879217147827, |
|
"eval_loss": 0.0977482944726944, |
|
"eval_runtime": 557.4467, |
|
"eval_samples_per_second": 8.526, |
|
"eval_steps_per_second": 2.133, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.15148327372185988, |
|
"grad_norm": 0.49462929368019104, |
|
"learning_rate": 6.90909090909091e-05, |
|
"loss": 0.2327, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15148327372185988, |
|
"eval_accuracy": 0.9833789467811584, |
|
"eval_loss": 0.05215293914079666, |
|
"eval_runtime": 559.0767, |
|
"eval_samples_per_second": 8.502, |
|
"eval_steps_per_second": 2.127, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15569114243635598, |
|
"grad_norm": 1.9083374738693237, |
|
"learning_rate": 6.545454545454546e-05, |
|
"loss": 0.021, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.15569114243635598, |
|
"eval_accuracy": 0.9629707336425781, |
|
"eval_loss": 0.11694369465112686, |
|
"eval_runtime": 559.3448, |
|
"eval_samples_per_second": 8.497, |
|
"eval_steps_per_second": 2.126, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1598990111508521, |
|
"grad_norm": 23.373567581176758, |
|
"learning_rate": 6.181818181818182e-05, |
|
"loss": 0.2001, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1598990111508521, |
|
"eval_accuracy": 0.9829581379890442, |
|
"eval_loss": 0.060037508606910706, |
|
"eval_runtime": 560.337, |
|
"eval_samples_per_second": 8.482, |
|
"eval_steps_per_second": 2.122, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1641068798653482, |
|
"grad_norm": 0.10927582532167435, |
|
"learning_rate": 5.818181818181818e-05, |
|
"loss": 0.0711, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1641068798653482, |
|
"eval_accuracy": 0.9859036207199097, |
|
"eval_loss": 0.04903951659798622, |
|
"eval_runtime": 555.1166, |
|
"eval_samples_per_second": 8.562, |
|
"eval_steps_per_second": 2.142, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1683147485798443, |
|
"grad_norm": 1.898078203201294, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.0983, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1683147485798443, |
|
"eval_accuracy": 0.9831685423851013, |
|
"eval_loss": 0.045510243624448776, |
|
"eval_runtime": 558.1218, |
|
"eval_samples_per_second": 8.516, |
|
"eval_steps_per_second": 2.13, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.17252261729434043, |
|
"grad_norm": 0.12654219567775726, |
|
"learning_rate": 5.090909090909091e-05, |
|
"loss": 0.0548, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17252261729434043, |
|
"eval_accuracy": 0.9888491630554199, |
|
"eval_loss": 0.04250233247876167, |
|
"eval_runtime": 558.6685, |
|
"eval_samples_per_second": 8.508, |
|
"eval_steps_per_second": 2.128, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17673048600883653, |
|
"grad_norm": 2.7145519256591797, |
|
"learning_rate": 4.7272727272727275e-05, |
|
"loss": 0.0505, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.17673048600883653, |
|
"eval_accuracy": 0.9880075454711914, |
|
"eval_loss": 0.04300825670361519, |
|
"eval_runtime": 559.6635, |
|
"eval_samples_per_second": 8.493, |
|
"eval_steps_per_second": 2.124, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.18093835472333264, |
|
"grad_norm": 0.08013765513896942, |
|
"learning_rate": 4.3636363636363636e-05, |
|
"loss": 0.2609, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18093835472333264, |
|
"eval_accuracy": 0.9899011254310608, |
|
"eval_loss": 0.03251320868730545, |
|
"eval_runtime": 560.0268, |
|
"eval_samples_per_second": 8.487, |
|
"eval_steps_per_second": 2.123, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18514622343782874, |
|
"grad_norm": 0.1592099815607071, |
|
"learning_rate": 4e-05, |
|
"loss": 0.058, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18514622343782874, |
|
"eval_accuracy": 0.9783294796943665, |
|
"eval_loss": 0.0525953434407711, |
|
"eval_runtime": 553.5191, |
|
"eval_samples_per_second": 8.587, |
|
"eval_steps_per_second": 2.148, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18935409215232485, |
|
"grad_norm": 0.064529187977314, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.1082, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.18935409215232485, |
|
"eval_accuracy": 0.9888491630554199, |
|
"eval_loss": 0.03384377807378769, |
|
"eval_runtime": 557.5032, |
|
"eval_samples_per_second": 8.526, |
|
"eval_steps_per_second": 2.133, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.19356196086682095, |
|
"grad_norm": 0.35138818621635437, |
|
"learning_rate": 3.272727272727273e-05, |
|
"loss": 0.03, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19356196086682095, |
|
"eval_accuracy": 0.9924258589744568, |
|
"eval_loss": 0.029149439185857773, |
|
"eval_runtime": 559.8975, |
|
"eval_samples_per_second": 8.489, |
|
"eval_steps_per_second": 2.124, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19776982958131706, |
|
"grad_norm": 0.020687425509095192, |
|
"learning_rate": 2.909090909090909e-05, |
|
"loss": 0.0702, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.19776982958131706, |
|
"eval_accuracy": 0.9930570125579834, |
|
"eval_loss": 0.027508797124028206, |
|
"eval_runtime": 558.5715, |
|
"eval_samples_per_second": 8.509, |
|
"eval_steps_per_second": 2.129, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.20197769829581316, |
|
"grad_norm": 0.08153369277715683, |
|
"learning_rate": 2.5454545454545454e-05, |
|
"loss": 0.0412, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20197769829581316, |
|
"eval_accuracy": 0.9924258589744568, |
|
"eval_loss": 0.02858472615480423, |
|
"eval_runtime": 557.0232, |
|
"eval_samples_per_second": 8.533, |
|
"eval_steps_per_second": 2.135, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.027780650183558464, |
|
"learning_rate": 2.1818181818181818e-05, |
|
"loss": 0.0056, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"eval_accuracy": 0.9922154545783997, |
|
"eval_loss": 0.028572723269462585, |
|
"eval_runtime": 557.9364, |
|
"eval_samples_per_second": 8.519, |
|
"eval_steps_per_second": 2.131, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2103934357248054, |
|
"grad_norm": 0.013943745754659176, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.0202, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2103934357248054, |
|
"eval_accuracy": 0.9934778213500977, |
|
"eval_loss": 0.02832806296646595, |
|
"eval_runtime": 590.2498, |
|
"eval_samples_per_second": 8.053, |
|
"eval_steps_per_second": 2.014, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2146013044393015, |
|
"grad_norm": 0.06914212554693222, |
|
"learning_rate": 1.4545454545454545e-05, |
|
"loss": 0.0032, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2146013044393015, |
|
"eval_accuracy": 0.9932674169540405, |
|
"eval_loss": 0.02869958057999611, |
|
"eval_runtime": 574.9638, |
|
"eval_samples_per_second": 8.267, |
|
"eval_steps_per_second": 2.068, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2188091731537976, |
|
"grad_norm": 0.014089811593294144, |
|
"learning_rate": 1.0909090909090909e-05, |
|
"loss": 0.0932, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2188091731537976, |
|
"eval_accuracy": 0.9924258589744568, |
|
"eval_loss": 0.029511602595448494, |
|
"eval_runtime": 574.493, |
|
"eval_samples_per_second": 8.273, |
|
"eval_steps_per_second": 2.07, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.22301704186829371, |
|
"grad_norm": 0.006211976986378431, |
|
"learning_rate": 7.272727272727272e-06, |
|
"loss": 0.0597, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22301704186829371, |
|
"eval_accuracy": 0.9926362037658691, |
|
"eval_loss": 0.029444053769111633, |
|
"eval_runtime": 566.9552, |
|
"eval_samples_per_second": 8.383, |
|
"eval_steps_per_second": 2.097, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22722491058278982, |
|
"grad_norm": 0.030956320464611053, |
|
"learning_rate": 3.636363636363636e-06, |
|
"loss": 0.0069, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.22722491058278982, |
|
"eval_accuracy": 0.9926362037658691, |
|
"eval_loss": 0.02989383228123188, |
|
"eval_runtime": 560.5938, |
|
"eval_samples_per_second": 8.479, |
|
"eval_steps_per_second": 2.121, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.23143277929728592, |
|
"grad_norm": 0.10614718496799469, |
|
"learning_rate": 0.0, |
|
"loss": 0.024, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.23143277929728592, |
|
"eval_accuracy": 0.9924258589744568, |
|
"eval_loss": 0.030369577929377556, |
|
"eval_runtime": 551.1375, |
|
"eval_samples_per_second": 8.624, |
|
"eval_steps_per_second": 2.157, |
|
"step": 550 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 550, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.208452205931052e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|