{
  "best_metric": 1.396972417831421,
  "best_model_checkpoint": "./qwen_t/qwen_o5/checkpoint-320",
  "epoch": 0.11695906432748537,
  "eval_steps": 10,
  "global_step": 350,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003341687552213868,
      "grad_norm": 0.7599132657051086,
      "learning_rate": 0.0002,
      "loss": 3.626,
      "mean_token_accuracy": 0.36439715698361397,
      "step": 10
    },
    {
      "epoch": 0.003341687552213868,
      "eval_loss": 3.443260669708252,
      "eval_mean_token_accuracy": 0.4417985293127242,
      "eval_runtime": 41.9693,
      "eval_samples_per_second": 80.058,
      "eval_steps_per_second": 10.007,
      "step": 10
    },
    {
      "epoch": 0.006683375104427736,
      "grad_norm": 0.8953952193260193,
      "learning_rate": 0.0002,
      "loss": 2.9254,
      "mean_token_accuracy": 0.49326967149972917,
      "step": 20
    },
    {
      "epoch": 0.006683375104427736,
      "eval_loss": 2.5516397953033447,
      "eval_mean_token_accuracy": 0.565807048479716,
      "eval_runtime": 40.7965,
      "eval_samples_per_second": 82.36,
      "eval_steps_per_second": 10.295,
      "step": 20
    },
    {
      "epoch": 0.010025062656641603,
      "grad_norm": 0.6876745223999023,
      "learning_rate": 0.0002,
      "loss": 2.231,
      "mean_token_accuracy": 0.6093000993132591,
      "step": 30
    },
    {
      "epoch": 0.010025062656641603,
      "eval_loss": 2.153578042984009,
      "eval_mean_token_accuracy": 0.6297694771062761,
      "eval_runtime": 35.4077,
      "eval_samples_per_second": 94.895,
      "eval_steps_per_second": 11.862,
      "step": 30
    },
    {
      "epoch": 0.013366750208855471,
      "grad_norm": 0.9506922960281372,
      "learning_rate": 0.0002,
      "loss": 1.8838,
      "mean_token_accuracy": 0.6746647953987122,
      "step": 40
    },
    {
      "epoch": 0.013366750208855471,
      "eval_loss": 2.019120454788208,
      "eval_mean_token_accuracy": 0.6349048288805145,
      "eval_runtime": 35.8494,
      "eval_samples_per_second": 93.725,
      "eval_steps_per_second": 11.716,
      "step": 40
    },
    {
      "epoch": 0.01670843776106934,
      "grad_norm": 2.049982786178589,
      "learning_rate": 0.0002,
      "loss": 1.5276,
      "mean_token_accuracy": 0.7170954093337059,
      "step": 50
    },
    {
      "epoch": 0.01670843776106934,
      "eval_loss": 1.813344120979309,
      "eval_mean_token_accuracy": 0.6365837232697578,
      "eval_runtime": 29.3868,
      "eval_samples_per_second": 114.337,
      "eval_steps_per_second": 14.292,
      "step": 50
    },
    {
      "epoch": 0.020050125313283207,
      "grad_norm": 0.7795844674110413,
      "learning_rate": 0.0002,
      "loss": 2.2638,
      "mean_token_accuracy": 0.5529197990894318,
      "step": 60
    },
    {
      "epoch": 0.020050125313283207,
      "eval_loss": 1.6835161447525024,
      "eval_mean_token_accuracy": 0.6496368288993836,
      "eval_runtime": 67.7706,
      "eval_samples_per_second": 49.579,
      "eval_steps_per_second": 6.197,
      "step": 60
    },
    {
      "epoch": 0.023391812865497075,
      "grad_norm": 0.6929437518119812,
      "learning_rate": 0.0002,
      "loss": 1.7755,
      "mean_token_accuracy": 0.6449611410498619,
      "step": 70
    },
    {
      "epoch": 0.023391812865497075,
      "eval_loss": 1.5908516645431519,
      "eval_mean_token_accuracy": 0.6815834226352828,
      "eval_runtime": 63.7232,
      "eval_samples_per_second": 52.728,
      "eval_steps_per_second": 6.591,
      "step": 70
    },
    {
      "epoch": 0.026733500417710943,
      "grad_norm": 0.5863602161407471,
      "learning_rate": 0.0002,
      "loss": 1.4411,
      "mean_token_accuracy": 0.6989624485373497,
      "step": 80
    },
    {
      "epoch": 0.026733500417710943,
      "eval_loss": 1.5338634252548218,
      "eval_mean_token_accuracy": 0.6856980549437659,
      "eval_runtime": 50.2939,
      "eval_samples_per_second": 66.807,
      "eval_steps_per_second": 8.351,
      "step": 80
    },
    {
      "epoch": 0.03007518796992481,
      "grad_norm": 1.1920981407165527,
      "learning_rate": 0.0002,
      "loss": 1.2398,
      "mean_token_accuracy": 0.733481515944004,
      "step": 90
    },
    {
      "epoch": 0.03007518796992481,
      "eval_loss": 1.5052729845046997,
      "eval_mean_token_accuracy": 0.6931079140731267,
      "eval_runtime": 64.5138,
      "eval_samples_per_second": 52.082,
      "eval_steps_per_second": 6.51,
      "step": 90
    },
    {
      "epoch": 0.03341687552213868,
      "grad_norm": 0.6549275517463684,
      "learning_rate": 0.0002,
      "loss": 1.0111,
      "mean_token_accuracy": 0.7853028282523156,
      "step": 100
    },
    {
      "epoch": 0.03341687552213868,
      "eval_loss": 1.518865704536438,
      "eval_mean_token_accuracy": 0.6946799146987143,
      "eval_runtime": 34.4958,
      "eval_samples_per_second": 97.403,
      "eval_steps_per_second": 12.175,
      "step": 100
    },
    {
      "epoch": 0.036758563074352546,
      "grad_norm": 0.5133540630340576,
      "learning_rate": 0.0002,
      "loss": 1.9947,
      "mean_token_accuracy": 0.6003070399165154,
      "step": 110
    },
    {
      "epoch": 0.036758563074352546,
      "eval_loss": 1.4736624956130981,
      "eval_mean_token_accuracy": 0.6929171987232707,
      "eval_runtime": 31.0194,
      "eval_samples_per_second": 108.319,
      "eval_steps_per_second": 13.54,
      "step": 110
    },
    {
      "epoch": 0.040100250626566414,
      "grad_norm": 0.4258256256580353,
      "learning_rate": 0.0002,
      "loss": 1.5854,
      "mean_token_accuracy": 0.6611790612339974,
      "step": 120
    },
    {
      "epoch": 0.040100250626566414,
      "eval_loss": 1.458544373512268,
      "eval_mean_token_accuracy": 0.6966695138386317,
      "eval_runtime": 32.5414,
      "eval_samples_per_second": 103.253,
      "eval_steps_per_second": 12.907,
      "step": 120
    },
    {
      "epoch": 0.04344193817878028,
      "grad_norm": 0.5019882321357727,
      "learning_rate": 0.0002,
      "loss": 1.3932,
      "mean_token_accuracy": 0.717147932946682,
      "step": 130
    },
    {
      "epoch": 0.04344193817878028,
      "eval_loss": 1.4401620626449585,
      "eval_mean_token_accuracy": 0.7001797112680617,
      "eval_runtime": 30.1368,
      "eval_samples_per_second": 111.492,
      "eval_steps_per_second": 13.936,
      "step": 130
    },
    {
      "epoch": 0.04678362573099415,
      "grad_norm": 0.5241239070892334,
      "learning_rate": 0.0002,
      "loss": 1.1731,
      "mean_token_accuracy": 0.7519763350486756,
      "step": 140
    },
    {
      "epoch": 0.04678362573099415,
      "eval_loss": 1.4381680488586426,
      "eval_mean_token_accuracy": 0.6980286110724722,
      "eval_runtime": 29.4312,
      "eval_samples_per_second": 114.164,
      "eval_steps_per_second": 14.271,
      "step": 140
    },
    {
      "epoch": 0.05012531328320802,
      "grad_norm": 0.5657021999359131,
      "learning_rate": 0.0002,
      "loss": 0.9886,
      "mean_token_accuracy": 0.804797975718975,
      "step": 150
    },
    {
      "epoch": 0.05012531328320802,
      "eval_loss": 1.4500703811645508,
      "eval_mean_token_accuracy": 0.7004464421243894,
      "eval_runtime": 29.379,
      "eval_samples_per_second": 114.368,
      "eval_steps_per_second": 14.296,
      "step": 150
    },
    {
      "epoch": 0.053467000835421885,
      "grad_norm": 0.48124462366104126,
      "learning_rate": 0.0002,
      "loss": 1.8415,
      "mean_token_accuracy": 0.6195126965641975,
      "step": 160
    },
    {
      "epoch": 0.053467000835421885,
      "eval_loss": 1.4379223585128784,
      "eval_mean_token_accuracy": 0.6959139862940424,
      "eval_runtime": 29.496,
      "eval_samples_per_second": 113.914,
      "eval_steps_per_second": 14.239,
      "step": 160
    },
    {
      "epoch": 0.05680868838763575,
      "grad_norm": 0.4167322516441345,
      "learning_rate": 0.0002,
      "loss": 1.5117,
      "mean_token_accuracy": 0.6729505002498627,
      "step": 170
    },
    {
      "epoch": 0.05680868838763575,
      "eval_loss": 1.4370402097702026,
      "eval_mean_token_accuracy": 0.6991755010116668,
      "eval_runtime": 30.4827,
      "eval_samples_per_second": 110.227,
      "eval_steps_per_second": 13.778,
      "step": 170
    },
    {
      "epoch": 0.06015037593984962,
      "grad_norm": 0.44749510288238525,
      "learning_rate": 0.0002,
      "loss": 1.2954,
      "mean_token_accuracy": 0.7225258648395538,
      "step": 180
    },
    {
      "epoch": 0.06015037593984962,
      "eval_loss": 1.423570156097412,
      "eval_mean_token_accuracy": 0.7020650133490562,
      "eval_runtime": 30.6017,
      "eval_samples_per_second": 109.798,
      "eval_steps_per_second": 13.725,
      "step": 180
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 0.3989886939525604,
      "learning_rate": 0.0002,
      "loss": 1.213,
      "mean_token_accuracy": 0.7481454327702522,
      "step": 190
    },
    {
      "epoch": 0.06349206349206349,
      "eval_loss": 1.4212963581085205,
      "eval_mean_token_accuracy": 0.7001493394374847,
      "eval_runtime": 41.8797,
      "eval_samples_per_second": 80.23,
      "eval_steps_per_second": 10.029,
      "step": 190
    },
    {
      "epoch": 0.06683375104427736,
      "grad_norm": 0.5422595739364624,
      "learning_rate": 0.0002,
      "loss": 0.942,
      "mean_token_accuracy": 0.7962346121668815,
      "step": 200
    },
    {
      "epoch": 0.06683375104427736,
      "eval_loss": 1.421792984008789,
      "eval_mean_token_accuracy": 0.7010365227858225,
      "eval_runtime": 53.936,
      "eval_samples_per_second": 62.296,
      "eval_steps_per_second": 7.787,
      "step": 200
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 0.39737701416015625,
      "learning_rate": 0.0002,
      "loss": 1.9107,
      "mean_token_accuracy": 0.5946269743144512,
      "step": 210
    },
    {
      "epoch": 0.07017543859649122,
      "eval_loss": 1.4200433492660522,
      "eval_mean_token_accuracy": 0.6980209651447478,
      "eval_runtime": 48.7912,
      "eval_samples_per_second": 68.865,
      "eval_steps_per_second": 8.608,
      "step": 210
    },
    {
      "epoch": 0.07351712614870509,
      "grad_norm": 0.3731982707977295,
      "learning_rate": 0.0002,
      "loss": 1.4745,
      "mean_token_accuracy": 0.6861546367406846,
      "step": 220
    },
    {
      "epoch": 0.07351712614870509,
      "eval_loss": 1.426990032196045,
      "eval_mean_token_accuracy": 0.6979587059645426,
      "eval_runtime": 45.8174,
      "eval_samples_per_second": 73.335,
      "eval_steps_per_second": 9.167,
      "step": 220
    },
    {
      "epoch": 0.07685881370091896,
      "grad_norm": 0.5165483951568604,
      "learning_rate": 0.0002,
      "loss": 1.3166,
      "mean_token_accuracy": 0.717747439444065,
      "step": 230
    },
    {
      "epoch": 0.07685881370091896,
      "eval_loss": 1.4164931774139404,
      "eval_mean_token_accuracy": 0.7008462209077109,
      "eval_runtime": 35.0477,
      "eval_samples_per_second": 95.869,
      "eval_steps_per_second": 11.984,
      "step": 230
    },
    {
      "epoch": 0.08020050125313283,
      "grad_norm": 0.3445465862751007,
      "learning_rate": 0.0002,
      "loss": 1.138,
      "mean_token_accuracy": 0.7411063179373741,
      "step": 240
    },
    {
      "epoch": 0.08020050125313283,
      "eval_loss": 1.4141920804977417,
      "eval_mean_token_accuracy": 0.7027672590953963,
      "eval_runtime": 34.6315,
      "eval_samples_per_second": 97.022,
      "eval_steps_per_second": 12.128,
      "step": 240
    },
    {
      "epoch": 0.0835421888053467,
      "grad_norm": 0.9735682606697083,
      "learning_rate": 0.0002,
      "loss": 0.8767,
      "mean_token_accuracy": 0.79200878739357,
      "step": 250
    },
    {
      "epoch": 0.0835421888053467,
      "eval_loss": 1.421015977859497,
      "eval_mean_token_accuracy": 0.6934712292892592,
      "eval_runtime": 34.4466,
      "eval_samples_per_second": 97.542,
      "eval_steps_per_second": 12.193,
      "step": 250
    },
    {
      "epoch": 0.08688387635756056,
      "grad_norm": 0.4343126118183136,
      "learning_rate": 0.0002,
      "loss": 1.9246,
      "mean_token_accuracy": 0.5945770829916001,
      "step": 260
    },
    {
      "epoch": 0.08688387635756056,
      "eval_loss": 1.4099905490875244,
      "eval_mean_token_accuracy": 0.7017570126624334,
      "eval_runtime": 29.6092,
      "eval_samples_per_second": 113.478,
      "eval_steps_per_second": 14.185,
      "step": 260
    },
    {
      "epoch": 0.09022556390977443,
      "grad_norm": 0.3334052562713623,
      "learning_rate": 0.0002,
      "loss": 1.4759,
      "mean_token_accuracy": 0.6715509802103042,
      "step": 270
    },
    {
      "epoch": 0.09022556390977443,
      "eval_loss": 1.4085925817489624,
      "eval_mean_token_accuracy": 0.7048604423091525,
      "eval_runtime": 38.8094,
      "eval_samples_per_second": 86.577,
      "eval_steps_per_second": 10.822,
      "step": 270
    },
    {
      "epoch": 0.0935672514619883,
      "grad_norm": 0.5291116237640381,
      "learning_rate": 0.0002,
      "loss": 1.3162,
      "mean_token_accuracy": 0.7213364154100418,
      "step": 280
    },
    {
      "epoch": 0.0935672514619883,
      "eval_loss": 1.4125802516937256,
      "eval_mean_token_accuracy": 0.7007201626896858,
      "eval_runtime": 41.0882,
      "eval_samples_per_second": 81.775,
      "eval_steps_per_second": 10.222,
      "step": 280
    },
    {
      "epoch": 0.09690893901420217,
      "grad_norm": 0.3959917724132538,
      "learning_rate": 0.0002,
      "loss": 1.192,
      "mean_token_accuracy": 0.7394131779670715,
      "step": 290
    },
    {
      "epoch": 0.09690893901420217,
      "eval_loss": 1.4038469791412354,
      "eval_mean_token_accuracy": 0.7027802584426743,
      "eval_runtime": 31.4074,
      "eval_samples_per_second": 106.981,
      "eval_steps_per_second": 13.373,
      "step": 290
    },
    {
      "epoch": 0.10025062656641603,
      "grad_norm": 0.6445237398147583,
      "learning_rate": 0.0002,
      "loss": 0.8861,
      "mean_token_accuracy": 0.7985727787017822,
      "step": 300
    },
    {
      "epoch": 0.10025062656641603,
      "eval_loss": 1.4055042266845703,
      "eval_mean_token_accuracy": 0.7020482325837726,
      "eval_runtime": 36.2285,
      "eval_samples_per_second": 92.745,
      "eval_steps_per_second": 11.593,
      "step": 300
    },
    {
      "epoch": 0.1035923141186299,
      "grad_norm": 0.3228004276752472,
      "learning_rate": 0.0002,
      "loss": 1.9139,
      "mean_token_accuracy": 0.6049163021147251,
      "step": 310
    },
    {
      "epoch": 0.1035923141186299,
      "eval_loss": 1.401644229888916,
      "eval_mean_token_accuracy": 0.70244310824644,
      "eval_runtime": 29.3696,
      "eval_samples_per_second": 114.404,
      "eval_steps_per_second": 14.301,
      "step": 310
    },
    {
      "epoch": 0.10693400167084377,
      "grad_norm": 0.35528433322906494,
      "learning_rate": 0.0002,
      "loss": 1.4937,
      "mean_token_accuracy": 0.6764601737260818,
      "step": 320
    },
    {
      "epoch": 0.10693400167084377,
      "eval_loss": 1.396972417831421,
      "eval_mean_token_accuracy": 0.699098062302385,
      "eval_runtime": 40.4694,
      "eval_samples_per_second": 83.026,
      "eval_steps_per_second": 10.378,
      "step": 320
    },
    {
      "epoch": 0.11027568922305764,
      "grad_norm": 0.4269411563873291,
      "learning_rate": 0.0002,
      "loss": 1.2599,
      "mean_token_accuracy": 0.7228553861379623,
      "step": 330
    },
    {
      "epoch": 0.11027568922305764,
      "eval_loss": 1.398388385772705,
      "eval_mean_token_accuracy": 0.7035389555352075,
      "eval_runtime": 33.2865,
      "eval_samples_per_second": 100.942,
      "eval_steps_per_second": 12.618,
      "step": 330
    },
    {
      "epoch": 0.1136173767752715,
      "grad_norm": 0.372363805770874,
      "learning_rate": 0.0002,
      "loss": 1.1301,
      "mean_token_accuracy": 0.7524892643094063,
      "step": 340
    },
    {
      "epoch": 0.1136173767752715,
      "eval_loss": 1.398653507232666,
      "eval_mean_token_accuracy": 0.6984730128731046,
      "eval_runtime": 44.4473,
      "eval_samples_per_second": 75.595,
      "eval_steps_per_second": 9.449,
      "step": 340
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 0.4013306796550751,
      "learning_rate": 0.0002,
      "loss": 0.939,
      "mean_token_accuracy": 0.7970763191580772,
      "step": 350
    },
    {
      "epoch": 0.11695906432748537,
      "eval_loss": 1.3984951972961426,
      "eval_mean_token_accuracy": 0.7021824714683351,
      "eval_runtime": 35.5812,
      "eval_samples_per_second": 94.432,
      "eval_steps_per_second": 11.804,
      "step": 350
    }
  ],
  "logging_steps": 10,
  "max_steps": 14960,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 10,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 3
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 934541258360832.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}