{ "best_metric": 1.396972417831421, "best_model_checkpoint": "./qwen_t/qwen_o5/checkpoint-320", "epoch": 0.11695906432748537, "eval_steps": 10, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003341687552213868, "grad_norm": 0.7599132657051086, "learning_rate": 0.0002, "loss": 3.626, "mean_token_accuracy": 0.36439715698361397, "step": 10 }, { "epoch": 0.003341687552213868, "eval_loss": 3.443260669708252, "eval_mean_token_accuracy": 0.4417985293127242, "eval_runtime": 41.9693, "eval_samples_per_second": 80.058, "eval_steps_per_second": 10.007, "step": 10 }, { "epoch": 0.006683375104427736, "grad_norm": 0.8953952193260193, "learning_rate": 0.0002, "loss": 2.9254, "mean_token_accuracy": 0.49326967149972917, "step": 20 }, { "epoch": 0.006683375104427736, "eval_loss": 2.5516397953033447, "eval_mean_token_accuracy": 0.565807048479716, "eval_runtime": 40.7965, "eval_samples_per_second": 82.36, "eval_steps_per_second": 10.295, "step": 20 }, { "epoch": 0.010025062656641603, "grad_norm": 0.6876745223999023, "learning_rate": 0.0002, "loss": 2.231, "mean_token_accuracy": 0.6093000993132591, "step": 30 }, { "epoch": 0.010025062656641603, "eval_loss": 2.153578042984009, "eval_mean_token_accuracy": 0.6297694771062761, "eval_runtime": 35.4077, "eval_samples_per_second": 94.895, "eval_steps_per_second": 11.862, "step": 30 }, { "epoch": 0.013366750208855471, "grad_norm": 0.9506922960281372, "learning_rate": 0.0002, "loss": 1.8838, "mean_token_accuracy": 0.6746647953987122, "step": 40 }, { "epoch": 0.013366750208855471, "eval_loss": 2.019120454788208, "eval_mean_token_accuracy": 0.6349048288805145, "eval_runtime": 35.8494, "eval_samples_per_second": 93.725, "eval_steps_per_second": 11.716, "step": 40 }, { "epoch": 0.01670843776106934, "grad_norm": 2.049982786178589, "learning_rate": 0.0002, "loss": 1.5276, "mean_token_accuracy": 0.7170954093337059, "step": 50 }, { "epoch": 0.01670843776106934, "eval_loss": 1.813344120979309, "eval_mean_token_accuracy": 0.6365837232697578, "eval_runtime": 29.3868, "eval_samples_per_second": 114.337, "eval_steps_per_second": 14.292, "step": 50 }, { "epoch": 0.020050125313283207, "grad_norm": 0.7795844674110413, "learning_rate": 0.0002, "loss": 2.2638, "mean_token_accuracy": 0.5529197990894318, "step": 60 }, { "epoch": 0.020050125313283207, "eval_loss": 1.6835161447525024, "eval_mean_token_accuracy": 0.6496368288993836, "eval_runtime": 67.7706, "eval_samples_per_second": 49.579, "eval_steps_per_second": 6.197, "step": 60 }, { "epoch": 0.023391812865497075, "grad_norm": 0.6929437518119812, "learning_rate": 0.0002, "loss": 1.7755, "mean_token_accuracy": 0.6449611410498619, "step": 70 }, { "epoch": 0.023391812865497075, "eval_loss": 1.5908516645431519, "eval_mean_token_accuracy": 0.6815834226352828, "eval_runtime": 63.7232, "eval_samples_per_second": 52.728, "eval_steps_per_second": 6.591, "step": 70 }, { "epoch": 0.026733500417710943, "grad_norm": 0.5863602161407471, "learning_rate": 0.0002, "loss": 1.4411, "mean_token_accuracy": 0.6989624485373497, "step": 80 }, { "epoch": 0.026733500417710943, "eval_loss": 1.5338634252548218, "eval_mean_token_accuracy": 0.6856980549437659, "eval_runtime": 50.2939, "eval_samples_per_second": 66.807, "eval_steps_per_second": 8.351, "step": 80 }, { "epoch": 0.03007518796992481, "grad_norm": 1.1920981407165527, "learning_rate": 0.0002, "loss": 1.2398, "mean_token_accuracy": 0.733481515944004, "step": 90 }, { "epoch": 0.03007518796992481, "eval_loss": 1.5052729845046997, "eval_mean_token_accuracy": 0.6931079140731267, "eval_runtime": 64.5138, "eval_samples_per_second": 52.082, "eval_steps_per_second": 6.51, "step": 90 }, { "epoch": 0.03341687552213868, "grad_norm": 0.6549275517463684, "learning_rate": 0.0002, "loss": 1.0111, "mean_token_accuracy": 0.7853028282523156, "step": 100 }, { "epoch": 0.03341687552213868, "eval_loss": 1.518865704536438, "eval_mean_token_accuracy": 0.6946799146987143, "eval_runtime": 34.4958, "eval_samples_per_second": 97.403, "eval_steps_per_second": 12.175, "step": 100 }, { "epoch": 0.036758563074352546, "grad_norm": 0.5133540630340576, "learning_rate": 0.0002, "loss": 1.9947, "mean_token_accuracy": 0.6003070399165154, "step": 110 }, { "epoch": 0.036758563074352546, "eval_loss": 1.4736624956130981, "eval_mean_token_accuracy": 0.6929171987232707, "eval_runtime": 31.0194, "eval_samples_per_second": 108.319, "eval_steps_per_second": 13.54, "step": 110 }, { "epoch": 0.040100250626566414, "grad_norm": 0.4258256256580353, "learning_rate": 0.0002, "loss": 1.5854, "mean_token_accuracy": 0.6611790612339974, "step": 120 }, { "epoch": 0.040100250626566414, "eval_loss": 1.458544373512268, "eval_mean_token_accuracy": 0.6966695138386317, "eval_runtime": 32.5414, "eval_samples_per_second": 103.253, "eval_steps_per_second": 12.907, "step": 120 }, { "epoch": 0.04344193817878028, "grad_norm": 0.5019882321357727, "learning_rate": 0.0002, "loss": 1.3932, "mean_token_accuracy": 0.717147932946682, "step": 130 }, { "epoch": 0.04344193817878028, "eval_loss": 1.4401620626449585, "eval_mean_token_accuracy": 0.7001797112680617, "eval_runtime": 30.1368, "eval_samples_per_second": 111.492, "eval_steps_per_second": 13.936, "step": 130 }, { "epoch": 0.04678362573099415, "grad_norm": 0.5241239070892334, "learning_rate": 0.0002, "loss": 1.1731, "mean_token_accuracy": 0.7519763350486756, "step": 140 }, { "epoch": 0.04678362573099415, "eval_loss": 1.4381680488586426, "eval_mean_token_accuracy": 0.6980286110724722, "eval_runtime": 29.4312, "eval_samples_per_second": 114.164, "eval_steps_per_second": 14.271, "step": 140 }, { "epoch": 0.05012531328320802, "grad_norm": 0.5657021999359131, "learning_rate": 0.0002, "loss": 0.9886, "mean_token_accuracy": 0.804797975718975, "step": 150 }, { "epoch": 0.05012531328320802, "eval_loss": 1.4500703811645508, "eval_mean_token_accuracy": 0.7004464421243894, "eval_runtime": 29.379, "eval_samples_per_second": 114.368, "eval_steps_per_second": 14.296, "step": 150 }, { "epoch": 0.053467000835421885, "grad_norm": 0.48124462366104126, "learning_rate": 0.0002, "loss": 1.8415, "mean_token_accuracy": 0.6195126965641975, "step": 160 }, { "epoch": 0.053467000835421885, "eval_loss": 1.4379223585128784, "eval_mean_token_accuracy": 0.6959139862940424, "eval_runtime": 29.496, "eval_samples_per_second": 113.914, "eval_steps_per_second": 14.239, "step": 160 }, { "epoch": 0.05680868838763575, "grad_norm": 0.4167322516441345, "learning_rate": 0.0002, "loss": 1.5117, "mean_token_accuracy": 0.6729505002498627, "step": 170 }, { "epoch": 0.05680868838763575, "eval_loss": 1.4370402097702026, "eval_mean_token_accuracy": 0.6991755010116668, "eval_runtime": 30.4827, "eval_samples_per_second": 110.227, "eval_steps_per_second": 13.778, "step": 170 }, { "epoch": 0.06015037593984962, "grad_norm": 0.44749510288238525, "learning_rate": 0.0002, "loss": 1.2954, "mean_token_accuracy": 0.7225258648395538, "step": 180 }, { "epoch": 0.06015037593984962, "eval_loss": 1.423570156097412, "eval_mean_token_accuracy": 0.7020650133490562, "eval_runtime": 30.6017, "eval_samples_per_second": 109.798, "eval_steps_per_second": 13.725, "step": 180 }, { "epoch": 0.06349206349206349, "grad_norm": 0.3989886939525604, "learning_rate": 0.0002, "loss": 1.213, "mean_token_accuracy": 0.7481454327702522, "step": 190 }, { "epoch": 0.06349206349206349, "eval_loss": 1.4212963581085205, "eval_mean_token_accuracy": 0.7001493394374847, "eval_runtime": 41.8797, "eval_samples_per_second": 80.23, "eval_steps_per_second": 10.029, "step": 190 }, { "epoch": 0.06683375104427736, "grad_norm": 0.5422595739364624, "learning_rate": 0.0002, "loss": 0.942, "mean_token_accuracy": 0.7962346121668815, "step": 200 }, { "epoch": 0.06683375104427736, "eval_loss": 1.421792984008789, "eval_mean_token_accuracy": 0.7010365227858225, "eval_runtime": 53.936, "eval_samples_per_second": 62.296, "eval_steps_per_second": 7.787, "step": 200 }, { "epoch": 0.07017543859649122, "grad_norm": 0.39737701416015625, "learning_rate": 0.0002, "loss": 1.9107, "mean_token_accuracy": 0.5946269743144512, "step": 210 }, { "epoch": 0.07017543859649122, "eval_loss": 1.4200433492660522, "eval_mean_token_accuracy": 0.6980209651447478, "eval_runtime": 48.7912, "eval_samples_per_second": 68.865, "eval_steps_per_second": 8.608, "step": 210 }, { "epoch": 0.07351712614870509, "grad_norm": 0.3731982707977295, "learning_rate": 0.0002, "loss": 1.4745, "mean_token_accuracy": 0.6861546367406846, "step": 220 }, { "epoch": 0.07351712614870509, "eval_loss": 1.426990032196045, "eval_mean_token_accuracy": 0.6979587059645426, "eval_runtime": 45.8174, "eval_samples_per_second": 73.335, "eval_steps_per_second": 9.167, "step": 220 }, { "epoch": 0.07685881370091896, "grad_norm": 0.5165483951568604, "learning_rate": 0.0002, "loss": 1.3166, "mean_token_accuracy": 0.717747439444065, "step": 230 }, { "epoch": 0.07685881370091896, "eval_loss": 1.4164931774139404, "eval_mean_token_accuracy": 0.7008462209077109, "eval_runtime": 35.0477, "eval_samples_per_second": 95.869, "eval_steps_per_second": 11.984, "step": 230 }, { "epoch": 0.08020050125313283, "grad_norm": 0.3445465862751007, "learning_rate": 0.0002, "loss": 1.138, "mean_token_accuracy": 0.7411063179373741, "step": 240 }, { "epoch": 0.08020050125313283, "eval_loss": 1.4141920804977417, "eval_mean_token_accuracy": 0.7027672590953963, "eval_runtime": 34.6315, "eval_samples_per_second": 97.022, "eval_steps_per_second": 12.128, "step": 240 }, { "epoch": 0.0835421888053467, "grad_norm": 0.9735682606697083, "learning_rate": 0.0002, "loss": 0.8767, "mean_token_accuracy": 0.79200878739357, "step": 250 }, { "epoch": 0.0835421888053467, "eval_loss": 1.421015977859497, "eval_mean_token_accuracy": 0.6934712292892592, "eval_runtime": 34.4466, "eval_samples_per_second": 97.542, "eval_steps_per_second": 12.193, "step": 250 }, { "epoch": 0.08688387635756056, "grad_norm": 0.4343126118183136, "learning_rate": 0.0002, "loss": 1.9246, "mean_token_accuracy": 0.5945770829916001, "step": 260 }, { "epoch": 0.08688387635756056, "eval_loss": 1.4099905490875244, "eval_mean_token_accuracy": 0.7017570126624334, "eval_runtime": 29.6092, "eval_samples_per_second": 113.478, "eval_steps_per_second": 14.185, "step": 260 }, { "epoch": 0.09022556390977443, "grad_norm": 0.3334052562713623, "learning_rate": 0.0002, "loss": 1.4759, "mean_token_accuracy": 0.6715509802103042, "step": 270 }, { "epoch": 0.09022556390977443, "eval_loss": 1.4085925817489624, "eval_mean_token_accuracy": 0.7048604423091525, "eval_runtime": 38.8094, "eval_samples_per_second": 86.577, "eval_steps_per_second": 10.822, "step": 270 }, { "epoch": 0.0935672514619883, "grad_norm": 0.5291116237640381, "learning_rate": 0.0002, "loss": 1.3162, "mean_token_accuracy": 0.7213364154100418, "step": 280 }, { "epoch": 0.0935672514619883, "eval_loss": 1.4125802516937256, "eval_mean_token_accuracy": 0.7007201626896858, "eval_runtime": 41.0882, "eval_samples_per_second": 81.775, "eval_steps_per_second": 10.222, "step": 280 }, { "epoch": 0.09690893901420217, "grad_norm": 0.3959917724132538, "learning_rate": 0.0002, "loss": 1.192, "mean_token_accuracy": 0.7394131779670715, "step": 290 }, { "epoch": 0.09690893901420217, "eval_loss": 1.4038469791412354, "eval_mean_token_accuracy": 0.7027802584426743, "eval_runtime": 31.4074, "eval_samples_per_second": 106.981, "eval_steps_per_second": 13.373, "step": 290 }, { "epoch": 0.10025062656641603, "grad_norm": 0.6445237398147583, "learning_rate": 0.0002, "loss": 0.8861, "mean_token_accuracy": 0.7985727787017822, "step": 300 }, { "epoch": 0.10025062656641603, "eval_loss": 1.4055042266845703, "eval_mean_token_accuracy": 0.7020482325837726, "eval_runtime": 36.2285, "eval_samples_per_second": 92.745, "eval_steps_per_second": 11.593, "step": 300 }, { "epoch": 0.1035923141186299, "grad_norm": 0.3228004276752472, "learning_rate": 0.0002, "loss": 1.9139, "mean_token_accuracy": 0.6049163021147251, "step": 310 }, { "epoch": 0.1035923141186299, "eval_loss": 1.401644229888916, "eval_mean_token_accuracy": 0.70244310824644, "eval_runtime": 29.3696, "eval_samples_per_second": 114.404, "eval_steps_per_second": 14.301, "step": 310 }, { "epoch": 0.10693400167084377, "grad_norm": 0.35528433322906494, "learning_rate": 0.0002, "loss": 1.4937, "mean_token_accuracy": 0.6764601737260818, "step": 320 }, { "epoch": 0.10693400167084377, "eval_loss": 1.396972417831421, "eval_mean_token_accuracy": 0.699098062302385, "eval_runtime": 40.4694, "eval_samples_per_second": 83.026, "eval_steps_per_second": 10.378, "step": 320 }, { "epoch": 0.11027568922305764, "grad_norm": 0.4269411563873291, "learning_rate": 0.0002, "loss": 1.2599, "mean_token_accuracy": 0.7228553861379623, "step": 330 }, { "epoch": 0.11027568922305764, "eval_loss": 1.398388385772705, "eval_mean_token_accuracy": 0.7035389555352075, "eval_runtime": 33.2865, "eval_samples_per_second": 100.942, "eval_steps_per_second": 12.618, "step": 330 }, { "epoch": 0.1136173767752715, "grad_norm": 0.372363805770874, "learning_rate": 0.0002, "loss": 1.1301, "mean_token_accuracy": 0.7524892643094063, "step": 340 }, { "epoch": 0.1136173767752715, "eval_loss": 1.398653507232666, "eval_mean_token_accuracy": 0.6984730128731046, "eval_runtime": 44.4473, "eval_samples_per_second": 75.595, "eval_steps_per_second": 9.449, "step": 340 }, { "epoch": 0.11695906432748537, "grad_norm": 0.4013306796550751, "learning_rate": 0.0002, "loss": 0.939, "mean_token_accuracy": 0.7970763191580772, "step": 350 }, { "epoch": 0.11695906432748537, "eval_loss": 1.3984951972961426, "eval_mean_token_accuracy": 0.7021824714683351, "eval_runtime": 35.5812, "eval_samples_per_second": 94.432, "eval_steps_per_second": 11.804, "step": 350 } ], "logging_steps": 10, "max_steps": 14960, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 934541258360832.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }