{
  "best_global_step": 2500,
  "best_metric": 3.794926643371582,
  "best_model_checkpoint": "./qwen3moe_tinystories_sft_global_balance/checkpoint-2500",
  "epoch": 0.9997473684210526,
  "eval_steps": 500,
  "global_step": 2968,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03368421052631579,
      "grad_norm": 492097.03125,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 11.6957,
      "step": 100
    },
    {
      "epoch": 0.06736842105263158,
      "grad_norm": 526197.25,
      "learning_rate": 3.35016835016835e-05,
      "loss": 10.5167,
      "step": 200
    },
    {
      "epoch": 0.10105263157894737,
      "grad_norm": 486177.34375,
      "learning_rate": 4.999993082936328e-05,
      "loss": 8.9635,
      "step": 300
    },
    {
      "epoch": 0.13473684210526315,
      "grad_norm": 394882.59375,
      "learning_rate": 4.982030277845304e-05,
      "loss": 7.2522,
      "step": 400
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 279043.53125,
      "learning_rate": 4.9297703006544226e-05,
      "loss": 6.0898,
      "step": 500
    },
    {
      "epoch": 0.16842105263157894,
      "eval_loss": 5.723438262939453,
      "eval_runtime": 135.9602,
      "eval_samples_per_second": 36.775,
      "eval_steps_per_second": 2.302,
      "step": 500
    },
    {
      "epoch": 0.20210526315789473,
      "grad_norm": 258667.03125,
      "learning_rate": 4.843935289787076e-05,
      "loss": 5.4572,
      "step": 600
    },
    {
      "epoch": 0.23578947368421052,
      "grad_norm": 308293.84375,
      "learning_rate": 4.725711329944238e-05,
      "loss": 5.051,
      "step": 700
    },
    {
      "epoch": 0.2694736842105263,
      "grad_norm": 280231.3125,
      "learning_rate": 4.5767320625577836e-05,
      "loss": 4.7862,
      "step": 800
    },
    {
      "epoch": 0.3031578947368421,
      "grad_norm": 383468.1875,
      "learning_rate": 4.399056111818752e-05,
      "loss": 4.5928,
      "step": 900
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 332836.21875,
      "learning_rate": 4.19513863821205e-05,
      "loss": 4.4388,
      "step": 1000
    },
    {
      "epoch": 0.3368421052631579,
      "eval_loss": 4.37507438659668,
      "eval_runtime": 134.9276,
      "eval_samples_per_second": 37.057,
      "eval_steps_per_second": 2.32,
      "step": 1000
    },
    {
      "epoch": 0.3705263157894737,
      "grad_norm": 378902.09375,
      "learning_rate": 3.967797412636315e-05,
      "loss": 4.3341,
      "step": 1100
    },
    {
      "epoch": 0.40421052631578946,
      "grad_norm": 377249.0,
      "learning_rate": 3.7201738799033065e-05,
      "loss": 4.2245,
      "step": 1200
    },
    {
      "epoch": 0.4378947368421053,
      "grad_norm": 357383.0625,
      "learning_rate": 3.4556897496488504e-05,
      "loss": 4.1398,
      "step": 1300
    },
    {
      "epoch": 0.47157894736842104,
      "grad_norm": 475150.9375,
      "learning_rate": 3.177999714490516e-05,
      "loss": 4.1092,
      "step": 1400
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 475546.25,
      "learning_rate": 2.890940948781592e-05,
      "loss": 4.0588,
      "step": 1500
    },
    {
      "epoch": 0.5052631578947369,
      "eval_loss": 4.014543533325195,
      "eval_runtime": 136.1182,
      "eval_samples_per_second": 36.733,
      "eval_steps_per_second": 2.299,
      "step": 1500
    },
    {
      "epoch": 0.5389473684210526,
      "grad_norm": 510377.0625,
      "learning_rate": 2.5984800857973353e-05,
      "loss": 3.9888,
      "step": 1600
    },
    {
      "epoch": 0.5726315789473684,
      "grad_norm": 433969.625,
      "learning_rate": 2.3046584060329007e-05,
      "loss": 3.9544,
      "step": 1700
    },
    {
      "epoch": 0.6063157894736843,
      "grad_norm": 430580.25,
      "learning_rate": 2.0135359940116327e-05,
      "loss": 3.9086,
      "step": 1800
    },
    {
      "epoch": 0.64,
      "grad_norm": 443697.03125,
      "learning_rate": 1.729135635255667e-05,
      "loss": 3.9023,
      "step": 1900
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 382403.0,
      "learning_rate": 1.455387228661314e-05,
      "loss": 3.8641,
      "step": 2000
    },
    {
      "epoch": 0.6736842105263158,
      "eval_loss": 3.8544445037841797,
      "eval_runtime": 134.3764,
      "eval_samples_per_second": 37.209,
      "eval_steps_per_second": 2.329,
      "step": 2000
    },
    {
      "epoch": 0.7073684210526315,
      "grad_norm": 408598.40625,
      "learning_rate": 1.1960734823997168e-05,
      "loss": 3.8511,
      "step": 2100
    },
    {
      "epoch": 0.7410526315789474,
      "grad_norm": 461325.09375,
      "learning_rate": 9.547776437272746e-06,
      "loss": 3.832,
      "step": 2200
    },
    {
      "epoch": 0.7747368421052632,
      "grad_norm": 427811.28125,
      "learning_rate": 7.348339849853858e-06,
      "loss": 3.8217,
      "step": 2300
    },
    {
      "epoch": 0.8084210526315789,
      "grad_norm": 465533.09375,
      "learning_rate": 5.39281729983474e-06,
      "loss": 3.81,
      "step": 2400
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 420344.15625,
      "learning_rate": 3.7082305741943213e-06,
      "loss": 3.7949,
      "step": 2500
    },
    {
      "epoch": 0.8421052631578947,
      "eval_loss": 3.794926643371582,
      "eval_runtime": 133.9027,
      "eval_samples_per_second": 37.341,
      "eval_steps_per_second": 2.338,
      "step": 2500
    },
    {
      "epoch": 0.8757894736842106,
      "grad_norm": 433467.875,
      "learning_rate": 2.3178576165427735e-06,
      "loss": 3.7939,
      "step": 2600
    },
    {
      "epoch": 0.9094736842105263,
      "grad_norm": 398190.46875,
      "learning_rate": 1.2409108680163734e-06,
      "loss": 3.7832,
      "step": 2700
    },
    {
      "epoch": 0.9431578947368421,
      "grad_norm": 369212.90625,
      "learning_rate": 4.922717860680298e-07,
      "loss": 3.7862,
      "step": 2800
    },
    {
      "epoch": 0.9768421052631578,
      "grad_norm": 457141.46875,
      "learning_rate": 8.228520962394182e-08,
      "loss": 3.8013,
      "step": 2900
    }
  ],
  "logging_steps": 100,
  "max_steps": 2968,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9101710322884608.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|