| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.0, |
| "eval_steps": 1000, |
| "global_step": 16548, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00036258158085569254, |
| "grad_norm": 7.228061676025391, |
| "learning_rate": 0.0, |
| "loss": 4.1707, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.36258158085569253, |
| "grad_norm": 5.762327194213867, |
| "learning_rate": 3.018126888217523e-06, |
| "loss": 3.7074, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.36258158085569253, |
| "eval_cosine_accuracy": 0.943001389503479, |
| "eval_loss": 0.5847920179367065, |
| "eval_runtime": 35.6274, |
| "eval_samples_per_second": 266.902, |
| "eval_steps_per_second": 1.067, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.7251631617113851, |
| "grad_norm": 7.7132134437561035, |
| "learning_rate": 6.0392749244713e-06, |
| "loss": 2.5733, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7251631617113851, |
| "eval_cosine_accuracy": 0.9467872381210327, |
| "eval_loss": 0.5230411887168884, |
| "eval_runtime": 35.5129, |
| "eval_samples_per_second": 267.762, |
| "eval_steps_per_second": 1.07, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.0877447425670776, |
| "grad_norm": 4.678545951843262, |
| "learning_rate": 9.060422960725076e-06, |
| "loss": 2.1499, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.0877447425670776, |
| "eval_cosine_accuracy": 0.9545693397521973, |
| "eval_loss": 0.48575976490974426, |
| "eval_runtime": 34.81, |
| "eval_samples_per_second": 273.169, |
| "eval_steps_per_second": 1.092, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.4503263234227701, |
| "grad_norm": 6.152897357940674, |
| "learning_rate": 9.479528629702373e-06, |
| "loss": 2.3929, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.4503263234227701, |
| "eval_cosine_accuracy": 0.9578294157981873, |
| "eval_loss": 0.46934935450553894, |
| "eval_runtime": 34.6393, |
| "eval_samples_per_second": 274.515, |
| "eval_steps_per_second": 1.097, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.8129079042784626, |
| "grad_norm": 7.723774433135986, |
| "learning_rate": 8.724882912826712e-06, |
| "loss": 1.6541, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.8129079042784626, |
| "eval_cosine_accuracy": 0.9597223401069641, |
| "eval_loss": 0.4414854943752289, |
| "eval_runtime": 36.0299, |
| "eval_samples_per_second": 263.92, |
| "eval_steps_per_second": 1.055, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.1754894851341553, |
| "grad_norm": 4.850948810577393, |
| "learning_rate": 7.97023719595105e-06, |
| "loss": 1.8335, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.1754894851341553, |
| "eval_cosine_accuracy": 0.9615101218223572, |
| "eval_loss": 0.4473753869533539, |
| "eval_runtime": 36.7886, |
| "eval_samples_per_second": 258.477, |
| "eval_steps_per_second": 1.033, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.5380710659898478, |
| "grad_norm": 6.111043930053711, |
| "learning_rate": 7.214836077957396e-06, |
| "loss": 1.839, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.5380710659898478, |
| "eval_cosine_accuracy": 0.9624566435813904, |
| "eval_loss": 0.43305832147598267, |
| "eval_runtime": 35.3588, |
| "eval_samples_per_second": 268.929, |
| "eval_steps_per_second": 1.075, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.9006526468455403, |
| "grad_norm": 7.39910364151001, |
| "learning_rate": 6.459434959963742e-06, |
| "loss": 1.3238, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.9006526468455403, |
| "eval_cosine_accuracy": 0.9623514413833618, |
| "eval_loss": 0.4196876585483551, |
| "eval_runtime": 35.0727, |
| "eval_samples_per_second": 271.123, |
| "eval_steps_per_second": 1.083, |
| "step": 8000 |
| }, |
| { |
| "epoch": 3.2632342277012327, |
| "grad_norm": 5.353915691375732, |
| "learning_rate": 5.7047892430880805e-06, |
| "loss": 1.8409, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.2632342277012327, |
| "eval_cosine_accuracy": 0.9646650552749634, |
| "eval_loss": 0.42811915278434753, |
| "eval_runtime": 34.3676, |
| "eval_samples_per_second": 276.685, |
| "eval_steps_per_second": 1.106, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.625815808556925, |
| "grad_norm": 6.544925212860107, |
| "learning_rate": 4.949388125094425e-06, |
| "loss": 1.511, |
| "step": 10000 |
| }, |
| { |
| "epoch": 3.625815808556925, |
| "eval_cosine_accuracy": 0.9652960300445557, |
| "eval_loss": 0.4206908643245697, |
| "eval_runtime": 38.3351, |
| "eval_samples_per_second": 248.049, |
| "eval_steps_per_second": 0.991, |
| "step": 10000 |
| }, |
| { |
| "epoch": 3.9883973894126177, |
| "grad_norm": 7.497806549072266, |
| "learning_rate": 4.193987007100771e-06, |
| "loss": 1.1623, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.9883973894126177, |
| "eval_cosine_accuracy": 0.9646650552749634, |
| "eval_loss": 0.4107648432254791, |
| "eval_runtime": 40.8372, |
| "eval_samples_per_second": 232.851, |
| "eval_steps_per_second": 0.931, |
| "step": 11000 |
| }, |
| { |
| "epoch": 4.350978970268311, |
| "grad_norm": 5.718387603759766, |
| "learning_rate": 3.438585889107116e-06, |
| "loss": 1.8788, |
| "step": 12000 |
| }, |
| { |
| "epoch": 4.350978970268311, |
| "eval_cosine_accuracy": 0.9658218622207642, |
| "eval_loss": 0.4195900559425354, |
| "eval_runtime": 39.6046, |
| "eval_samples_per_second": 240.098, |
| "eval_steps_per_second": 0.959, |
| "step": 12000 |
| }, |
| { |
| "epoch": 4.713560551124003, |
| "grad_norm": 7.414130210876465, |
| "learning_rate": 2.683940172231455e-06, |
| "loss": 1.3249, |
| "step": 13000 |
| }, |
| { |
| "epoch": 4.713560551124003, |
| "eval_cosine_accuracy": 0.9666631817817688, |
| "eval_loss": 0.4121050536632538, |
| "eval_runtime": 39.6143, |
| "eval_samples_per_second": 240.04, |
| "eval_steps_per_second": 0.959, |
| "step": 13000 |
| }, |
| { |
| "epoch": 5.0761421319796955, |
| "grad_norm": 4.764642238616943, |
| "learning_rate": 1.9285390542378005e-06, |
| "loss": 1.2635, |
| "step": 14000 |
| }, |
| { |
| "epoch": 5.0761421319796955, |
| "eval_cosine_accuracy": 0.9668734669685364, |
| "eval_loss": 0.4072125554084778, |
| "eval_runtime": 39.4765, |
| "eval_samples_per_second": 240.877, |
| "eval_steps_per_second": 0.963, |
| "step": 14000 |
| }, |
| { |
| "epoch": 5.438723712835388, |
| "grad_norm": 5.749317169189453, |
| "learning_rate": 1.1738933373621394e-06, |
| "loss": 1.7305, |
| "step": 15000 |
| }, |
| { |
| "epoch": 5.438723712835388, |
| "eval_cosine_accuracy": 0.9663476943969727, |
| "eval_loss": 0.4133176803588867, |
| "eval_runtime": 40.307, |
| "eval_samples_per_second": 235.914, |
| "eval_steps_per_second": 0.943, |
| "step": 15000 |
| }, |
| { |
| "epoch": 5.8013052936910805, |
| "grad_norm": 6.543480396270752, |
| "learning_rate": 4.192476204864784e-07, |
| "loss": 1.2114, |
| "step": 16000 |
| }, |
| { |
| "epoch": 5.8013052936910805, |
| "eval_cosine_accuracy": 0.9664528369903564, |
| "eval_loss": 0.4110718071460724, |
| "eval_runtime": 39.1791, |
| "eval_samples_per_second": 242.706, |
| "eval_steps_per_second": 0.97, |
| "step": 16000 |
| } |
| ], |
| "logging_steps": 1000, |
| "max_steps": 16548, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 256, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|