|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2, |
|
"eval_steps": 50, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 14.438831329345703, |
|
"learning_rate": 0.00015, |
|
"loss": 1.116, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 0.8588868975639343, |
|
"eval_runtime": 680.2426, |
|
"eval_samples_per_second": 8.601, |
|
"eval_steps_per_second": 1.076, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.0438328981399536, |
|
"learning_rate": 0.0003, |
|
"loss": 0.723, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 0.8119707703590393, |
|
"eval_runtime": 681.2671, |
|
"eval_samples_per_second": 8.588, |
|
"eval_steps_per_second": 1.074, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.8140527009963989, |
|
"learning_rate": 0.0002833333333333333, |
|
"loss": 0.6987, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 0.7736836075782776, |
|
"eval_runtime": 681.2882, |
|
"eval_samples_per_second": 8.588, |
|
"eval_steps_per_second": 1.074, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7538554668426514, |
|
"learning_rate": 0.0002666666666666666, |
|
"loss": 0.6701, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.7494252324104309, |
|
"eval_runtime": 681.1585, |
|
"eval_samples_per_second": 8.59, |
|
"eval_steps_per_second": 1.075, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8082383871078491, |
|
"learning_rate": 0.00025, |
|
"loss": 0.6585, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.7359848022460938, |
|
"eval_runtime": 681.3996, |
|
"eval_samples_per_second": 8.587, |
|
"eval_steps_per_second": 1.074, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7390263080596924, |
|
"learning_rate": 0.0002333333333333333, |
|
"loss": 0.6451, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.7315810322761536, |
|
"eval_runtime": 680.7597, |
|
"eval_samples_per_second": 8.595, |
|
"eval_steps_per_second": 1.075, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.719605565071106, |
|
"learning_rate": 0.00021666666666666666, |
|
"loss": 0.6382, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 0.6773383617401123, |
|
"eval_runtime": 680.8905, |
|
"eval_samples_per_second": 8.593, |
|
"eval_steps_per_second": 1.075, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.8071795701980591, |
|
"learning_rate": 0.00019999999999999998, |
|
"loss": 0.6304, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 0.6790196299552917, |
|
"eval_runtime": 680.4041, |
|
"eval_samples_per_second": 8.599, |
|
"eval_steps_per_second": 1.076, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6562775373458862, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.6236, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 0.667582631111145, |
|
"eval_runtime": 680.9279, |
|
"eval_samples_per_second": 8.593, |
|
"eval_steps_per_second": 1.075, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6858498454093933, |
|
"learning_rate": 0.00016666666666666666, |
|
"loss": 0.6127, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 0.6611046195030212, |
|
"eval_runtime": 681.3448, |
|
"eval_samples_per_second": 8.587, |
|
"eval_steps_per_second": 1.074, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7050228714942932, |
|
"learning_rate": 0.00015, |
|
"loss": 0.6109, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 0.6624142527580261, |
|
"eval_runtime": 680.8132, |
|
"eval_samples_per_second": 8.594, |
|
"eval_steps_per_second": 1.075, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7099040746688843, |
|
"learning_rate": 0.0001333333333333333, |
|
"loss": 0.6074, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 0.6503908634185791, |
|
"eval_runtime": 680.9458, |
|
"eval_samples_per_second": 8.592, |
|
"eval_steps_per_second": 1.075, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.7423191070556641, |
|
"learning_rate": 0.00011666666666666665, |
|
"loss": 0.5972, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 0.6401746273040771, |
|
"eval_runtime": 680.6123, |
|
"eval_samples_per_second": 8.597, |
|
"eval_steps_per_second": 1.076, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6712120175361633, |
|
"learning_rate": 9.999999999999999e-05, |
|
"loss": 0.5912, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 0.6332426071166992, |
|
"eval_runtime": 680.7596, |
|
"eval_samples_per_second": 8.595, |
|
"eval_steps_per_second": 1.075, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7194417715072632, |
|
"learning_rate": 8.333333333333333e-05, |
|
"loss": 0.5934, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 0.6242749094963074, |
|
"eval_runtime": 679.731, |
|
"eval_samples_per_second": 8.608, |
|
"eval_steps_per_second": 1.077, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7845382690429688, |
|
"learning_rate": 6.666666666666666e-05, |
|
"loss": 0.5908, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.6116130352020264, |
|
"eval_runtime": 681.1204, |
|
"eval_samples_per_second": 8.59, |
|
"eval_steps_per_second": 1.075, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7342799305915833, |
|
"learning_rate": 4.9999999999999996e-05, |
|
"loss": 0.5824, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 0.6023569703102112, |
|
"eval_runtime": 680.663, |
|
"eval_samples_per_second": 8.596, |
|
"eval_steps_per_second": 1.075, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6745529174804688, |
|
"learning_rate": 3.333333333333333e-05, |
|
"loss": 0.5823, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 0.5980194211006165, |
|
"eval_runtime": 680.2295, |
|
"eval_samples_per_second": 8.602, |
|
"eval_steps_per_second": 1.076, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7388492226600647, |
|
"learning_rate": 1.6666666666666664e-05, |
|
"loss": 0.5876, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 0.5954298973083496, |
|
"eval_runtime": 680.8083, |
|
"eval_samples_per_second": 8.594, |
|
"eval_steps_per_second": 1.075, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.8207566738128662, |
|
"learning_rate": 0.0, |
|
"loss": 0.5748, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.5945320725440979, |
|
"eval_runtime": 680.8349, |
|
"eval_samples_per_second": 8.594, |
|
"eval_steps_per_second": 1.075, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.187924717080576e+16, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|