|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 166, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.012048192771084338, |
|
"grad_norm": 5.144875488861497, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 5.2383, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.060240963855421686, |
|
"grad_norm": 4.400640807642775, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 5.1426, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.12048192771084337, |
|
"grad_norm": 2.8424185774279995, |
|
"learning_rate": 0.00019990989662046818, |
|
"loss": 4.9195, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.18072289156626506, |
|
"grad_norm": 2.1742733252180533, |
|
"learning_rate": 0.0001967732946933499, |
|
"loss": 4.5773, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.24096385542168675, |
|
"grad_norm": 2.0811257888315406, |
|
"learning_rate": 0.00018929258581495685, |
|
"loss": 4.3453, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.30120481927710846, |
|
"grad_norm": 1.3196766642254727, |
|
"learning_rate": 0.00017780357543184397, |
|
"loss": 4.1766, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3614457831325301, |
|
"grad_norm": 1.7074769394742433, |
|
"learning_rate": 0.00016282199972956425, |
|
"loss": 4.0355, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.42168674698795183, |
|
"grad_norm": 1.786794986516103, |
|
"learning_rate": 0.00014502037448176734, |
|
"loss": 3.9418, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 1.712555578478064, |
|
"learning_rate": 0.00012519780613851254, |
|
"loss": 3.8637, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5421686746987951, |
|
"grad_norm": 1.1317257705596584, |
|
"learning_rate": 0.00010424412031961484, |
|
"loss": 3.8164, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 0.8808395413039651, |
|
"learning_rate": 8.309991796781511e-05, |
|
"loss": 3.7855, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6626506024096386, |
|
"grad_norm": 0.9070705182297121, |
|
"learning_rate": 6.271435222196916e-05, |
|
"loss": 3.7594, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 0.800430133148163, |
|
"learning_rate": 4.4002521386240466e-05, |
|
"loss": 3.723, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7831325301204819, |
|
"grad_norm": 0.9084524552496485, |
|
"learning_rate": 2.7804390604547557e-05, |
|
"loss": 3.7258, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.8433734939759037, |
|
"grad_norm": 0.7470706061804011, |
|
"learning_rate": 1.4847086226668872e-05, |
|
"loss": 3.6902, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9036144578313253, |
|
"grad_norm": 0.697773024233771, |
|
"learning_rate": 5.71225545389158e-06, |
|
"loss": 3.691, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.70777044124217, |
|
"learning_rate": 8.099564741123166e-07, |
|
"loss": 3.6707, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5120481927710844, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 3.6468, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5421686746987951, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 8.23529411764706e-05, |
|
"loss": 3.6491, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.572289156626506, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0001411764705882353, |
|
"loss": 3.6893, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.0002, |
|
"loss": 3.6404, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6325301204819277, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00019944481853548335, |
|
"loss": 3.6761, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6626506024096386, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00019778543867110426, |
|
"loss": 3.6328, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6927710843373494, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00019504028554572864, |
|
"loss": 3.6478, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00019123984032200586, |
|
"loss": 3.6035, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7530120481927711, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00018642630173483832, |
|
"loss": 3.6189, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7831325301204819, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00018065311753227273, |
|
"loss": 3.6364, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8132530120481928, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00017398439101151905, |
|
"loss": 3.6213, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8433734939759037, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001664941692397025, |
|
"loss": 3.558, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8734939759036144, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00015826562086267956, |
|
"loss": 3.5992, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9036144578313253, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00014939011263122634, |
|
"loss": 3.5675, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9337349397590361, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00013996619489850822, |
|
"loss": 3.5625, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.0001300985073534919, |
|
"loss": 3.5525, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9939759036144579, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00011989661714062999, |
|
"loss": 3.5954, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 166, |
|
"total_flos": 2783887138750464.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 3.0865, |
|
"train_samples_per_second": 3442.1, |
|
"train_steps_per_second": 53.783 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 166, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2783887138750464.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|