|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 2801, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03570153516601214, |
|
"grad_norm": 0.2294982522726059, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.4571, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07140307033202428, |
|
"grad_norm": 0.17154090106487274, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.4541, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10710460549803641, |
|
"grad_norm": 0.19298459589481354, |
|
"learning_rate": 6e-06, |
|
"loss": 2.4134, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14280614066404856, |
|
"grad_norm": 0.32980746030807495, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.3925, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1785076758300607, |
|
"grad_norm": 0.4573824107646942, |
|
"learning_rate": 1e-05, |
|
"loss": 2.309, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.21420921099607282, |
|
"grad_norm": 0.438753604888916, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.3336, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.24991074616208497, |
|
"grad_norm": 0.5054428577423096, |
|
"learning_rate": 1.4e-05, |
|
"loss": 2.2911, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.28561228132809713, |
|
"grad_norm": 0.5664726495742798, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.234, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.32131381649410923, |
|
"grad_norm": 0.6149884462356567, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.2339, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3570153516601214, |
|
"grad_norm": 0.7202581763267517, |
|
"learning_rate": 2e-05, |
|
"loss": 2.198, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.39271688682613354, |
|
"grad_norm": 0.6320284008979797, |
|
"learning_rate": 1.98482457644138e-05, |
|
"loss": 2.1665, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.42841842199214564, |
|
"grad_norm": 0.6476149559020996, |
|
"learning_rate": 1.9397588927258876e-05, |
|
"loss": 2.2035, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4641199571581578, |
|
"grad_norm": 0.6560292840003967, |
|
"learning_rate": 1.8661707305302052e-05, |
|
"loss": 2.1686, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.49982149232416995, |
|
"grad_norm": 0.6389040350914001, |
|
"learning_rate": 1.7662935529147725e-05, |
|
"loss": 2.1021, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5355230274901821, |
|
"grad_norm": 0.8602333068847656, |
|
"learning_rate": 1.643158716827897e-05, |
|
"loss": 2.1302, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5712245626561943, |
|
"grad_norm": 0.764371395111084, |
|
"learning_rate": 1.500503468854458e-05, |
|
"loss": 2.147, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6069260978222063, |
|
"grad_norm": 0.809414267539978, |
|
"learning_rate": 1.342657516616169e-05, |
|
"loss": 2.1208, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6426276329882185, |
|
"grad_norm": 0.8461398482322693, |
|
"learning_rate": 1.1744116184774898e-05, |
|
"loss": 2.067, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6783291681542306, |
|
"grad_norm": 0.7336994409561157, |
|
"learning_rate": 1.00087217997093e-05, |
|
"loss": 2.1056, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7140307033202428, |
|
"grad_norm": 0.8887193202972412, |
|
"learning_rate": 8.273062700634137e-06, |
|
"loss": 2.0957, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7497322384862549, |
|
"grad_norm": 0.7335702776908875, |
|
"learning_rate": 6.589817611513086e-06, |
|
"loss": 2.0847, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7854337736522671, |
|
"grad_norm": 0.7930261492729187, |
|
"learning_rate": 5.010074446706905e-06, |
|
"loss": 2.1111, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8211353088182792, |
|
"grad_norm": 0.754761278629303, |
|
"learning_rate": 3.5817797494951313e-06, |
|
"loss": 2.1029, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8568368439842913, |
|
"grad_norm": 0.7991265058517456, |
|
"learning_rate": 2.3482834738712023e-06, |
|
"loss": 2.0618, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8925383791503034, |
|
"grad_norm": 1.1803473234176636, |
|
"learning_rate": 1.347023276716265e-06, |
|
"loss": 2.0831, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9282399143163156, |
|
"grad_norm": 0.8793602585792542, |
|
"learning_rate": 6.083882531987495e-07, |
|
"loss": 2.1029, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.9639414494823277, |
|
"grad_norm": 1.1215691566467285, |
|
"learning_rate": 1.5479660199286927e-07, |
|
"loss": 2.0779, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9996429846483399, |
|
"grad_norm": 0.8478527665138245, |
|
"learning_rate": 1.5213958033388766e-11, |
|
"loss": 2.0917, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2801, |
|
"total_flos": 5.10776634114048e+16, |
|
"train_loss": 2.1906768205718286, |
|
"train_runtime": 755.3629, |
|
"train_samples_per_second": 7.416, |
|
"train_steps_per_second": 3.708 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 2801, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.10776634114048e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|