{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.6,
  "eval_steps": 50,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 6.656943321228027,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 1.6339,
      "step": 50
    },
    {
      "epoch": 0.04,
      "eval_loss": 0.9032842516899109,
      "eval_runtime": 1.8285,
      "eval_samples_per_second": 62.347,
      "eval_steps_per_second": 3.281,
      "step": 50
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.039907693862915,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 1.2834,
      "step": 100
    },
    {
      "epoch": 0.08,
      "eval_loss": 0.8288049101829529,
      "eval_runtime": 1.8279,
      "eval_samples_per_second": 62.366,
      "eval_steps_per_second": 3.282,
      "step": 100
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.98620343208313,
      "learning_rate": 3.6e-06,
      "loss": 1.1369,
      "step": 150
    },
    {
      "epoch": 0.12,
      "eval_loss": 0.8595764636993408,
      "eval_runtime": 1.8209,
      "eval_samples_per_second": 62.607,
      "eval_steps_per_second": 3.295,
      "step": 150
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.9106926918029785,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.1002,
      "step": 200
    },
    {
      "epoch": 0.16,
      "eval_loss": 0.8750876784324646,
      "eval_runtime": 1.8177,
      "eval_samples_per_second": 62.716,
      "eval_steps_per_second": 3.301,
      "step": 200
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.5830564498901367,
      "learning_rate": 6e-06,
      "loss": 1.0671,
      "step": 250
    },
    {
      "epoch": 0.2,
      "eval_loss": 0.8878025412559509,
      "eval_runtime": 1.8312,
      "eval_samples_per_second": 62.255,
      "eval_steps_per_second": 3.277,
      "step": 250
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.3819241523742676,
      "learning_rate": 7.2e-06,
      "loss": 1.0332,
      "step": 300
    },
    {
      "epoch": 0.24,
      "eval_loss": 0.8920303583145142,
      "eval_runtime": 1.8258,
      "eval_samples_per_second": 62.439,
      "eval_steps_per_second": 3.286,
      "step": 300
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.588942050933838,
      "learning_rate": 8.400000000000001e-06,
      "loss": 1.0199,
      "step": 350
    },
    {
      "epoch": 0.28,
      "eval_loss": 0.8735177516937256,
      "eval_runtime": 1.8258,
      "eval_samples_per_second": 62.438,
      "eval_steps_per_second": 3.286,
      "step": 350
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.964590311050415,
      "learning_rate": 9.600000000000001e-06,
      "loss": 1.0218,
      "step": 400
    },
    {
      "epoch": 0.32,
      "eval_loss": 0.8841342329978943,
      "eval_runtime": 1.8275,
      "eval_samples_per_second": 62.38,
      "eval_steps_per_second": 3.283,
      "step": 400
    },
    {
      "epoch": 0.36,
      "grad_norm": 4.193049907684326,
      "learning_rate": 1.08e-05,
      "loss": 0.995,
      "step": 450
    },
    {
      "epoch": 0.36,
      "eval_loss": 0.8781883120536804,
      "eval_runtime": 1.8206,
      "eval_samples_per_second": 62.616,
      "eval_steps_per_second": 3.296,
      "step": 450
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.1733250617980957,
      "learning_rate": 1.2e-05,
      "loss": 0.9752,
      "step": 500
    },
    {
      "epoch": 0.4,
      "eval_loss": 0.8925071358680725,
      "eval_runtime": 1.8212,
      "eval_samples_per_second": 62.598,
      "eval_steps_per_second": 3.295,
      "step": 500
    },
    {
      "epoch": 0.44,
      "grad_norm": 4.2617034912109375,
      "learning_rate": 1.32e-05,
      "loss": 1.004,
      "step": 550
    },
    {
      "epoch": 0.44,
      "eval_loss": 0.8857673406600952,
      "eval_runtime": 1.8247,
      "eval_samples_per_second": 62.475,
      "eval_steps_per_second": 3.288,
      "step": 550
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.422171115875244,
      "learning_rate": 1.44e-05,
      "loss": 1.0094,
      "step": 600
    },
    {
      "epoch": 0.48,
      "eval_loss": 0.885766327381134,
      "eval_runtime": 1.8226,
      "eval_samples_per_second": 62.547,
      "eval_steps_per_second": 3.292,
      "step": 600
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.6416265964508057,
      "learning_rate": 1.56e-05,
      "loss": 0.9922,
      "step": 650
    },
    {
      "epoch": 0.52,
      "eval_loss": 0.9024901986122131,
      "eval_runtime": 1.8283,
      "eval_samples_per_second": 62.353,
      "eval_steps_per_second": 3.282,
      "step": 650
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.2401561737060547,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.9848,
      "step": 700
    },
    {
      "epoch": 0.56,
      "eval_loss": 0.8976176977157593,
      "eval_runtime": 1.8213,
      "eval_samples_per_second": 62.593,
      "eval_steps_per_second": 3.294,
      "step": 700
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.61985182762146,
      "learning_rate": 1.8e-05,
      "loss": 0.9916,
      "step": 750
    },
    {
      "epoch": 0.6,
      "eval_loss": 0.873446524143219,
      "eval_runtime": 1.8236,
      "eval_samples_per_second": 62.515,
      "eval_steps_per_second": 3.29,
      "step": 750
    },
    {
      "epoch": 0.64,
      "grad_norm": 3.0905354022979736,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.9961,
      "step": 800
    },
    {
      "epoch": 0.64,
      "eval_loss": 0.8803545236587524,
      "eval_runtime": 1.8244,
      "eval_samples_per_second": 62.488,
      "eval_steps_per_second": 3.289,
      "step": 800
    },
    {
      "epoch": 0.68,
      "grad_norm": 3.7789053916931152,
      "learning_rate": 2.04e-05,
      "loss": 1.02,
      "step": 850
    },
    {
      "epoch": 0.68,
      "eval_loss": 0.8905940651893616,
      "eval_runtime": 1.827,
      "eval_samples_per_second": 62.399,
      "eval_steps_per_second": 3.284,
      "step": 850
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.842257261276245,
      "learning_rate": 2.16e-05,
      "loss": 1.001,
      "step": 900
    },
    {
      "epoch": 0.72,
      "eval_loss": 0.8835523128509521,
      "eval_runtime": 1.8313,
      "eval_samples_per_second": 62.251,
      "eval_steps_per_second": 3.276,
      "step": 900
    },
    {
      "epoch": 0.76,
      "grad_norm": 3.3730099201202393,
      "learning_rate": 2.2800000000000002e-05,
      "loss": 1.015,
      "step": 950
    },
    {
      "epoch": 0.76,
      "eval_loss": 0.953080952167511,
      "eval_runtime": 1.8303,
      "eval_samples_per_second": 62.285,
      "eval_steps_per_second": 3.278,
      "step": 950
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.283881664276123,
      "learning_rate": 2.4e-05,
      "loss": 1.0089,
      "step": 1000
    },
    {
      "epoch": 0.8,
      "eval_loss": 0.9305523037910461,
      "eval_runtime": 1.8222,
      "eval_samples_per_second": 62.56,
      "eval_steps_per_second": 3.293,
      "step": 1000
    },
    {
      "epoch": 0.84,
      "grad_norm": 3.0285823345184326,
      "learning_rate": 2.52e-05,
      "loss": 1.0215,
      "step": 1050
    },
    {
      "epoch": 0.84,
      "eval_loss": 0.9584159255027771,
      "eval_runtime": 1.8216,
      "eval_samples_per_second": 62.584,
      "eval_steps_per_second": 3.294,
      "step": 1050
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.6454532146453857,
      "learning_rate": 2.64e-05,
      "loss": 1.0535,
      "step": 1100
    },
    {
      "epoch": 0.88,
      "eval_loss": 0.9347971677780151,
      "eval_runtime": 1.8265,
      "eval_samples_per_second": 62.414,
      "eval_steps_per_second": 3.285,
      "step": 1100
    },
    {
      "epoch": 0.92,
      "grad_norm": 2.3046398162841797,
      "learning_rate": 2.7600000000000003e-05,
      "loss": 1.036,
      "step": 1150
    },
    {
      "epoch": 0.92,
      "eval_loss": 0.9546946883201599,
      "eval_runtime": 1.8227,
      "eval_samples_per_second": 62.545,
      "eval_steps_per_second": 3.292,
      "step": 1150
    },
    {
      "epoch": 0.96,
      "grad_norm": 3.419645309448242,
      "learning_rate": 2.88e-05,
      "loss": 1.0732,
      "step": 1200
    },
    {
      "epoch": 0.96,
      "eval_loss": 0.9314932823181152,
      "eval_runtime": 1.8207,
      "eval_samples_per_second": 62.613,
      "eval_steps_per_second": 3.295,
      "step": 1200
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.927753210067749,
      "learning_rate": 3e-05,
      "loss": 1.0643,
      "step": 1250
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.9249414801597595,
      "eval_runtime": 1.8261,
      "eval_samples_per_second": 62.427,
      "eval_steps_per_second": 3.286,
      "step": 1250
    },
    {
      "epoch": 1.04,
      "grad_norm": 2.8154520988464355,
      "learning_rate": 2.9998537860139564e-05,
      "loss": 0.6328,
      "step": 1300
    },
    {
      "epoch": 1.04,
      "eval_loss": 1.008348822593689,
      "eval_runtime": 1.8294,
      "eval_samples_per_second": 62.316,
      "eval_steps_per_second": 3.28,
      "step": 1300
    },
    {
      "epoch": 1.08,
      "grad_norm": 6.605989456176758,
      "learning_rate": 2.9994151725605313e-05,
      "loss": 0.708,
      "step": 1350
    },
    {
      "epoch": 1.08,
      "eval_loss": 0.9453077912330627,
      "eval_runtime": 1.8194,
      "eval_samples_per_second": 62.657,
      "eval_steps_per_second": 3.298,
      "step": 1350
    },
    {
      "epoch": 1.12,
      "grad_norm": 2.298051357269287,
      "learning_rate": 2.9986842451482876e-05,
      "loss": 0.6654,
      "step": 1400
    },
    {
      "epoch": 1.12,
      "eval_loss": 1.035285234451294,
      "eval_runtime": 1.8281,
      "eval_samples_per_second": 62.359,
      "eval_steps_per_second": 3.282,
      "step": 1400
    },
    {
      "epoch": 1.16,
      "grad_norm": 3.0220792293548584,
      "learning_rate": 2.9976611462729715e-05,
      "loss": 0.7063,
      "step": 1450
    },
    {
      "epoch": 1.16,
      "eval_loss": 1.0368359088897705,
      "eval_runtime": 1.8316,
      "eval_samples_per_second": 62.24,
      "eval_steps_per_second": 3.276,
      "step": 1450
    },
    {
      "epoch": 1.2,
      "grad_norm": 2.9698407649993896,
      "learning_rate": 2.9963460753897364e-05,
      "loss": 0.6812,
      "step": 1500
    },
    {
      "epoch": 1.2,
      "eval_loss": 1.0127767324447632,
      "eval_runtime": 1.8272,
      "eval_samples_per_second": 62.392,
      "eval_steps_per_second": 3.284,
      "step": 1500
    },
    {
      "epoch": 1.24,
      "grad_norm": 2.5555665493011475,
      "learning_rate": 2.9947392888742566e-05,
      "loss": 0.7029,
      "step": 1550
    },
    {
      "epoch": 1.24,
      "eval_loss": 1.045531153678894,
      "eval_runtime": 1.8258,
      "eval_samples_per_second": 62.438,
      "eval_steps_per_second": 3.286,
      "step": 1550
    },
    {
      "epoch": 1.28,
      "grad_norm": 2.3660807609558105,
      "learning_rate": 2.992841099972747e-05,
      "loss": 0.6799,
      "step": 1600
    },
    {
      "epoch": 1.28,
      "eval_loss": 1.0579793453216553,
      "eval_runtime": 1.8239,
      "eval_samples_per_second": 62.504,
      "eval_steps_per_second": 3.29,
      "step": 1600
    },
    {
      "epoch": 1.32,
      "grad_norm": 2.915484666824341,
      "learning_rate": 2.9906518787408948e-05,
      "loss": 0.7182,
      "step": 1650
    },
    {
      "epoch": 1.32,
      "eval_loss": 1.061601161956787,
      "eval_runtime": 1.8217,
      "eval_samples_per_second": 62.581,
      "eval_steps_per_second": 3.294,
      "step": 1650
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 2.7276549339294434,
      "learning_rate": 2.988172051971717e-05,
      "loss": 0.7168,
      "step": 1700
    },
    {
      "epoch": 1.3599999999999999,
      "eval_loss": 1.0619702339172363,
      "eval_runtime": 1.8419,
      "eval_samples_per_second": 61.891,
      "eval_steps_per_second": 3.257,
      "step": 1700
    },
    {
      "epoch": 1.4,
      "grad_norm": 2.576150894165039,
      "learning_rate": 2.9854021031123555e-05,
      "loss": 0.729,
      "step": 1750
    },
    {
      "epoch": 1.4,
      "eval_loss": 1.0389442443847656,
      "eval_runtime": 1.8255,
      "eval_samples_per_second": 62.447,
      "eval_steps_per_second": 3.287,
      "step": 1750
    },
    {
      "epoch": 1.44,
      "grad_norm": 2.996159553527832,
      "learning_rate": 2.9823425721698293e-05,
      "loss": 0.7319,
      "step": 1800
    },
    {
      "epoch": 1.44,
      "eval_loss": 1.0429059267044067,
      "eval_runtime": 1.8338,
      "eval_samples_per_second": 62.167,
      "eval_steps_per_second": 3.272,
      "step": 1800
    },
    {
      "epoch": 1.48,
      "grad_norm": 3.9739811420440674,
      "learning_rate": 2.9789940556057574e-05,
      "loss": 0.7281,
      "step": 1850
    },
    {
      "epoch": 1.48,
      "eval_loss": 1.0586906671524048,
      "eval_runtime": 1.8323,
      "eval_samples_per_second": 62.218,
      "eval_steps_per_second": 3.275,
      "step": 1850
    },
    {
      "epoch": 1.52,
      "grad_norm": 2.907900810241699,
      "learning_rate": 2.975357206220079e-05,
      "loss": 0.743,
      "step": 1900
    },
    {
      "epoch": 1.52,
      "eval_loss": 1.0349457263946533,
      "eval_runtime": 1.8285,
      "eval_samples_per_second": 62.345,
      "eval_steps_per_second": 3.281,
      "step": 1900
    },
    {
      "epoch": 1.56,
      "grad_norm": 3.018134117126465,
      "learning_rate": 2.9714327330237873e-05,
      "loss": 0.7251,
      "step": 1950
    },
    {
      "epoch": 1.56,
      "eval_loss": 1.0124412775039673,
      "eval_runtime": 1.8281,
      "eval_samples_per_second": 62.359,
      "eval_steps_per_second": 3.282,
      "step": 1950
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.7769956588745117,
      "learning_rate": 2.9672214011007087e-05,
      "loss": 0.7403,
      "step": 2000
    },
    {
      "epoch": 1.6,
      "eval_loss": 1.0399030447006226,
      "eval_runtime": 1.8304,
      "eval_samples_per_second": 62.282,
      "eval_steps_per_second": 3.278,
      "step": 2000
    }
  ],
  "logging_steps": 50,
  "max_steps": 12500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 2000,
  "total_flos": 1.081999679773737e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}