|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.001000500250125, |
|
"eval_steps": 25, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05002501250625312, |
|
"grad_norm": 0.517996072769165, |
|
"learning_rate": 0.0001951951951951952, |
|
"loss": 1.677, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05002501250625312, |
|
"eval_loss": 1.3813503980636597, |
|
"eval_runtime": 148.0614, |
|
"eval_samples_per_second": 3.37, |
|
"eval_steps_per_second": 0.425, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10005002501250625, |
|
"grad_norm": 0.5020231604576111, |
|
"learning_rate": 0.0001901901901901902, |
|
"loss": 1.2016, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10005002501250625, |
|
"eval_loss": 1.347744107246399, |
|
"eval_runtime": 151.7258, |
|
"eval_samples_per_second": 3.289, |
|
"eval_steps_per_second": 0.415, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1500750375187594, |
|
"grad_norm": 0.3798060119152069, |
|
"learning_rate": 0.0001851851851851852, |
|
"loss": 1.4491, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1500750375187594, |
|
"eval_loss": 1.3210723400115967, |
|
"eval_runtime": 150.0032, |
|
"eval_samples_per_second": 3.327, |
|
"eval_steps_per_second": 0.42, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2001000500250125, |
|
"grad_norm": 0.3365944027900696, |
|
"learning_rate": 0.00018018018018018018, |
|
"loss": 1.2076, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2001000500250125, |
|
"eval_loss": 1.3334178924560547, |
|
"eval_runtime": 151.2551, |
|
"eval_samples_per_second": 3.299, |
|
"eval_steps_per_second": 0.417, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25012506253126565, |
|
"grad_norm": 0.22820694744586945, |
|
"learning_rate": 0.0001751751751751752, |
|
"loss": 1.4415, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.25012506253126565, |
|
"eval_loss": 1.309592366218567, |
|
"eval_runtime": 149.299, |
|
"eval_samples_per_second": 3.342, |
|
"eval_steps_per_second": 0.422, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3001500750375188, |
|
"grad_norm": 0.3848935663700104, |
|
"learning_rate": 0.0001701701701701702, |
|
"loss": 1.139, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3001500750375188, |
|
"eval_loss": 1.3208202123641968, |
|
"eval_runtime": 149.5811, |
|
"eval_samples_per_second": 3.336, |
|
"eval_steps_per_second": 0.421, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3501750875437719, |
|
"grad_norm": 0.2774136960506439, |
|
"learning_rate": 0.00016516516516516518, |
|
"loss": 1.4055, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3501750875437719, |
|
"eval_loss": 1.3086917400360107, |
|
"eval_runtime": 150.1042, |
|
"eval_samples_per_second": 3.324, |
|
"eval_steps_per_second": 0.42, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.400200100050025, |
|
"grad_norm": 0.32166117429733276, |
|
"learning_rate": 0.00016016016016016018, |
|
"loss": 1.1459, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.400200100050025, |
|
"eval_loss": 1.306862473487854, |
|
"eval_runtime": 150.7168, |
|
"eval_samples_per_second": 3.311, |
|
"eval_steps_per_second": 0.418, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4502251125562781, |
|
"grad_norm": 0.23773141205310822, |
|
"learning_rate": 0.00015515515515515516, |
|
"loss": 1.4444, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.4502251125562781, |
|
"eval_loss": 1.3020325899124146, |
|
"eval_runtime": 148.5364, |
|
"eval_samples_per_second": 3.359, |
|
"eval_steps_per_second": 0.424, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5002501250625313, |
|
"grad_norm": 0.37095341086387634, |
|
"learning_rate": 0.00015015015015015014, |
|
"loss": 1.2264, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5002501250625313, |
|
"eval_loss": 1.3001904487609863, |
|
"eval_runtime": 152.658, |
|
"eval_samples_per_second": 3.269, |
|
"eval_steps_per_second": 0.413, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5502751375687844, |
|
"grad_norm": 0.2519828677177429, |
|
"learning_rate": 0.00014514514514514515, |
|
"loss": 1.4605, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5502751375687844, |
|
"eval_loss": 1.299567699432373, |
|
"eval_runtime": 148.4653, |
|
"eval_samples_per_second": 3.361, |
|
"eval_steps_per_second": 0.424, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6003001500750376, |
|
"grad_norm": 0.3685779273509979, |
|
"learning_rate": 0.00014014014014014013, |
|
"loss": 1.1655, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6003001500750376, |
|
"eval_loss": 1.2988265752792358, |
|
"eval_runtime": 151.1788, |
|
"eval_samples_per_second": 3.301, |
|
"eval_steps_per_second": 0.417, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6503251625812907, |
|
"grad_norm": 0.26966241002082825, |
|
"learning_rate": 0.00013513513513513514, |
|
"loss": 1.4313, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6503251625812907, |
|
"eval_loss": 1.298296332359314, |
|
"eval_runtime": 152.0718, |
|
"eval_samples_per_second": 3.281, |
|
"eval_steps_per_second": 0.414, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.7003501750875438, |
|
"grad_norm": 0.35637611150741577, |
|
"learning_rate": 0.00013013013013013014, |
|
"loss": 1.2002, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7003501750875438, |
|
"eval_loss": 1.2959158420562744, |
|
"eval_runtime": 151.1585, |
|
"eval_samples_per_second": 3.301, |
|
"eval_steps_per_second": 0.417, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7503751875937968, |
|
"grad_norm": 0.22513383626937866, |
|
"learning_rate": 0.00012512512512512512, |
|
"loss": 1.3994, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.7503751875937968, |
|
"eval_loss": 1.2951635122299194, |
|
"eval_runtime": 148.5372, |
|
"eval_samples_per_second": 3.359, |
|
"eval_steps_per_second": 0.424, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.80040020010005, |
|
"grad_norm": 0.35314086079597473, |
|
"learning_rate": 0.00012012012012012013, |
|
"loss": 1.1836, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.80040020010005, |
|
"eval_loss": 1.294690728187561, |
|
"eval_runtime": 149.3769, |
|
"eval_samples_per_second": 3.341, |
|
"eval_steps_per_second": 0.422, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8504252126063031, |
|
"grad_norm": 0.240916907787323, |
|
"learning_rate": 0.00011511511511511512, |
|
"loss": 1.4378, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.8504252126063031, |
|
"eval_loss": 1.2916043996810913, |
|
"eval_runtime": 152.0772, |
|
"eval_samples_per_second": 3.281, |
|
"eval_steps_per_second": 0.414, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9004502251125562, |
|
"grad_norm": 0.31087398529052734, |
|
"learning_rate": 0.00011011011011011012, |
|
"loss": 1.1989, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9004502251125562, |
|
"eval_loss": 1.2893831729888916, |
|
"eval_runtime": 150.4895, |
|
"eval_samples_per_second": 3.316, |
|
"eval_steps_per_second": 0.419, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9504752376188094, |
|
"grad_norm": 0.2413586527109146, |
|
"learning_rate": 0.00010510510510510511, |
|
"loss": 1.4508, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.9504752376188094, |
|
"eval_loss": 1.2888984680175781, |
|
"eval_runtime": 151.1108, |
|
"eval_samples_per_second": 3.302, |
|
"eval_steps_per_second": 0.417, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.0005002501250626, |
|
"grad_norm": 0.40069064497947693, |
|
"learning_rate": 0.00010010010010010012, |
|
"loss": 1.2076, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0005002501250626, |
|
"eval_loss": 1.2911962270736694, |
|
"eval_runtime": 148.6843, |
|
"eval_samples_per_second": 3.356, |
|
"eval_steps_per_second": 0.424, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0505252626313157, |
|
"grad_norm": 0.22050493955612183, |
|
"learning_rate": 9.50950950950951e-05, |
|
"loss": 1.3994, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.0505252626313157, |
|
"eval_loss": 1.2921332120895386, |
|
"eval_runtime": 149.3015, |
|
"eval_samples_per_second": 3.342, |
|
"eval_steps_per_second": 0.422, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.1005502751375689, |
|
"grad_norm": 0.3588818907737732, |
|
"learning_rate": 9.009009009009009e-05, |
|
"loss": 1.177, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.1005502751375689, |
|
"eval_loss": 1.2903811931610107, |
|
"eval_runtime": 149.8093, |
|
"eval_samples_per_second": 3.331, |
|
"eval_steps_per_second": 0.421, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.150575287643822, |
|
"grad_norm": 0.2672303020954132, |
|
"learning_rate": 8.50850850850851e-05, |
|
"loss": 1.4015, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.150575287643822, |
|
"eval_loss": 1.2898900508880615, |
|
"eval_runtime": 149.8311, |
|
"eval_samples_per_second": 3.33, |
|
"eval_steps_per_second": 0.42, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.2006003001500751, |
|
"grad_norm": 0.31220486760139465, |
|
"learning_rate": 8.008008008008009e-05, |
|
"loss": 1.192, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2006003001500751, |
|
"eval_loss": 1.288824439048767, |
|
"eval_runtime": 151.038, |
|
"eval_samples_per_second": 3.304, |
|
"eval_steps_per_second": 0.417, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2506253126563283, |
|
"grad_norm": 0.2526504695415497, |
|
"learning_rate": 7.507507507507507e-05, |
|
"loss": 1.3829, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.2506253126563283, |
|
"eval_loss": 1.2878332138061523, |
|
"eval_runtime": 151.5015, |
|
"eval_samples_per_second": 3.294, |
|
"eval_steps_per_second": 0.416, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.3006503251625814, |
|
"grad_norm": 0.28051283955574036, |
|
"learning_rate": 7.007007007007007e-05, |
|
"loss": 1.1514, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.3006503251625814, |
|
"eval_loss": 1.2859280109405518, |
|
"eval_runtime": 150.4738, |
|
"eval_samples_per_second": 3.316, |
|
"eval_steps_per_second": 0.419, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.3506753376688345, |
|
"grad_norm": 0.26419979333877563, |
|
"learning_rate": 6.506506506506507e-05, |
|
"loss": 1.4028, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.3506753376688345, |
|
"eval_loss": 1.2848296165466309, |
|
"eval_runtime": 149.0963, |
|
"eval_samples_per_second": 3.347, |
|
"eval_steps_per_second": 0.423, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.4007003501750876, |
|
"grad_norm": 0.3227976858615875, |
|
"learning_rate": 6.0060060060060066e-05, |
|
"loss": 1.1778, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4007003501750876, |
|
"eval_loss": 1.285400152206421, |
|
"eval_runtime": 149.1519, |
|
"eval_samples_per_second": 3.346, |
|
"eval_steps_per_second": 0.422, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4507253626813408, |
|
"grad_norm": 0.24903441965579987, |
|
"learning_rate": 5.505505505505506e-05, |
|
"loss": 1.4058, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.4507253626813408, |
|
"eval_loss": 1.2824435234069824, |
|
"eval_runtime": 149.5232, |
|
"eval_samples_per_second": 3.337, |
|
"eval_steps_per_second": 0.421, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.500750375187594, |
|
"grad_norm": 0.31187903881073, |
|
"learning_rate": 5.005005005005006e-05, |
|
"loss": 1.1698, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.500750375187594, |
|
"eval_loss": 1.2831988334655762, |
|
"eval_runtime": 150.4227, |
|
"eval_samples_per_second": 3.317, |
|
"eval_steps_per_second": 0.419, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.550775387693847, |
|
"grad_norm": 0.2889004051685333, |
|
"learning_rate": 4.5045045045045046e-05, |
|
"loss": 1.3516, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.550775387693847, |
|
"eval_loss": 1.2823545932769775, |
|
"eval_runtime": 149.8614, |
|
"eval_samples_per_second": 3.33, |
|
"eval_steps_per_second": 0.42, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.6008004002001002, |
|
"grad_norm": 0.37189939618110657, |
|
"learning_rate": 4.0040040040040046e-05, |
|
"loss": 1.1264, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.6008004002001002, |
|
"eval_loss": 1.2828818559646606, |
|
"eval_runtime": 150.672, |
|
"eval_samples_per_second": 3.312, |
|
"eval_steps_per_second": 0.418, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.6508254127063533, |
|
"grad_norm": 0.25290611386299133, |
|
"learning_rate": 3.503503503503503e-05, |
|
"loss": 1.4113, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.6508254127063533, |
|
"eval_loss": 1.2822470664978027, |
|
"eval_runtime": 149.3988, |
|
"eval_samples_per_second": 3.34, |
|
"eval_steps_per_second": 0.422, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.7008504252126064, |
|
"grad_norm": 0.3559873104095459, |
|
"learning_rate": 3.0030030030030033e-05, |
|
"loss": 1.1248, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7008504252126064, |
|
"eval_loss": 1.2828270196914673, |
|
"eval_runtime": 149.9897, |
|
"eval_samples_per_second": 3.327, |
|
"eval_steps_per_second": 0.42, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7508754377188596, |
|
"grad_norm": 0.3052867352962494, |
|
"learning_rate": 2.502502502502503e-05, |
|
"loss": 1.336, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.7508754377188596, |
|
"eval_loss": 1.282852053642273, |
|
"eval_runtime": 151.397, |
|
"eval_samples_per_second": 3.296, |
|
"eval_steps_per_second": 0.416, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.8009004502251127, |
|
"grad_norm": 0.33662667870521545, |
|
"learning_rate": 2.0020020020020023e-05, |
|
"loss": 1.0725, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8009004502251127, |
|
"eval_loss": 1.2822794914245605, |
|
"eval_runtime": 150.7632, |
|
"eval_samples_per_second": 3.31, |
|
"eval_steps_per_second": 0.418, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8509254627313658, |
|
"grad_norm": 0.29956212639808655, |
|
"learning_rate": 1.5015015015015016e-05, |
|
"loss": 1.3989, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.8509254627313658, |
|
"eval_loss": 1.2824186086654663, |
|
"eval_runtime": 150.6938, |
|
"eval_samples_per_second": 3.311, |
|
"eval_steps_per_second": 0.418, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.900950475237619, |
|
"grad_norm": 0.3255136013031006, |
|
"learning_rate": 1.0010010010010011e-05, |
|
"loss": 1.112, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.900950475237619, |
|
"eval_loss": 1.28144371509552, |
|
"eval_runtime": 149.8969, |
|
"eval_samples_per_second": 3.329, |
|
"eval_steps_per_second": 0.42, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.950975487743872, |
|
"grad_norm": 0.2689700424671173, |
|
"learning_rate": 5.005005005005006e-06, |
|
"loss": 1.3972, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.950975487743872, |
|
"eval_loss": 1.280760645866394, |
|
"eval_runtime": 149.8977, |
|
"eval_samples_per_second": 3.329, |
|
"eval_steps_per_second": 0.42, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.001000500250125, |
|
"grad_norm": 0.3633726239204407, |
|
"learning_rate": 0.0, |
|
"loss": 1.1746, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.001000500250125, |
|
"eval_loss": 1.2818013429641724, |
|
"eval_runtime": 149.8121, |
|
"eval_samples_per_second": 3.331, |
|
"eval_steps_per_second": 0.421, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.844485620424704e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|