|
{ |
|
"best_metric": 3.510607957839966, |
|
"best_model_checkpoint": "./Phi-2_PT_QA_1/checkpoint-200", |
|
"epoch": 0.0022854530910753055, |
|
"eval_steps": 1, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.1427265455376528e-05, |
|
"grad_norm": 0.7554183006286621, |
|
"learning_rate": 2.5e-05, |
|
"loss": 7.6787, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 1.1427265455376528e-05, |
|
"eval_loss": 7.217936038970947, |
|
"eval_runtime": 3184.807, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 2.2854530910753055e-05, |
|
"grad_norm": 0.809639573097229, |
|
"learning_rate": 2.4874371859296484e-05, |
|
"loss": 7.5645, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 2.2854530910753055e-05, |
|
"eval_loss": 7.216185569763184, |
|
"eval_runtime": 3195.682, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 3.428179636612959e-05, |
|
"grad_norm": 0.5419557094573975, |
|
"learning_rate": 2.4748743718592964e-05, |
|
"loss": 5.6844, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 3.428179636612959e-05, |
|
"eval_loss": 7.211589336395264, |
|
"eval_runtime": 3196.3014, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 4.570906182150611e-05, |
|
"grad_norm": 0.6964669823646545, |
|
"learning_rate": 2.462311557788945e-05, |
|
"loss": 6.8741, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 4.570906182150611e-05, |
|
"eval_loss": 7.204798221588135, |
|
"eval_runtime": 3194.9301, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 5.713632727688264e-05, |
|
"grad_norm": 0.670452356338501, |
|
"learning_rate": 2.449748743718593e-05, |
|
"loss": 6.6605, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 5.713632727688264e-05, |
|
"eval_loss": 7.197990894317627, |
|
"eval_runtime": 3194.2197, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 6.856359273225918e-05, |
|
"grad_norm": 0.8575164675712585, |
|
"learning_rate": 2.4371859296482413e-05, |
|
"loss": 8.0797, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 6.856359273225918e-05, |
|
"eval_loss": 7.1901774406433105, |
|
"eval_runtime": 3196.5055, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 7.99908581876357e-05, |
|
"grad_norm": 0.46564584970474243, |
|
"learning_rate": 2.4246231155778896e-05, |
|
"loss": 4.6497, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 7.99908581876357e-05, |
|
"eval_loss": 7.182745933532715, |
|
"eval_runtime": 3195.2045, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 9.141812364301222e-05, |
|
"grad_norm": 0.8245170712471008, |
|
"learning_rate": 2.4120603015075376e-05, |
|
"loss": 7.1603, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 9.141812364301222e-05, |
|
"eval_loss": 7.173906326293945, |
|
"eval_runtime": 3196.5564, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00010284538909838876, |
|
"grad_norm": 0.7266675233840942, |
|
"learning_rate": 2.3994974874371863e-05, |
|
"loss": 5.9704, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00010284538909838876, |
|
"eval_loss": 7.165711879730225, |
|
"eval_runtime": 3195.6693, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00011427265455376528, |
|
"grad_norm": 0.8064714670181274, |
|
"learning_rate": 2.3869346733668342e-05, |
|
"loss": 6.5571, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00011427265455376528, |
|
"eval_loss": 7.157066822052002, |
|
"eval_runtime": 3196.6621, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00012569992000914182, |
|
"grad_norm": 1.181254267692566, |
|
"learning_rate": 2.3743718592964825e-05, |
|
"loss": 8.2521, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00012569992000914182, |
|
"eval_loss": 7.1473798751831055, |
|
"eval_runtime": 3195.7223, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00013712718546451835, |
|
"grad_norm": 0.8531734347343445, |
|
"learning_rate": 2.361809045226131e-05, |
|
"loss": 6.5084, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00013712718546451835, |
|
"eval_loss": 7.1375651359558105, |
|
"eval_runtime": 3197.1154, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00014855445091989486, |
|
"grad_norm": 0.8652932643890381, |
|
"learning_rate": 2.3492462311557788e-05, |
|
"loss": 6.1662, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.00014855445091989486, |
|
"eval_loss": 7.127073287963867, |
|
"eval_runtime": 3199.1778, |
|
"eval_samples_per_second": 1.719, |
|
"eval_steps_per_second": 1.719, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0001599817163752714, |
|
"grad_norm": 1.108816146850586, |
|
"learning_rate": 2.3366834170854275e-05, |
|
"loss": 7.9397, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0001599817163752714, |
|
"eval_loss": 7.116166114807129, |
|
"eval_runtime": 3199.5106, |
|
"eval_samples_per_second": 1.719, |
|
"eval_steps_per_second": 1.719, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.00017140898183064793, |
|
"grad_norm": 1.090276837348938, |
|
"learning_rate": 2.3241206030150754e-05, |
|
"loss": 7.066, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00017140898183064793, |
|
"eval_loss": 7.104613780975342, |
|
"eval_runtime": 3197.1218, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00018283624728602444, |
|
"grad_norm": 1.5131642818450928, |
|
"learning_rate": 2.3115577889447238e-05, |
|
"loss": 8.6329, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00018283624728602444, |
|
"eval_loss": 7.092124938964844, |
|
"eval_runtime": 3198.3437, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00019426351274140098, |
|
"grad_norm": 1.1908084154129028, |
|
"learning_rate": 2.298994974874372e-05, |
|
"loss": 7.2422, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.00019426351274140098, |
|
"eval_loss": 7.078921794891357, |
|
"eval_runtime": 3196.3444, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.00020569077819677752, |
|
"grad_norm": 1.157004714012146, |
|
"learning_rate": 2.28643216080402e-05, |
|
"loss": 6.7972, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.00020569077819677752, |
|
"eval_loss": 7.065500736236572, |
|
"eval_runtime": 3196.895, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.00021711804365215405, |
|
"grad_norm": 1.2554625272750854, |
|
"learning_rate": 2.2738693467336687e-05, |
|
"loss": 6.9076, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.00021711804365215405, |
|
"eval_loss": 7.051235675811768, |
|
"eval_runtime": 3197.6043, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.00022854530910753056, |
|
"grad_norm": 1.1883549690246582, |
|
"learning_rate": 2.2613065326633167e-05, |
|
"loss": 6.5687, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00022854530910753056, |
|
"eval_loss": 7.036637306213379, |
|
"eval_runtime": 3196.0972, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0002399725745629071, |
|
"grad_norm": 1.328796148300171, |
|
"learning_rate": 2.248743718592965e-05, |
|
"loss": 6.7253, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0002399725745629071, |
|
"eval_loss": 7.021302700042725, |
|
"eval_runtime": 3197.9644, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.00025139984001828363, |
|
"grad_norm": 1.2712606191635132, |
|
"learning_rate": 2.2361809045226133e-05, |
|
"loss": 6.1315, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.00025139984001828363, |
|
"eval_loss": 7.005300521850586, |
|
"eval_runtime": 3198.4581, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.00026282710547366017, |
|
"grad_norm": 1.5367112159729004, |
|
"learning_rate": 2.2236180904522613e-05, |
|
"loss": 7.0915, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.00026282710547366017, |
|
"eval_loss": 6.98922872543335, |
|
"eval_runtime": 3194.9885, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0002742543709290367, |
|
"grad_norm": 1.4399423599243164, |
|
"learning_rate": 2.21105527638191e-05, |
|
"loss": 6.5237, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0002742543709290367, |
|
"eval_loss": 6.972210884094238, |
|
"eval_runtime": 3194.7666, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0002856816363844132, |
|
"grad_norm": 0.9845247864723206, |
|
"learning_rate": 2.198492462311558e-05, |
|
"loss": 5.0536, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0002856816363844132, |
|
"eval_loss": 6.955036163330078, |
|
"eval_runtime": 3197.7715, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0002971089018397897, |
|
"grad_norm": 1.1501060724258423, |
|
"learning_rate": 2.1859296482412062e-05, |
|
"loss": 5.3398, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0002971089018397897, |
|
"eval_loss": 6.937135696411133, |
|
"eval_runtime": 3193.8534, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.00030853616729516626, |
|
"grad_norm": 0.6898478865623474, |
|
"learning_rate": 2.1733668341708545e-05, |
|
"loss": 4.2246, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00030853616729516626, |
|
"eval_loss": 6.919322490692139, |
|
"eval_runtime": 3194.157, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0003199634327505428, |
|
"grad_norm": 1.9928979873657227, |
|
"learning_rate": 2.1608040201005025e-05, |
|
"loss": 8.0199, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0003199634327505428, |
|
"eval_loss": 6.901232719421387, |
|
"eval_runtime": 3194.869, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.00033139069820591933, |
|
"grad_norm": 1.6013647317886353, |
|
"learning_rate": 2.1482412060301508e-05, |
|
"loss": 6.4213, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.00033139069820591933, |
|
"eval_loss": 6.882281303405762, |
|
"eval_runtime": 3196.4271, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.00034281796366129587, |
|
"grad_norm": 0.6780699491500854, |
|
"learning_rate": 2.135678391959799e-05, |
|
"loss": 4.0142, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00034281796366129587, |
|
"eval_loss": 6.86350154876709, |
|
"eval_runtime": 3195.3922, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0003542452291166724, |
|
"grad_norm": 1.243430256843567, |
|
"learning_rate": 2.1231155778894474e-05, |
|
"loss": 5.1075, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0003542452291166724, |
|
"eval_loss": 6.844424724578857, |
|
"eval_runtime": 3194.7916, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0003656724945720489, |
|
"grad_norm": 1.993019700050354, |
|
"learning_rate": 2.1105527638190957e-05, |
|
"loss": 6.6294, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0003656724945720489, |
|
"eval_loss": 6.824941635131836, |
|
"eval_runtime": 3194.2906, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0003770997600274254, |
|
"grad_norm": 2.094905138015747, |
|
"learning_rate": 2.0979899497487437e-05, |
|
"loss": 7.1042, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0003770997600274254, |
|
"eval_loss": 6.804728031158447, |
|
"eval_runtime": 3194.5706, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.00038852702548280196, |
|
"grad_norm": 1.7997703552246094, |
|
"learning_rate": 2.085427135678392e-05, |
|
"loss": 6.6241, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.00038852702548280196, |
|
"eval_loss": 6.784134864807129, |
|
"eval_runtime": 3194.7504, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0003999542909381785, |
|
"grad_norm": 1.6641287803649902, |
|
"learning_rate": 2.0728643216080403e-05, |
|
"loss": 5.6431, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0003999542909381785, |
|
"eval_loss": 6.762852191925049, |
|
"eval_runtime": 3195.8642, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.00041138155639355503, |
|
"grad_norm": 1.9099078178405762, |
|
"learning_rate": 2.0603015075376886e-05, |
|
"loss": 6.4051, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.00041138155639355503, |
|
"eval_loss": 6.741316795349121, |
|
"eval_runtime": 3192.3732, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.00042280882184893157, |
|
"grad_norm": 1.958937406539917, |
|
"learning_rate": 2.047738693467337e-05, |
|
"loss": 6.8499, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.00042280882184893157, |
|
"eval_loss": 6.719349384307861, |
|
"eval_runtime": 3195.1382, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0004342360873043081, |
|
"grad_norm": 1.903361201286316, |
|
"learning_rate": 2.035175879396985e-05, |
|
"loss": 6.1026, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0004342360873043081, |
|
"eval_loss": 6.697410583496094, |
|
"eval_runtime": 3195.9588, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0004456633527596846, |
|
"grad_norm": 2.3142635822296143, |
|
"learning_rate": 2.0226130653266332e-05, |
|
"loss": 6.834, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0004456633527596846, |
|
"eval_loss": 6.674659252166748, |
|
"eval_runtime": 3194.6685, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0004570906182150611, |
|
"grad_norm": 2.355628252029419, |
|
"learning_rate": 2.0100502512562815e-05, |
|
"loss": 7.05, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0004570906182150611, |
|
"eval_loss": 6.651575088500977, |
|
"eval_runtime": 3192.9283, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00046851788367043766, |
|
"grad_norm": 2.478034496307373, |
|
"learning_rate": 1.9974874371859298e-05, |
|
"loss": 7.203, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.00046851788367043766, |
|
"eval_loss": 6.6284260749816895, |
|
"eval_runtime": 3195.3222, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0004799451491258142, |
|
"grad_norm": 1.591928482055664, |
|
"learning_rate": 1.984924623115578e-05, |
|
"loss": 4.5537, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0004799451491258142, |
|
"eval_loss": 6.604694366455078, |
|
"eval_runtime": 3196.8557, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0004913724145811907, |
|
"grad_norm": 2.615412712097168, |
|
"learning_rate": 1.972361809045226e-05, |
|
"loss": 6.8927, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0004913724145811907, |
|
"eval_loss": 6.580879211425781, |
|
"eval_runtime": 3197.1663, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0005027996800365673, |
|
"grad_norm": 2.0230987071990967, |
|
"learning_rate": 1.9597989949748744e-05, |
|
"loss": 6.0638, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0005027996800365673, |
|
"eval_loss": 6.556508541107178, |
|
"eval_runtime": 3196.2452, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0005142269454919438, |
|
"grad_norm": 2.6316287517547607, |
|
"learning_rate": 1.9472361809045227e-05, |
|
"loss": 6.8353, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0005142269454919438, |
|
"eval_loss": 6.532052516937256, |
|
"eval_runtime": 3195.9887, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0005256542109473203, |
|
"grad_norm": 1.5371630191802979, |
|
"learning_rate": 1.934673366834171e-05, |
|
"loss": 4.5244, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0005256542109473203, |
|
"eval_loss": 6.507321834564209, |
|
"eval_runtime": 3196.0239, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0005370814764026968, |
|
"grad_norm": 1.7422285079956055, |
|
"learning_rate": 1.9221105527638193e-05, |
|
"loss": 4.9649, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0005370814764026968, |
|
"eval_loss": 6.482240200042725, |
|
"eval_runtime": 3195.5121, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0005485087418580734, |
|
"grad_norm": 1.1683274507522583, |
|
"learning_rate": 1.9095477386934673e-05, |
|
"loss": 3.7341, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0005485087418580734, |
|
"eval_loss": 6.456914901733398, |
|
"eval_runtime": 3194.9287, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0005599360073134499, |
|
"grad_norm": 3.5512888431549072, |
|
"learning_rate": 1.8969849246231156e-05, |
|
"loss": 8.3966, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0005599360073134499, |
|
"eval_loss": 6.431957244873047, |
|
"eval_runtime": 3193.8838, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0005713632727688264, |
|
"grad_norm": 1.8522059917449951, |
|
"learning_rate": 1.884422110552764e-05, |
|
"loss": 4.837, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0005713632727688264, |
|
"eval_loss": 6.406413555145264, |
|
"eval_runtime": 3195.7335, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.000582790538224203, |
|
"grad_norm": 2.5635313987731934, |
|
"learning_rate": 1.8718592964824123e-05, |
|
"loss": 6.2677, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.000582790538224203, |
|
"eval_loss": 6.380771160125732, |
|
"eval_runtime": 3193.483, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0005942178036795794, |
|
"grad_norm": 3.0012006759643555, |
|
"learning_rate": 1.8592964824120602e-05, |
|
"loss": 6.3779, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0005942178036795794, |
|
"eval_loss": 6.354803085327148, |
|
"eval_runtime": 3191.7709, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.000605645069134956, |
|
"grad_norm": 2.9274518489837646, |
|
"learning_rate": 1.8467336683417085e-05, |
|
"loss": 6.4349, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.000605645069134956, |
|
"eval_loss": 6.329031467437744, |
|
"eval_runtime": 3194.6961, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.0006170723345903325, |
|
"grad_norm": 3.2986245155334473, |
|
"learning_rate": 1.834170854271357e-05, |
|
"loss": 7.2301, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0006170723345903325, |
|
"eval_loss": 6.30263090133667, |
|
"eval_runtime": 3193.3906, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0006284996000457091, |
|
"grad_norm": 2.258892297744751, |
|
"learning_rate": 1.821608040201005e-05, |
|
"loss": 5.288, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0006284996000457091, |
|
"eval_loss": 6.27651834487915, |
|
"eval_runtime": 3193.9175, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0006399268655010856, |
|
"grad_norm": 3.4918830394744873, |
|
"learning_rate": 1.8090452261306535e-05, |
|
"loss": 7.0831, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0006399268655010856, |
|
"eval_loss": 6.249880790710449, |
|
"eval_runtime": 3195.1438, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0006513541309564621, |
|
"grad_norm": 3.48417592048645, |
|
"learning_rate": 1.7964824120603014e-05, |
|
"loss": 7.1039, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0006513541309564621, |
|
"eval_loss": 6.223054885864258, |
|
"eval_runtime": 3193.8782, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0006627813964118387, |
|
"grad_norm": 2.575576066970825, |
|
"learning_rate": 1.7839195979899497e-05, |
|
"loss": 5.38, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0006627813964118387, |
|
"eval_loss": 6.195952892303467, |
|
"eval_runtime": 3193.8956, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0006742086618672151, |
|
"grad_norm": 3.869877815246582, |
|
"learning_rate": 1.771356783919598e-05, |
|
"loss": 7.1632, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0006742086618672151, |
|
"eval_loss": 6.1690497398376465, |
|
"eval_runtime": 3195.4516, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0006856359273225917, |
|
"grad_norm": 2.590207815170288, |
|
"learning_rate": 1.7587939698492464e-05, |
|
"loss": 5.6661, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0006856359273225917, |
|
"eval_loss": 6.141936779022217, |
|
"eval_runtime": 3196.1077, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0006970631927779682, |
|
"grad_norm": 3.316432237625122, |
|
"learning_rate": 1.7462311557788947e-05, |
|
"loss": 6.4535, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0006970631927779682, |
|
"eval_loss": 6.114187717437744, |
|
"eval_runtime": 3196.6866, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0007084904582333448, |
|
"grad_norm": 3.1938068866729736, |
|
"learning_rate": 1.7336683417085427e-05, |
|
"loss": 6.0447, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0007084904582333448, |
|
"eval_loss": 6.08643913269043, |
|
"eval_runtime": 3195.265, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0007199177236887213, |
|
"grad_norm": 3.078531265258789, |
|
"learning_rate": 1.721105527638191e-05, |
|
"loss": 5.802, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0007199177236887213, |
|
"eval_loss": 6.058650016784668, |
|
"eval_runtime": 3193.9302, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0007313449891440978, |
|
"grad_norm": 3.1577515602111816, |
|
"learning_rate": 1.7085427135678393e-05, |
|
"loss": 5.8267, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0007313449891440978, |
|
"eval_loss": 6.030569553375244, |
|
"eval_runtime": 3193.2589, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0007427722545994744, |
|
"grad_norm": 3.361140012741089, |
|
"learning_rate": 1.6959798994974876e-05, |
|
"loss": 6.1471, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0007427722545994744, |
|
"eval_loss": 6.002427577972412, |
|
"eval_runtime": 3194.7411, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0007541995200548508, |
|
"grad_norm": 1.4381821155548096, |
|
"learning_rate": 1.683417085427136e-05, |
|
"loss": 3.696, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0007541995200548508, |
|
"eval_loss": 5.974757671356201, |
|
"eval_runtime": 3195.246, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0007656267855102274, |
|
"grad_norm": 3.1451377868652344, |
|
"learning_rate": 1.670854271356784e-05, |
|
"loss": 5.9335, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0007656267855102274, |
|
"eval_loss": 5.946895599365234, |
|
"eval_runtime": 3195.4763, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0007770540509656039, |
|
"grad_norm": 2.85153865814209, |
|
"learning_rate": 1.6582914572864322e-05, |
|
"loss": 5.5253, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0007770540509656039, |
|
"eval_loss": 5.918909072875977, |
|
"eval_runtime": 3194.2966, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0007884813164209805, |
|
"grad_norm": 0.18748074769973755, |
|
"learning_rate": 1.6457286432160805e-05, |
|
"loss": 2.4955, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0007884813164209805, |
|
"eval_loss": 5.893527507781982, |
|
"eval_runtime": 3192.8875, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.000799908581876357, |
|
"grad_norm": 3.377441167831421, |
|
"learning_rate": 1.6331658291457288e-05, |
|
"loss": 6.0589, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.000799908581876357, |
|
"eval_loss": 5.8677473068237305, |
|
"eval_runtime": 3192.1113, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0008113358473317335, |
|
"grad_norm": 3.5671401023864746, |
|
"learning_rate": 1.620603015075377e-05, |
|
"loss": 6.1037, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0008113358473317335, |
|
"eval_loss": 5.841763019561768, |
|
"eval_runtime": 3193.951, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0008227631127871101, |
|
"grad_norm": 1.8638324737548828, |
|
"learning_rate": 1.608040201005025e-05, |
|
"loss": 3.9179, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0008227631127871101, |
|
"eval_loss": 5.815106391906738, |
|
"eval_runtime": 3193.2648, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0008341903782424865, |
|
"grad_norm": 3.3266711235046387, |
|
"learning_rate": 1.5954773869346734e-05, |
|
"loss": 5.3457, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.0008341903782424865, |
|
"eval_loss": 5.788687229156494, |
|
"eval_runtime": 3194.6212, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.0008456176436978631, |
|
"grad_norm": 4.605172157287598, |
|
"learning_rate": 1.5829145728643217e-05, |
|
"loss": 7.0032, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0008456176436978631, |
|
"eval_loss": 5.761895656585693, |
|
"eval_runtime": 3191.5309, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0008570449091532396, |
|
"grad_norm": 3.4830474853515625, |
|
"learning_rate": 1.57035175879397e-05, |
|
"loss": 5.9244, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0008570449091532396, |
|
"eval_loss": 5.734696388244629, |
|
"eval_runtime": 3191.7239, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0008684721746086162, |
|
"grad_norm": 3.59964919090271, |
|
"learning_rate": 1.5577889447236183e-05, |
|
"loss": 5.4847, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0008684721746086162, |
|
"eval_loss": 5.707438945770264, |
|
"eval_runtime": 3191.9511, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0008798994400639927, |
|
"grad_norm": 4.7916412353515625, |
|
"learning_rate": 1.5452261306532663e-05, |
|
"loss": 7.0196, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0008798994400639927, |
|
"eval_loss": 5.680092811584473, |
|
"eval_runtime": 3192.5862, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0008913267055193692, |
|
"grad_norm": 3.67484450340271, |
|
"learning_rate": 1.5326633165829146e-05, |
|
"loss": 5.7853, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.0008913267055193692, |
|
"eval_loss": 5.652522087097168, |
|
"eval_runtime": 3191.0964, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.0009027539709747458, |
|
"grad_norm": 3.265822410583496, |
|
"learning_rate": 1.5201005025125627e-05, |
|
"loss": 5.3229, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0009027539709747458, |
|
"eval_loss": 5.624815940856934, |
|
"eval_runtime": 3191.2862, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0009141812364301222, |
|
"grad_norm": 3.9072275161743164, |
|
"learning_rate": 1.507537688442211e-05, |
|
"loss": 5.6212, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0009141812364301222, |
|
"eval_loss": 5.597079753875732, |
|
"eval_runtime": 3190.1642, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0009256085018854988, |
|
"grad_norm": 3.1518900394439697, |
|
"learning_rate": 1.4949748743718595e-05, |
|
"loss": 5.0651, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0009256085018854988, |
|
"eval_loss": 5.56900691986084, |
|
"eval_runtime": 3189.2121, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0009370357673408753, |
|
"grad_norm": 2.539473295211792, |
|
"learning_rate": 1.4824120603015077e-05, |
|
"loss": 4.0285, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.0009370357673408753, |
|
"eval_loss": 5.540792942047119, |
|
"eval_runtime": 3188.8344, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.0009484630327962519, |
|
"grad_norm": 4.24508810043335, |
|
"learning_rate": 1.4698492462311558e-05, |
|
"loss": 6.0955, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0009484630327962519, |
|
"eval_loss": 5.512935638427734, |
|
"eval_runtime": 3191.1876, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0009598902982516284, |
|
"grad_norm": 3.5962705612182617, |
|
"learning_rate": 1.457286432160804e-05, |
|
"loss": 5.3177, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.0009598902982516284, |
|
"eval_loss": 5.484694004058838, |
|
"eval_runtime": 3191.2979, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.0009713175637070049, |
|
"grad_norm": 4.158171653747559, |
|
"learning_rate": 1.4447236180904523e-05, |
|
"loss": 5.8912, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0009713175637070049, |
|
"eval_loss": 5.456665515899658, |
|
"eval_runtime": 3192.0268, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0009827448291623814, |
|
"grad_norm": 4.112974643707275, |
|
"learning_rate": 1.4321608040201007e-05, |
|
"loss": 5.9361, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.0009827448291623814, |
|
"eval_loss": 5.428305149078369, |
|
"eval_runtime": 3189.75, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.000994172094617758, |
|
"grad_norm": 3.924593925476074, |
|
"learning_rate": 1.4195979899497489e-05, |
|
"loss": 5.4892, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.000994172094617758, |
|
"eval_loss": 5.400221824645996, |
|
"eval_runtime": 3191.2904, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.0010055993600731345, |
|
"grad_norm": 5.291077613830566, |
|
"learning_rate": 1.407035175879397e-05, |
|
"loss": 6.7912, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.0010055993600731345, |
|
"eval_loss": 5.371959686279297, |
|
"eval_runtime": 3188.0583, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.001017026625528511, |
|
"grad_norm": 3.935314893722534, |
|
"learning_rate": 1.3944723618090452e-05, |
|
"loss": 5.2535, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.001017026625528511, |
|
"eval_loss": 5.343928337097168, |
|
"eval_runtime": 3186.5407, |
|
"eval_samples_per_second": 1.726, |
|
"eval_steps_per_second": 1.726, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.0010284538909838875, |
|
"grad_norm": 4.988033771514893, |
|
"learning_rate": 1.3819095477386935e-05, |
|
"loss": 6.5576, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0010284538909838875, |
|
"eval_loss": 5.315735816955566, |
|
"eval_runtime": 3190.1379, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.001039881156439264, |
|
"grad_norm": 3.571462631225586, |
|
"learning_rate": 1.369346733668342e-05, |
|
"loss": 5.0059, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.001039881156439264, |
|
"eval_loss": 5.287474632263184, |
|
"eval_runtime": 3183.8947, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.0010513084218946407, |
|
"grad_norm": 3.0511598587036133, |
|
"learning_rate": 1.3567839195979901e-05, |
|
"loss": 4.255, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0010513084218946407, |
|
"eval_loss": 5.25911283493042, |
|
"eval_runtime": 3188.1641, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0010627356873500172, |
|
"grad_norm": 3.8936984539031982, |
|
"learning_rate": 1.3442211055276382e-05, |
|
"loss": 5.0144, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.0010627356873500172, |
|
"eval_loss": 5.231190204620361, |
|
"eval_runtime": 3178.9927, |
|
"eval_samples_per_second": 1.73, |
|
"eval_steps_per_second": 1.73, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.0010741629528053936, |
|
"grad_norm": 3.204448699951172, |
|
"learning_rate": 1.3316582914572864e-05, |
|
"loss": 4.539, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.0010741629528053936, |
|
"eval_loss": 5.203569412231445, |
|
"eval_runtime": 3181.9254, |
|
"eval_samples_per_second": 1.729, |
|
"eval_steps_per_second": 1.729, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.0010855902182607701, |
|
"grad_norm": 5.711158752441406, |
|
"learning_rate": 1.3190954773869347e-05, |
|
"loss": 7.1367, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0010855902182607701, |
|
"eval_loss": 5.176050662994385, |
|
"eval_runtime": 3182.616, |
|
"eval_samples_per_second": 1.728, |
|
"eval_steps_per_second": 1.728, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0010970174837161468, |
|
"grad_norm": 4.6051435470581055, |
|
"learning_rate": 1.306532663316583e-05, |
|
"loss": 5.5839, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0010970174837161468, |
|
"eval_loss": 5.148005962371826, |
|
"eval_runtime": 3185.2573, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0011084447491715233, |
|
"grad_norm": 2.9833264350891113, |
|
"learning_rate": 1.2939698492462313e-05, |
|
"loss": 4.1461, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.0011084447491715233, |
|
"eval_loss": 5.120667934417725, |
|
"eval_runtime": 3188.7644, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.0011198720146268998, |
|
"grad_norm": 3.4609251022338867, |
|
"learning_rate": 1.2814070351758795e-05, |
|
"loss": 4.3763, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.0011198720146268998, |
|
"eval_loss": 5.093635559082031, |
|
"eval_runtime": 3186.9357, |
|
"eval_samples_per_second": 1.726, |
|
"eval_steps_per_second": 1.726, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.0011312992800822763, |
|
"grad_norm": 3.9000020027160645, |
|
"learning_rate": 1.2688442211055276e-05, |
|
"loss": 4.8055, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.0011312992800822763, |
|
"eval_loss": 5.06650447845459, |
|
"eval_runtime": 3189.8792, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.0011427265455376528, |
|
"grad_norm": 4.556962966918945, |
|
"learning_rate": 1.2562814070351759e-05, |
|
"loss": 5.1023, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0011427265455376528, |
|
"eval_loss": 5.03890323638916, |
|
"eval_runtime": 3185.537, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0011541538109930295, |
|
"grad_norm": 3.0480575561523438, |
|
"learning_rate": 1.2437185929648242e-05, |
|
"loss": 3.8963, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.0011541538109930295, |
|
"eval_loss": 5.011804580688477, |
|
"eval_runtime": 3186.5554, |
|
"eval_samples_per_second": 1.726, |
|
"eval_steps_per_second": 1.726, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.001165581076448406, |
|
"grad_norm": 4.384181499481201, |
|
"learning_rate": 1.2311557788944725e-05, |
|
"loss": 5.3644, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.001165581076448406, |
|
"eval_loss": 4.984678745269775, |
|
"eval_runtime": 3185.3238, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.0011770083419037824, |
|
"grad_norm": 3.9702224731445312, |
|
"learning_rate": 1.2185929648241207e-05, |
|
"loss": 4.6785, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.0011770083419037824, |
|
"eval_loss": 4.957573890686035, |
|
"eval_runtime": 3183.6056, |
|
"eval_samples_per_second": 1.728, |
|
"eval_steps_per_second": 1.728, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.001188435607359159, |
|
"grad_norm": 2.205033540725708, |
|
"learning_rate": 1.2060301507537688e-05, |
|
"loss": 3.2692, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.001188435607359159, |
|
"eval_loss": 4.930599212646484, |
|
"eval_runtime": 3187.0691, |
|
"eval_samples_per_second": 1.726, |
|
"eval_steps_per_second": 1.726, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.0011998628728145354, |
|
"grad_norm": 5.425383567810059, |
|
"learning_rate": 1.1934673366834171e-05, |
|
"loss": 5.8127, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0011998628728145354, |
|
"eval_loss": 4.903759002685547, |
|
"eval_runtime": 3191.0573, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.001211290138269912, |
|
"grad_norm": 2.0843021869659424, |
|
"learning_rate": 1.1809045226130654e-05, |
|
"loss": 3.2769, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.001211290138269912, |
|
"eval_loss": 4.877487659454346, |
|
"eval_runtime": 3192.0095, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.0012227174037252886, |
|
"grad_norm": 5.174865245819092, |
|
"learning_rate": 1.1683417085427137e-05, |
|
"loss": 5.5865, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.0012227174037252886, |
|
"eval_loss": 4.85078763961792, |
|
"eval_runtime": 3181.3647, |
|
"eval_samples_per_second": 1.729, |
|
"eval_steps_per_second": 1.729, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.001234144669180665, |
|
"grad_norm": 4.530418395996094, |
|
"learning_rate": 1.1557788944723619e-05, |
|
"loss": 4.9615, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.001234144669180665, |
|
"eval_loss": 4.824298858642578, |
|
"eval_runtime": 3188.1147, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.0012455719346360415, |
|
"grad_norm": 4.7975172996521, |
|
"learning_rate": 1.14321608040201e-05, |
|
"loss": 5.4098, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0012455719346360415, |
|
"eval_loss": 4.7980499267578125, |
|
"eval_runtime": 3187.1944, |
|
"eval_samples_per_second": 1.726, |
|
"eval_steps_per_second": 1.726, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0012569992000914182, |
|
"grad_norm": 4.517176151275635, |
|
"learning_rate": 1.1306532663316583e-05, |
|
"loss": 5.0089, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0012569992000914182, |
|
"eval_loss": 4.771963596343994, |
|
"eval_runtime": 3195.7844, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0012684264655467947, |
|
"grad_norm": 4.071757793426514, |
|
"learning_rate": 1.1180904522613066e-05, |
|
"loss": 4.3814, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.0012684264655467947, |
|
"eval_loss": 4.745976448059082, |
|
"eval_runtime": 3197.3702, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.0012798537310021712, |
|
"grad_norm": 3.3903536796569824, |
|
"learning_rate": 1.105527638190955e-05, |
|
"loss": 4.0058, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.0012798537310021712, |
|
"eval_loss": 4.720351696014404, |
|
"eval_runtime": 3192.7959, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.0012912809964575477, |
|
"grad_norm": 3.1727375984191895, |
|
"learning_rate": 1.0929648241206031e-05, |
|
"loss": 3.7689, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0012912809964575477, |
|
"eval_loss": 4.694741249084473, |
|
"eval_runtime": 3189.1299, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0013027082619129242, |
|
"grad_norm": 2.47040057182312, |
|
"learning_rate": 1.0804020100502512e-05, |
|
"loss": 3.3884, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.0013027082619129242, |
|
"eval_loss": 4.6697916984558105, |
|
"eval_runtime": 3190.5928, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.0013141355273683009, |
|
"grad_norm": 3.910038471221924, |
|
"learning_rate": 1.0678391959798995e-05, |
|
"loss": 4.1467, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0013141355273683009, |
|
"eval_loss": 4.645045280456543, |
|
"eval_runtime": 3188.0361, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0013255627928236773, |
|
"grad_norm": 5.768658638000488, |
|
"learning_rate": 1.0552763819095479e-05, |
|
"loss": 5.5406, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.0013255627928236773, |
|
"eval_loss": 4.620331764221191, |
|
"eval_runtime": 3188.8345, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.0013369900582790538, |
|
"grad_norm": 4.449304580688477, |
|
"learning_rate": 1.042713567839196e-05, |
|
"loss": 4.4227, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.0013369900582790538, |
|
"eval_loss": 4.595801830291748, |
|
"eval_runtime": 3189.9885, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.0013484173237344303, |
|
"grad_norm": 5.598243236541748, |
|
"learning_rate": 1.0301507537688443e-05, |
|
"loss": 5.0256, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.0013484173237344303, |
|
"eval_loss": 4.571651458740234, |
|
"eval_runtime": 3184.1254, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.0013598445891898068, |
|
"grad_norm": 2.351072311401367, |
|
"learning_rate": 1.0175879396984924e-05, |
|
"loss": 3.3235, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0013598445891898068, |
|
"eval_loss": 4.5476908683776855, |
|
"eval_runtime": 3195.9101, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0013712718546451835, |
|
"grad_norm": 4.657642364501953, |
|
"learning_rate": 1.0050251256281408e-05, |
|
"loss": 4.6996, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0013712718546451835, |
|
"eval_loss": 4.524058818817139, |
|
"eval_runtime": 3187.7906, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.00138269912010056, |
|
"grad_norm": 4.85261344909668, |
|
"learning_rate": 9.92462311557789e-06, |
|
"loss": 4.9033, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.00138269912010056, |
|
"eval_loss": 4.500573635101318, |
|
"eval_runtime": 3191.7978, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.0013941263855559364, |
|
"grad_norm": 2.878826379776001, |
|
"learning_rate": 9.798994974874372e-06, |
|
"loss": 3.7129, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.0013941263855559364, |
|
"eval_loss": 4.477457046508789, |
|
"eval_runtime": 3188.7301, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.001405553651011313, |
|
"grad_norm": 4.454727649688721, |
|
"learning_rate": 9.673366834170855e-06, |
|
"loss": 4.4705, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.001405553651011313, |
|
"eval_loss": 4.454273223876953, |
|
"eval_runtime": 3193.5195, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.0014169809164666896, |
|
"grad_norm": 5.7791547775268555, |
|
"learning_rate": 9.547738693467337e-06, |
|
"loss": 5.1999, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.0014169809164666896, |
|
"eval_loss": 4.431214332580566, |
|
"eval_runtime": 3197.3551, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.001428408181922066, |
|
"grad_norm": 4.197294235229492, |
|
"learning_rate": 9.42211055276382e-06, |
|
"loss": 4.0018, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.001428408181922066, |
|
"eval_loss": 4.408187389373779, |
|
"eval_runtime": 3199.7906, |
|
"eval_samples_per_second": 1.719, |
|
"eval_steps_per_second": 1.719, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0014398354473774426, |
|
"grad_norm": 4.467915058135986, |
|
"learning_rate": 9.296482412060301e-06, |
|
"loss": 4.3018, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.0014398354473774426, |
|
"eval_loss": 4.386054515838623, |
|
"eval_runtime": 3193.7722, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.001451262712832819, |
|
"grad_norm": 5.470479488372803, |
|
"learning_rate": 9.170854271356784e-06, |
|
"loss": 4.8658, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.001451262712832819, |
|
"eval_loss": 4.363833427429199, |
|
"eval_runtime": 3195.1881, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.0014626899782881956, |
|
"grad_norm": 4.576855182647705, |
|
"learning_rate": 9.045226130653267e-06, |
|
"loss": 4.5801, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.0014626899782881956, |
|
"eval_loss": 4.341481685638428, |
|
"eval_runtime": 3196.2749, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.0014741172437435722, |
|
"grad_norm": 3.8720321655273438, |
|
"learning_rate": 8.919597989949749e-06, |
|
"loss": 3.9047, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.0014741172437435722, |
|
"eval_loss": 4.319530010223389, |
|
"eval_runtime": 3192.3427, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.0014855445091989487, |
|
"grad_norm": 5.404055595397949, |
|
"learning_rate": 8.793969849246232e-06, |
|
"loss": 4.5479, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0014855445091989487, |
|
"eval_loss": 4.298058032989502, |
|
"eval_runtime": 3192.0239, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0014969717746543252, |
|
"grad_norm": 5.212480068206787, |
|
"learning_rate": 8.668341708542713e-06, |
|
"loss": 4.6837, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.0014969717746543252, |
|
"eval_loss": 4.276992321014404, |
|
"eval_runtime": 3190.8662, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.0015083990401097017, |
|
"grad_norm": 5.532873153686523, |
|
"learning_rate": 8.542713567839196e-06, |
|
"loss": 4.5492, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.0015083990401097017, |
|
"eval_loss": 4.256262302398682, |
|
"eval_runtime": 3195.153, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.0015198263055650782, |
|
"grad_norm": 5.229096412658691, |
|
"learning_rate": 8.41708542713568e-06, |
|
"loss": 4.6645, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.0015198263055650782, |
|
"eval_loss": 4.235621452331543, |
|
"eval_runtime": 3193.1225, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.0015312535710204549, |
|
"grad_norm": 6.112283229827881, |
|
"learning_rate": 8.291457286432161e-06, |
|
"loss": 5.1063, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0015312535710204549, |
|
"eval_loss": 4.215306282043457, |
|
"eval_runtime": 3190.5362, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0015426808364758314, |
|
"grad_norm": 5.098965644836426, |
|
"learning_rate": 8.165829145728644e-06, |
|
"loss": 4.3275, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0015426808364758314, |
|
"eval_loss": 4.195265769958496, |
|
"eval_runtime": 3197.2965, |
|
"eval_samples_per_second": 1.72, |
|
"eval_steps_per_second": 1.72, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0015541081019312078, |
|
"grad_norm": 4.572518825531006, |
|
"learning_rate": 8.040201005025125e-06, |
|
"loss": 4.1709, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.0015541081019312078, |
|
"eval_loss": 4.174986362457275, |
|
"eval_runtime": 3195.4093, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.0015655353673865843, |
|
"grad_norm": 4.504931926727295, |
|
"learning_rate": 7.914572864321608e-06, |
|
"loss": 4.1334, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.0015655353673865843, |
|
"eval_loss": 4.155473232269287, |
|
"eval_runtime": 3196.5363, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.001576962632841961, |
|
"grad_norm": 4.817331790924072, |
|
"learning_rate": 7.788944723618092e-06, |
|
"loss": 4.0312, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.001576962632841961, |
|
"eval_loss": 4.136098861694336, |
|
"eval_runtime": 3192.698, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.0015883898982973375, |
|
"grad_norm": 5.8819146156311035, |
|
"learning_rate": 7.663316582914573e-06, |
|
"loss": 4.8895, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.0015883898982973375, |
|
"eval_loss": 4.116922378540039, |
|
"eval_runtime": 3194.9621, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.001599817163752714, |
|
"grad_norm": 4.634926795959473, |
|
"learning_rate": 7.537688442211055e-06, |
|
"loss": 4.1536, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.001599817163752714, |
|
"eval_loss": 4.097559452056885, |
|
"eval_runtime": 3190.2406, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0016112444292080905, |
|
"grad_norm": 4.497166156768799, |
|
"learning_rate": 7.412060301507538e-06, |
|
"loss": 4.1384, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.0016112444292080905, |
|
"eval_loss": 4.079128742218018, |
|
"eval_runtime": 3192.3172, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.001622671694663467, |
|
"grad_norm": 5.713669300079346, |
|
"learning_rate": 7.28643216080402e-06, |
|
"loss": 4.7611, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.001622671694663467, |
|
"eval_loss": 4.0609354972839355, |
|
"eval_runtime": 3191.9262, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.0016340989601188436, |
|
"grad_norm": 4.149837017059326, |
|
"learning_rate": 7.160804020100504e-06, |
|
"loss": 3.8286, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.0016340989601188436, |
|
"eval_loss": 4.042760372161865, |
|
"eval_runtime": 3188.8273, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.0016455262255742201, |
|
"grad_norm": 6.028262138366699, |
|
"learning_rate": 7.035175879396985e-06, |
|
"loss": 4.6085, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.0016455262255742201, |
|
"eval_loss": 4.024686336517334, |
|
"eval_runtime": 3191.547, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.0016569534910295966, |
|
"grad_norm": 5.114198207855225, |
|
"learning_rate": 6.909547738693467e-06, |
|
"loss": 4.2915, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0016569534910295966, |
|
"eval_loss": 4.007183074951172, |
|
"eval_runtime": 3193.0349, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.001668380756484973, |
|
"grad_norm": 5.462469100952148, |
|
"learning_rate": 6.7839195979899505e-06, |
|
"loss": 4.1471, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.001668380756484973, |
|
"eval_loss": 3.9897186756134033, |
|
"eval_runtime": 3190.8896, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.0016798080219403496, |
|
"grad_norm": 0.8918362855911255, |
|
"learning_rate": 6.658291457286432e-06, |
|
"loss": 2.468, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.0016798080219403496, |
|
"eval_loss": 3.9733145236968994, |
|
"eval_runtime": 3186.1248, |
|
"eval_samples_per_second": 1.726, |
|
"eval_steps_per_second": 1.726, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.0016912352873957263, |
|
"grad_norm": 4.393204689025879, |
|
"learning_rate": 6.532663316582915e-06, |
|
"loss": 3.9712, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.0016912352873957263, |
|
"eval_loss": 3.956583023071289, |
|
"eval_runtime": 3187.8674, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.0017026625528511028, |
|
"grad_norm": 6.782311916351318, |
|
"learning_rate": 6.407035175879397e-06, |
|
"loss": 5.212, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0017026625528511028, |
|
"eval_loss": 3.9407119750976562, |
|
"eval_runtime": 3189.4481, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0017140898183064792, |
|
"grad_norm": 3.583444356918335, |
|
"learning_rate": 6.2814070351758795e-06, |
|
"loss": 3.5865, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0017140898183064792, |
|
"eval_loss": 3.9249038696289062, |
|
"eval_runtime": 3193.1787, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0017255170837618557, |
|
"grad_norm": 4.707357406616211, |
|
"learning_rate": 6.155778894472363e-06, |
|
"loss": 3.508, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.0017255170837618557, |
|
"eval_loss": 3.9093074798583984, |
|
"eval_runtime": 3187.8945, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.0017369443492172324, |
|
"grad_norm": 4.769847393035889, |
|
"learning_rate": 6.030150753768844e-06, |
|
"loss": 3.8634, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.0017369443492172324, |
|
"eval_loss": 3.894160270690918, |
|
"eval_runtime": 3191.7342, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.001748371614672609, |
|
"grad_norm": 2.5702831745147705, |
|
"learning_rate": 5.904522613065327e-06, |
|
"loss": 2.8717, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.001748371614672609, |
|
"eval_loss": 3.879079818725586, |
|
"eval_runtime": 3192.0831, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.0017597988801279854, |
|
"grad_norm": 6.417558670043945, |
|
"learning_rate": 5.778894472361809e-06, |
|
"loss": 4.4295, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.0017597988801279854, |
|
"eval_loss": 3.8640522956848145, |
|
"eval_runtime": 3194.0863, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.0017712261455833619, |
|
"grad_norm": 4.981814861297607, |
|
"learning_rate": 5.653266331658292e-06, |
|
"loss": 3.6863, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0017712261455833619, |
|
"eval_loss": 3.849724769592285, |
|
"eval_runtime": 3192.4569, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0017826534110387383, |
|
"grad_norm": 3.5112829208374023, |
|
"learning_rate": 5.527638190954775e-06, |
|
"loss": 3.5074, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.0017826534110387383, |
|
"eval_loss": 3.835315704345703, |
|
"eval_runtime": 3195.7661, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.001794080676494115, |
|
"grad_norm": 4.1393513679504395, |
|
"learning_rate": 5.402010050251256e-06, |
|
"loss": 3.6329, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.001794080676494115, |
|
"eval_loss": 3.820890188217163, |
|
"eval_runtime": 3191.7062, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.0018055079419494915, |
|
"grad_norm": 5.644866943359375, |
|
"learning_rate": 5.276381909547739e-06, |
|
"loss": 3.8676, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.0018055079419494915, |
|
"eval_loss": 3.8074545860290527, |
|
"eval_runtime": 3189.2491, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.001816935207404868, |
|
"grad_norm": 5.790183067321777, |
|
"learning_rate": 5.1507537688442215e-06, |
|
"loss": 4.4173, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.001816935207404868, |
|
"eval_loss": 3.7939019203186035, |
|
"eval_runtime": 3184.9276, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.0018283624728602445, |
|
"grad_norm": 4.434901237487793, |
|
"learning_rate": 5.025125628140704e-06, |
|
"loss": 3.561, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0018283624728602445, |
|
"eval_loss": 3.7808175086975098, |
|
"eval_runtime": 3191.4708, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.001839789738315621, |
|
"grad_norm": 5.6928606033325195, |
|
"learning_rate": 4.899497487437186e-06, |
|
"loss": 4.0468, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.001839789738315621, |
|
"eval_loss": 3.7684166431427, |
|
"eval_runtime": 3189.152, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.0018512170037709977, |
|
"grad_norm": 5.777411937713623, |
|
"learning_rate": 4.773869346733668e-06, |
|
"loss": 4.046, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.0018512170037709977, |
|
"eval_loss": 3.755950927734375, |
|
"eval_runtime": 3194.3133, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.0018626442692263742, |
|
"grad_norm": 4.234452247619629, |
|
"learning_rate": 4.6482412060301506e-06, |
|
"loss": 3.4605, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.0018626442692263742, |
|
"eval_loss": 3.743618965148926, |
|
"eval_runtime": 3190.5794, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.0018740715346817506, |
|
"grad_norm": 4.58046817779541, |
|
"learning_rate": 4.522613065326634e-06, |
|
"loss": 3.8547, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.0018740715346817506, |
|
"eval_loss": 3.731459856033325, |
|
"eval_runtime": 3195.1963, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.0018854988001371271, |
|
"grad_norm": 4.460115909576416, |
|
"learning_rate": 4.396984924623116e-06, |
|
"loss": 3.4863, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0018854988001371271, |
|
"eval_loss": 3.7194907665252686, |
|
"eval_runtime": 3192.9919, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0018969260655925038, |
|
"grad_norm": 3.752925395965576, |
|
"learning_rate": 4.271356783919598e-06, |
|
"loss": 3.1784, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.0018969260655925038, |
|
"eval_loss": 3.708251476287842, |
|
"eval_runtime": 3191.38, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.0019083533310478803, |
|
"grad_norm": 5.099639415740967, |
|
"learning_rate": 4.1457286432160804e-06, |
|
"loss": 3.9205, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.0019083533310478803, |
|
"eval_loss": 3.6969456672668457, |
|
"eval_runtime": 3184.4302, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.0019197805965032568, |
|
"grad_norm": 2.3207623958587646, |
|
"learning_rate": 4.020100502512563e-06, |
|
"loss": 2.687, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.0019197805965032568, |
|
"eval_loss": 3.686185359954834, |
|
"eval_runtime": 3190.1367, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.0019312078619586333, |
|
"grad_norm": 5.285946846008301, |
|
"learning_rate": 3.894472361809046e-06, |
|
"loss": 3.916, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.0019312078619586333, |
|
"eval_loss": 3.675804615020752, |
|
"eval_runtime": 3187.8804, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.0019426351274140097, |
|
"grad_norm": 4.83530855178833, |
|
"learning_rate": 3.7688442211055276e-06, |
|
"loss": 3.7806, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0019426351274140097, |
|
"eval_loss": 3.6655375957489014, |
|
"eval_runtime": 3183.8288, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0019540623928693862, |
|
"grad_norm": 5.306794166564941, |
|
"learning_rate": 3.64321608040201e-06, |
|
"loss": 3.6463, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.0019540623928693862, |
|
"eval_loss": 3.6555092334747314, |
|
"eval_runtime": 3189.4936, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.0019654896583247627, |
|
"grad_norm": 7.400163173675537, |
|
"learning_rate": 3.5175879396984926e-06, |
|
"loss": 4.8809, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.0019654896583247627, |
|
"eval_loss": 3.645655632019043, |
|
"eval_runtime": 3189.5993, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.0019769169237801396, |
|
"grad_norm": 5.281580448150635, |
|
"learning_rate": 3.3919597989949752e-06, |
|
"loss": 3.7168, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.0019769169237801396, |
|
"eval_loss": 3.6365396976470947, |
|
"eval_runtime": 3191.3086, |
|
"eval_samples_per_second": 1.723, |
|
"eval_steps_per_second": 1.723, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.001988344189235516, |
|
"grad_norm": 5.500361919403076, |
|
"learning_rate": 3.2663316582914575e-06, |
|
"loss": 3.5465, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.001988344189235516, |
|
"eval_loss": 3.627671957015991, |
|
"eval_runtime": 3191.166, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.0019997714546908926, |
|
"grad_norm": 5.550091743469238, |
|
"learning_rate": 3.1407035175879398e-06, |
|
"loss": 3.7022, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0019997714546908926, |
|
"eval_loss": 3.6191136837005615, |
|
"eval_runtime": 3190.3196, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.002011198720146269, |
|
"grad_norm": 4.048425197601318, |
|
"learning_rate": 3.015075376884422e-06, |
|
"loss": 3.5034, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.002011198720146269, |
|
"eval_loss": 3.6107609272003174, |
|
"eval_runtime": 3196.3566, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.0020226259856016456, |
|
"grad_norm": 4.696182727813721, |
|
"learning_rate": 2.8894472361809047e-06, |
|
"loss": 3.5201, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.0020226259856016456, |
|
"eval_loss": 3.602489948272705, |
|
"eval_runtime": 3191.1045, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.002034053251057022, |
|
"grad_norm": 4.840107440948486, |
|
"learning_rate": 2.7638190954773874e-06, |
|
"loss": 3.8883, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.002034053251057022, |
|
"eval_loss": 3.5948402881622314, |
|
"eval_runtime": 3181.8951, |
|
"eval_samples_per_second": 1.729, |
|
"eval_steps_per_second": 1.729, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.0020454805165123985, |
|
"grad_norm": 3.1070148944854736, |
|
"learning_rate": 2.6381909547738696e-06, |
|
"loss": 2.8413, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.0020454805165123985, |
|
"eval_loss": 3.587570905685425, |
|
"eval_runtime": 3187.9084, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.002056907781967775, |
|
"grad_norm": 4.61260986328125, |
|
"learning_rate": 2.512562814070352e-06, |
|
"loss": 3.704, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.002056907781967775, |
|
"eval_loss": 3.5807385444641113, |
|
"eval_runtime": 3193.1025, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0020683350474231515, |
|
"grad_norm": 5.774682998657227, |
|
"learning_rate": 2.386934673366834e-06, |
|
"loss": 4.1228, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.0020683350474231515, |
|
"eval_loss": 3.5740838050842285, |
|
"eval_runtime": 3188.6673, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.002079762312878528, |
|
"grad_norm": 6.585142612457275, |
|
"learning_rate": 2.261306532663317e-06, |
|
"loss": 4.1375, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.002079762312878528, |
|
"eval_loss": 3.5675177574157715, |
|
"eval_runtime": 3187.9902, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.002091189578333905, |
|
"grad_norm": 4.422517776489258, |
|
"learning_rate": 2.135678391959799e-06, |
|
"loss": 3.2458, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.002091189578333905, |
|
"eval_loss": 3.5616683959960938, |
|
"eval_runtime": 3189.6985, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.0021026168437892814, |
|
"grad_norm": 5.40252685546875, |
|
"learning_rate": 2.0100502512562813e-06, |
|
"loss": 3.7518, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.0021026168437892814, |
|
"eval_loss": 3.556403398513794, |
|
"eval_runtime": 3188.5075, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.002114044109244658, |
|
"grad_norm": 3.357342004776001, |
|
"learning_rate": 1.8844221105527638e-06, |
|
"loss": 2.8388, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.002114044109244658, |
|
"eval_loss": 3.551210880279541, |
|
"eval_runtime": 3198.6279, |
|
"eval_samples_per_second": 1.719, |
|
"eval_steps_per_second": 1.719, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0021254713747000343, |
|
"grad_norm": 5.60300350189209, |
|
"learning_rate": 1.7587939698492463e-06, |
|
"loss": 3.6586, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.0021254713747000343, |
|
"eval_loss": 3.5462591648101807, |
|
"eval_runtime": 3194.9645, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.002136898640155411, |
|
"grad_norm": 4.910702228546143, |
|
"learning_rate": 1.6331658291457288e-06, |
|
"loss": 3.6408, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.002136898640155411, |
|
"eval_loss": 3.5415446758270264, |
|
"eval_runtime": 3185.4074, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.0021483259056107873, |
|
"grad_norm": 5.6200737953186035, |
|
"learning_rate": 1.507537688442211e-06, |
|
"loss": 3.5742, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.0021483259056107873, |
|
"eval_loss": 3.537212371826172, |
|
"eval_runtime": 3182.681, |
|
"eval_samples_per_second": 1.728, |
|
"eval_steps_per_second": 1.728, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.0021597531710661638, |
|
"grad_norm": 5.1306986808776855, |
|
"learning_rate": 1.3819095477386937e-06, |
|
"loss": 3.5693, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.0021597531710661638, |
|
"eval_loss": 3.5330007076263428, |
|
"eval_runtime": 3185.4993, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.0021711804365215403, |
|
"grad_norm": 5.107001781463623, |
|
"learning_rate": 1.256281407035176e-06, |
|
"loss": 3.6025, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0021711804365215403, |
|
"eval_loss": 3.529104471206665, |
|
"eval_runtime": 3180.9702, |
|
"eval_samples_per_second": 1.729, |
|
"eval_steps_per_second": 1.729, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0021826077019769167, |
|
"grad_norm": 5.954830646514893, |
|
"learning_rate": 1.1306532663316584e-06, |
|
"loss": 3.5938, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.0021826077019769167, |
|
"eval_loss": 3.525881052017212, |
|
"eval_runtime": 3184.2834, |
|
"eval_samples_per_second": 1.727, |
|
"eval_steps_per_second": 1.727, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.0021940349674322937, |
|
"grad_norm": 4.140493392944336, |
|
"learning_rate": 1.0050251256281407e-06, |
|
"loss": 3.3742, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.0021940349674322937, |
|
"eval_loss": 3.5228757858276367, |
|
"eval_runtime": 3193.6231, |
|
"eval_samples_per_second": 1.722, |
|
"eval_steps_per_second": 1.722, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.00220546223288767, |
|
"grad_norm": 6.1048665046691895, |
|
"learning_rate": 8.793969849246231e-07, |
|
"loss": 3.9661, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.00220546223288767, |
|
"eval_loss": 3.519984722137451, |
|
"eval_runtime": 3190.529, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.0022168894983430466, |
|
"grad_norm": 4.738156318664551, |
|
"learning_rate": 7.537688442211055e-07, |
|
"loss": 3.1686, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.0022168894983430466, |
|
"eval_loss": 3.517745018005371, |
|
"eval_runtime": 3189.2139, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.002228316763798423, |
|
"grad_norm": 6.101266860961914, |
|
"learning_rate": 6.28140703517588e-07, |
|
"loss": 3.5854, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.002228316763798423, |
|
"eval_loss": 3.515723705291748, |
|
"eval_runtime": 3190.1764, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.0022397440292537996, |
|
"grad_norm": 3.0352277755737305, |
|
"learning_rate": 5.025125628140703e-07, |
|
"loss": 2.9543, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.0022397440292537996, |
|
"eval_loss": 3.5141751766204834, |
|
"eval_runtime": 3188.4018, |
|
"eval_samples_per_second": 1.725, |
|
"eval_steps_per_second": 1.725, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.002251171294709176, |
|
"grad_norm": 3.8082351684570312, |
|
"learning_rate": 3.7688442211055275e-07, |
|
"loss": 3.3165, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.002251171294709176, |
|
"eval_loss": 3.512690305709839, |
|
"eval_runtime": 3190.2684, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.0022625985601645525, |
|
"grad_norm": 3.919834613800049, |
|
"learning_rate": 2.5125628140703517e-07, |
|
"loss": 2.769, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.0022625985601645525, |
|
"eval_loss": 3.5116732120513916, |
|
"eval_runtime": 3190.4709, |
|
"eval_samples_per_second": 1.724, |
|
"eval_steps_per_second": 1.724, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.002274025825619929, |
|
"grad_norm": 5.8951334953308105, |
|
"learning_rate": 1.2562814070351758e-07, |
|
"loss": 3.999, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.002274025825619929, |
|
"eval_loss": 3.511035442352295, |
|
"eval_runtime": 3199.1703, |
|
"eval_samples_per_second": 1.719, |
|
"eval_steps_per_second": 1.719, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.0022854530910753055, |
|
"grad_norm": 5.609264850616455, |
|
"learning_rate": 0.0, |
|
"loss": 3.8117, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0022854530910753055, |
|
"eval_loss": 3.510607957839966, |
|
"eval_runtime": 3196.7225, |
|
"eval_samples_per_second": 1.721, |
|
"eval_steps_per_second": 1.721, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1, |
|
"total_flos": 2564058316800000.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|