|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 2304, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0390625, |
|
"grad_norm": 2.857225616936394, |
|
"learning_rate": 6.493506493506493e-07, |
|
"loss": 0.7578, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.078125, |
|
"grad_norm": 1.3044758629614164, |
|
"learning_rate": 1.2987012987012986e-06, |
|
"loss": 0.5677, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1171875, |
|
"grad_norm": 1.1974358130130436, |
|
"learning_rate": 1.9480519480519483e-06, |
|
"loss": 0.5243, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 1.0778991302776368, |
|
"learning_rate": 2.597402597402597e-06, |
|
"loss": 0.5118, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1953125, |
|
"grad_norm": 1.0101444029426376, |
|
"learning_rate": 3.246753246753247e-06, |
|
"loss": 0.5146, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.234375, |
|
"grad_norm": 0.97813783772122, |
|
"learning_rate": 3.896103896103897e-06, |
|
"loss": 0.5121, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2734375, |
|
"grad_norm": 1.0146147755881423, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 0.5172, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.0002914434860946, |
|
"learning_rate": 4.999767464405452e-06, |
|
"loss": 0.5053, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3515625, |
|
"grad_norm": 0.9909790519322216, |
|
"learning_rate": 4.995634701567892e-06, |
|
"loss": 0.5197, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.390625, |
|
"grad_norm": 0.9471114669156748, |
|
"learning_rate": 4.986344312601082e-06, |
|
"loss": 0.5087, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4296875, |
|
"grad_norm": 0.990508902551153, |
|
"learning_rate": 4.971915497571788e-06, |
|
"loss": 0.5126, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.9299422099207572, |
|
"learning_rate": 4.9523780759216764e-06, |
|
"loss": 0.5144, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5078125, |
|
"grad_norm": 0.8743256439010634, |
|
"learning_rate": 4.927772424840702e-06, |
|
"loss": 0.5083, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.546875, |
|
"grad_norm": 0.9036662054182406, |
|
"learning_rate": 4.898149395821218e-06, |
|
"loss": 0.5117, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5859375, |
|
"grad_norm": 0.8810653800350299, |
|
"learning_rate": 4.863570209565277e-06, |
|
"loss": 0.5107, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.8846561337964834, |
|
"learning_rate": 4.824106329462313e-06, |
|
"loss": 0.5098, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6640625, |
|
"grad_norm": 0.8997382537130586, |
|
"learning_rate": 4.779839313898675e-06, |
|
"loss": 0.5152, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.703125, |
|
"grad_norm": 0.939340943654687, |
|
"learning_rate": 4.730860647704252e-06, |
|
"loss": 0.5078, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.7421875, |
|
"grad_norm": 0.845279140634368, |
|
"learning_rate": 4.677271553084515e-06, |
|
"loss": 0.5078, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.9487929366289761, |
|
"learning_rate": 4.6191827804287236e-06, |
|
"loss": 0.5073, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8203125, |
|
"grad_norm": 0.9162235285381133, |
|
"learning_rate": 4.556714379426634e-06, |
|
"loss": 0.5194, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.859375, |
|
"grad_norm": 0.8817621527988113, |
|
"learning_rate": 4.489995450966714e-06, |
|
"loss": 0.4997, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8984375, |
|
"grad_norm": 0.8573037310302468, |
|
"learning_rate": 4.419163880328615e-06, |
|
"loss": 0.5008, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.9279340785042338, |
|
"learning_rate": 4.344366052221316e-06, |
|
"loss": 0.5037, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.9765625, |
|
"grad_norm": 0.8075055611747111, |
|
"learning_rate": 4.265756548255823e-06, |
|
"loss": 0.4977, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.015625, |
|
"grad_norm": 1.098595770652355, |
|
"learning_rate": 4.183497827477687e-06, |
|
"loss": 0.4782, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.0546875, |
|
"grad_norm": 0.9988970588938918, |
|
"learning_rate": 4.097759890619539e-06, |
|
"loss": 0.439, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.9731524721989655, |
|
"learning_rate": 4.00871992876753e-06, |
|
"loss": 0.4441, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.1328125, |
|
"grad_norm": 0.9099368305482215, |
|
"learning_rate": 3.916561957167765e-06, |
|
"loss": 0.4438, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.171875, |
|
"grad_norm": 1.0659953860812488, |
|
"learning_rate": 3.82147643492952e-06, |
|
"loss": 0.44, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.2109375, |
|
"grad_norm": 0.9779085212603401, |
|
"learning_rate": 3.723659871411196e-06, |
|
"loss": 0.4406, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.9719837374463801, |
|
"learning_rate": 3.623314420102467e-06, |
|
"loss": 0.4464, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.2890625, |
|
"grad_norm": 1.0182265044301695, |
|
"learning_rate": 3.5206474608419385e-06, |
|
"loss": 0.4462, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.328125, |
|
"grad_norm": 0.9265295707713885, |
|
"learning_rate": 3.415871171233709e-06, |
|
"loss": 0.4412, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.3671875, |
|
"grad_norm": 1.0015451544453786, |
|
"learning_rate": 3.3092020881486085e-06, |
|
"loss": 0.4395, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.93431276827024, |
|
"learning_rate": 3.2008606602163023e-06, |
|
"loss": 0.4425, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.4453125, |
|
"grad_norm": 0.9255587183512949, |
|
"learning_rate": 3.091070792233124e-06, |
|
"loss": 0.439, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.484375, |
|
"grad_norm": 0.9343974096084879, |
|
"learning_rate": 2.9800593824272027e-06, |
|
"loss": 0.4354, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.5234375, |
|
"grad_norm": 1.042370945691791, |
|
"learning_rate": 2.8680558535371688e-06, |
|
"loss": 0.4404, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.9126651351226419, |
|
"learning_rate": 2.7552916786735744e-06, |
|
"loss": 0.4431, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.6015625, |
|
"grad_norm": 0.9963009170045803, |
|
"learning_rate": 2.641999902942882e-06, |
|
"loss": 0.43, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.640625, |
|
"grad_norm": 0.9703924532720508, |
|
"learning_rate": 2.5284146618226807e-06, |
|
"loss": 0.449, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.6796875, |
|
"grad_norm": 0.8989408268444277, |
|
"learning_rate": 2.414770697283471e-06, |
|
"loss": 0.4387, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.9709563268893221, |
|
"learning_rate": 2.3013028726570436e-06, |
|
"loss": 0.444, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.7578125, |
|
"grad_norm": 0.9557991402725722, |
|
"learning_rate": 2.188245687254035e-06, |
|
"loss": 0.4394, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.796875, |
|
"grad_norm": 0.9603778434937646, |
|
"learning_rate": 2.075832791733802e-06, |
|
"loss": 0.4473, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.8359375, |
|
"grad_norm": 0.9476806986189421, |
|
"learning_rate": 1.9642965052281618e-06, |
|
"loss": 0.4404, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.9085522492818641, |
|
"learning_rate": 1.8538673352169467e-06, |
|
"loss": 0.4446, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.9140625, |
|
"grad_norm": 0.9480501458847437, |
|
"learning_rate": 1.744773501147627e-06, |
|
"loss": 0.4236, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.953125, |
|
"grad_norm": 0.9099438022581319, |
|
"learning_rate": 1.6372404627835182e-06, |
|
"loss": 0.4352, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.9921875, |
|
"grad_norm": 0.9130000879221961, |
|
"learning_rate": 1.5314904542553099e-06, |
|
"loss": 0.4344, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 1.072785526043702, |
|
"learning_rate": 1.4277420247788842e-06, |
|
"loss": 0.3877, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.0703125, |
|
"grad_norm": 1.0213733525555977, |
|
"learning_rate": 1.3262095869885907e-06, |
|
"loss": 0.3748, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.109375, |
|
"grad_norm": 1.0378622376674729, |
|
"learning_rate": 1.227102973819426e-06, |
|
"loss": 0.3801, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.1484375, |
|
"grad_norm": 1.0636201329760862, |
|
"learning_rate": 1.1306270048538966e-06, |
|
"loss": 0.3623, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 1.065350906800766, |
|
"learning_rate": 1.0369810630297658e-06, |
|
"loss": 0.3652, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.2265625, |
|
"grad_norm": 1.065580953707456, |
|
"learning_rate": 9.463586825834939e-07, |
|
"loss": 0.3724, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.265625, |
|
"grad_norm": 1.1044573731710503, |
|
"learning_rate": 8.589471490809473e-07, |
|
"loss": 0.3639, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.3046875, |
|
"grad_norm": 1.057219345369191, |
|
"learning_rate": 7.749271123619889e-07, |
|
"loss": 0.3665, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 1.120692356243222, |
|
"learning_rate": 6.944722131988394e-07, |
|
"loss": 0.3624, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.3828125, |
|
"grad_norm": 1.1354171819466858, |
|
"learning_rate": 6.177487244398009e-07, |
|
"loss": 0.3629, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.421875, |
|
"grad_norm": 1.0454720627484864, |
|
"learning_rate": 5.449152073799616e-07, |
|
"loss": 0.3739, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.4609375, |
|
"grad_norm": 1.1194418224635865, |
|
"learning_rate": 4.761221840690586e-07, |
|
"loss": 0.3723, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.097996189333425, |
|
"learning_rate": 4.115118262337128e-07, |
|
"loss": 0.377, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.5390625, |
|
"grad_norm": 1.0208876828373903, |
|
"learning_rate": 3.512176614569418e-07, |
|
"loss": 0.3676, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.578125, |
|
"grad_norm": 1.022735887067447, |
|
"learning_rate": 2.9536429722216207e-07, |
|
"loss": 0.3714, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.6171875, |
|
"grad_norm": 1.04711360457167, |
|
"learning_rate": 2.440671633920075e-07, |
|
"loss": 0.3733, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 1.0415975853212511, |
|
"learning_rate": 1.9743227365415092e-07, |
|
"loss": 0.3694, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.6953125, |
|
"grad_norm": 1.106325100873404, |
|
"learning_rate": 1.5555600642715442e-07, |
|
"loss": 0.3747, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.734375, |
|
"grad_norm": 1.0448870665618393, |
|
"learning_rate": 1.1852490567913655e-07, |
|
"loss": 0.3611, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.7734375, |
|
"grad_norm": 1.1214742422046289, |
|
"learning_rate": 8.641550207089039e-08, |
|
"loss": 0.3686, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 1.0756561712922115, |
|
"learning_rate": 5.92941547931028e-08, |
|
"loss": 0.3716, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.8515625, |
|
"grad_norm": 1.057575658588282, |
|
"learning_rate": 3.7216914424527686e-08, |
|
"loss": 0.3624, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.890625, |
|
"grad_norm": 1.0540930132763664, |
|
"learning_rate": 2.0229407094547736e-08, |
|
"loss": 0.369, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.9296875, |
|
"grad_norm": 1.105125578397597, |
|
"learning_rate": 8.366740189520716e-09, |
|
"loss": 0.3668, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 0.9862762687662256, |
|
"learning_rate": 1.6534297977804925e-09, |
|
"loss": 0.3621, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 2304, |
|
"total_flos": 415352546656256.0, |
|
"train_loss": 0.4438822174237834, |
|
"train_runtime": 11247.4679, |
|
"train_samples_per_second": 26.212, |
|
"train_steps_per_second": 0.205 |
|
} |
|
], |
|
"logging_steps": 30, |
|
"max_steps": 2304, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 256, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 415352546656256.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|