|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 212, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 56.48654000341791, |
|
"learning_rate": 2.2727272727272729e-07, |
|
"loss": 1.4277, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 32.7318603335361, |
|
"learning_rate": 1.1363636363636364e-06, |
|
"loss": 1.3555, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 18.95719710067938, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 1.1538, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.971441284336922, |
|
"learning_rate": 3.409090909090909e-06, |
|
"loss": 1.0874, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.3823086685656945, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.0206, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.089694965230436, |
|
"learning_rate": 4.9969249228707625e-06, |
|
"loss": 1.003, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.6458232095508354, |
|
"learning_rate": 4.978160173317439e-06, |
|
"loss": 0.9657, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.2176159938800266, |
|
"learning_rate": 4.942467076958999e-06, |
|
"loss": 0.9435, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.977792325165127, |
|
"learning_rate": 4.890089453835894e-06, |
|
"loss": 0.9368, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.1118165842382504, |
|
"learning_rate": 4.821385096224268e-06, |
|
"loss": 0.9261, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.3730430715720994, |
|
"learning_rate": 4.736823324551909e-06, |
|
"loss": 0.9112, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.1574351012684474, |
|
"learning_rate": 4.636981781463848e-06, |
|
"loss": 0.9145, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.0769386301276183, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.8814, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.055053060479708, |
|
"learning_rate": 4.394287174400838e-06, |
|
"loss": 0.8751, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.4232076990901, |
|
"learning_rate": 4.253091960681222e-06, |
|
"loss": 0.8909, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.2020191326645273, |
|
"learning_rate": 4.099921351258292e-06, |
|
"loss": 0.8597, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.290349939143738, |
|
"learning_rate": 3.935821656707359e-06, |
|
"loss": 0.8913, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.4145364641074396, |
|
"learning_rate": 3.76191384433711e-06, |
|
"loss": 0.8613, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.263977821412841, |
|
"learning_rate": 3.579385880846232e-06, |
|
"loss": 0.8759, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 10.655125505241807, |
|
"learning_rate": 3.3894846173062917e-06, |
|
"loss": 0.8519, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 250.56147092672978, |
|
"learning_rate": 3.193507271904612e-06, |
|
"loss": 0.8517, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.419289400374, |
|
"learning_rate": 2.9927925686287006e-06, |
|
"loss": 0.8594, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.263295130566819, |
|
"learning_rate": 2.788711592423966e-06, |
|
"loss": 0.849, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.466296576337294, |
|
"learning_rate": 2.5826584232932707e-06, |
|
"loss": 0.8876, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.464842972592367, |
|
"learning_rate": 2.376040613316944e-06, |
|
"loss": 0.8605, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.301898509073216, |
|
"learning_rate": 2.1702695716448276e-06, |
|
"loss": 0.8485, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.3264276395542414, |
|
"learning_rate": 1.9667509231406332e-06, |
|
"loss": 0.84, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.2310503156717245, |
|
"learning_rate": 1.7668749065388385e-06, |
|
"loss": 0.835, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.361233531250762, |
|
"learning_rate": 1.5720068777044479e-06, |
|
"loss": 0.8413, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.229898438412517, |
|
"learning_rate": 1.383477982867984e-06, |
|
"loss": 0.8471, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.082051479835871, |
|
"learning_rate": 1.2025760655469629e-06, |
|
"loss": 0.8435, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.392093980236051, |
|
"learning_rate": 1.0305368692688175e-06, |
|
"loss": 0.8405, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.3593945187641316, |
|
"learning_rate": 8.685355961895783e-07, |
|
"loss": 0.8475, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.183216674509743, |
|
"learning_rate": 7.176788792715076e-07, |
|
"loss": 0.8551, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.1059443889529974, |
|
"learning_rate": 5.78997222857853e-07, |
|
"loss": 0.8347, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.1090745975048404, |
|
"learning_rate": 4.534379632832692e-07, |
|
"loss": 0.8251, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.1186904842842007, |
|
"learning_rate": 3.4185879760606525e-07, |
|
"loss": 0.816, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 13.97461867271608, |
|
"learning_rate": 2.450219246676028e-07, |
|
"loss": 0.8235, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.0246555374160558, |
|
"learning_rate": 1.6358883850134815e-07, |
|
"loss": 0.8356, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.969712888409742, |
|
"learning_rate": 9.811580965787965e-08, |
|
"loss": 0.8327, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.9445398637597826, |
|
"learning_rate": 4.905008531297661e-08, |
|
"loss": 0.86, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.096896769727839, |
|
"learning_rate": 1.6726834115904645e-08, |
|
"loss": 0.8216, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.9776364501353694, |
|
"learning_rate": 1.3668566476848777e-09, |
|
"loss": 0.8468, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.3747081756591797, |
|
"eval_runtime": 1.4843, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 0.674, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 212, |
|
"total_flos": 88776974008320.0, |
|
"train_loss": 0.8946974671111917, |
|
"train_runtime": 2355.8103, |
|
"train_samples_per_second": 5.733, |
|
"train_steps_per_second": 0.09 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 212, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 88776974008320.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|