{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.945945945945946,
  "eval_steps": 500,
  "global_step": 460,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 0.1806640625,
      "learning_rate": 0.00019976687691905393,
      "loss": 0.7107,
      "step": 10
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 0.235595703125,
      "learning_rate": 0.00019906859460363307,
      "loss": 0.6952,
      "step": 20
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 0.423583984375,
      "learning_rate": 0.00019790840876823232,
      "loss": 0.6967,
      "step": 30
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 0.3974609375,
      "learning_rate": 0.00019629172873477995,
      "loss": 0.7649,
      "step": 40
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.327880859375,
      "learning_rate": 0.00019422609221188207,
      "loss": 0.7627,
      "step": 50
    },
    {
      "epoch": 1.2972972972972974,
      "grad_norm": 0.31689453125,
      "learning_rate": 0.00019172113015054532,
      "loss": 0.7527,
      "step": 60
    },
    {
      "epoch": 1.5135135135135136,
      "grad_norm": 0.4111328125,
      "learning_rate": 0.0001887885218402375,
      "loss": 0.7536,
      "step": 70
    },
    {
      "epoch": 1.7297297297297298,
      "grad_norm": 1.376953125,
      "learning_rate": 0.00018544194045464886,
      "loss": 0.7241,
      "step": 80
    },
    {
      "epoch": 1.945945945945946,
      "grad_norm": 0.387939453125,
      "learning_rate": 0.0001816969893010442,
      "loss": 0.757,
      "step": 90
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.366455078125,
      "learning_rate": 0.000177571129070442,
      "loss": 0.7339,
      "step": 100
    },
    {
      "epoch": 2.3783783783783785,
      "grad_norm": 0.419677734375,
      "learning_rate": 0.00017308359642781242,
      "loss": 0.7005,
      "step": 110
    },
    {
      "epoch": 2.5945945945945947,
      "grad_norm": 0.38818359375,
      "learning_rate": 0.00016825531432186543,
      "loss": 0.7167,
      "step": 120
    },
    {
      "epoch": 2.810810810810811,
      "grad_norm": 0.452880859375,
      "learning_rate": 0.00016310879443260528,
      "loss": 0.7403,
      "step": 130
    },
    {
      "epoch": 3.027027027027027,
      "grad_norm": 0.412841796875,
      "learning_rate": 0.00015766803221148673,
      "loss": 0.7248,
      "step": 140
    },
    {
      "epoch": 3.2432432432432434,
      "grad_norm": 0.410400390625,
      "learning_rate": 0.00015195839500354335,
      "loss": 0.7208,
      "step": 150
    },
    {
      "epoch": 3.4594594594594597,
      "grad_norm": 0.484619140625,
      "learning_rate": 0.00014600650377311522,
      "loss": 0.7079,
      "step": 160
    },
    {
      "epoch": 3.6756756756756754,
      "grad_norm": 0.429443359375,
      "learning_rate": 0.00013984010898462416,
      "loss": 0.6847,
      "step": 170
    },
    {
      "epoch": 3.891891891891892,
      "grad_norm": 0.437744140625,
      "learning_rate": 0.00013348796121709862,
      "loss": 0.7104,
      "step": 180
    },
    {
      "epoch": 4.108108108108108,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00012697967711570242,
      "loss": 0.682,
      "step": 190
    },
    {
      "epoch": 4.324324324324325,
      "grad_norm": 0.484130859375,
      "learning_rate": 0.0001203456013052634,
      "loss": 0.6754,
      "step": 200
    },
    {
      "epoch": 4.54054054054054,
      "grad_norm": 0.474853515625,
      "learning_rate": 0.00011361666490962468,
      "loss": 0.6671,
      "step": 210
    },
    {
      "epoch": 4.756756756756757,
      "grad_norm": 0.474365234375,
      "learning_rate": 0.0001068242413364671,
      "loss": 0.6956,
      "step": 220
    },
    {
      "epoch": 4.972972972972973,
      "grad_norm": 0.484130859375,
      "learning_rate": 0.0001,
      "loss": 0.6679,
      "step": 230
    },
    {
      "epoch": 5.1891891891891895,
      "grad_norm": 0.47265625,
      "learning_rate": 9.317575866353292e-05,
      "loss": 0.6496,
      "step": 240
    },
    {
      "epoch": 5.405405405405405,
      "grad_norm": 0.44482421875,
      "learning_rate": 8.638333509037536e-05,
      "loss": 0.6547,
      "step": 250
    },
    {
      "epoch": 5.621621621621622,
      "grad_norm": 0.488037109375,
      "learning_rate": 7.965439869473664e-05,
      "loss": 0.6739,
      "step": 260
    },
    {
      "epoch": 5.837837837837838,
      "grad_norm": 0.4658203125,
      "learning_rate": 7.302032288429756e-05,
      "loss": 0.6501,
      "step": 270
    },
    {
      "epoch": 6.054054054054054,
      "grad_norm": 0.48974609375,
      "learning_rate": 6.651203878290139e-05,
      "loss": 0.6773,
      "step": 280
    },
    {
      "epoch": 6.27027027027027,
      "grad_norm": 0.4775390625,
      "learning_rate": 6.015989101537586e-05,
      "loss": 0.6298,
      "step": 290
    },
    {
      "epoch": 6.486486486486487,
      "grad_norm": 0.521484375,
      "learning_rate": 5.399349622688479e-05,
      "loss": 0.6464,
      "step": 300
    },
    {
      "epoch": 6.702702702702703,
      "grad_norm": 0.5234375,
      "learning_rate": 4.804160499645667e-05,
      "loss": 0.6369,
      "step": 310
    },
    {
      "epoch": 6.918918918918919,
      "grad_norm": 0.59765625,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 0.6734,
      "step": 320
    },
    {
      "epoch": 7.135135135135135,
      "grad_norm": 0.54736328125,
      "learning_rate": 3.689120556739475e-05,
      "loss": 0.6321,
      "step": 330
    },
    {
      "epoch": 7.351351351351352,
      "grad_norm": 0.57080078125,
      "learning_rate": 3.174468567813461e-05,
      "loss": 0.6261,
      "step": 340
    },
    {
      "epoch": 7.5675675675675675,
      "grad_norm": 0.47705078125,
      "learning_rate": 2.691640357218759e-05,
      "loss": 0.6196,
      "step": 350
    },
    {
      "epoch": 7.783783783783784,
      "grad_norm": 0.5712890625,
      "learning_rate": 2.242887092955801e-05,
      "loss": 0.6505,
      "step": 360
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.50732421875,
      "learning_rate": 1.8303010698955804e-05,
      "loss": 0.6331,
      "step": 370
    },
    {
      "epoch": 8.216216216216216,
      "grad_norm": 0.48046875,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 0.6366,
      "step": 380
    },
    {
      "epoch": 8.432432432432432,
      "grad_norm": 0.49951171875,
      "learning_rate": 1.1211478159762478e-05,
      "loss": 0.641,
      "step": 390
    },
    {
      "epoch": 8.64864864864865,
      "grad_norm": 0.5078125,
      "learning_rate": 8.278869849454718e-06,
      "loss": 0.6233,
      "step": 400
    },
    {
      "epoch": 8.864864864864865,
      "grad_norm": 0.50634765625,
      "learning_rate": 5.77390778811796e-06,
      "loss": 0.6185,
      "step": 410
    },
    {
      "epoch": 9.08108108108108,
      "grad_norm": 0.53125,
      "learning_rate": 3.7082712652200867e-06,
      "loss": 0.6224,
      "step": 420
    },
    {
      "epoch": 9.297297297297296,
      "grad_norm": 0.487548828125,
      "learning_rate": 2.091591231767709e-06,
      "loss": 0.6054,
      "step": 430
    },
    {
      "epoch": 9.513513513513514,
      "grad_norm": 0.5166015625,
      "learning_rate": 9.314053963669245e-07,
      "loss": 0.6299,
      "step": 440
    },
    {
      "epoch": 9.72972972972973,
      "grad_norm": 0.4990234375,
      "learning_rate": 2.3312308094607382e-07,
      "loss": 0.6193,
      "step": 450
    },
    {
      "epoch": 9.945945945945946,
      "grad_norm": 0.53271484375,
      "learning_rate": 0.0,
      "loss": 0.6231,
      "step": 460
    },
    {
      "epoch": 9.945945945945946,
      "step": 460,
      "total_flos": 2.244679865204736e+16,
      "train_loss": 0.6764944273492565,
      "train_runtime": 411.2593,
      "train_samples_per_second": 4.498,
      "train_steps_per_second": 1.119
    }
  ],
  "logging_steps": 10,
  "max_steps": 460,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 2.244679865204736e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}