{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.016689645163631087,
  "eval_steps": 500,
  "global_step": 230,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007256367462448298,
      "grad_norm": 12.0,
      "learning_rate": 5e-05,
      "loss": 3.0993,
      "step": 10
    },
    {
      "epoch": 0.0014512734924896596,
      "grad_norm": 10.4375,
      "learning_rate": 0.0001,
      "loss": 2.208,
      "step": 20
    },
    {
      "epoch": 0.0021769102387344894,
      "grad_norm": 6.5625,
      "learning_rate": 0.00015,
      "loss": 1.3285,
      "step": 30
    },
    {
      "epoch": 0.0029025469849793192,
      "grad_norm": 6.21875,
      "learning_rate": 0.0002,
      "loss": 0.6895,
      "step": 40
    },
    {
      "epoch": 0.003628183731224149,
      "grad_norm": 4.15625,
      "learning_rate": 0.00025,
      "loss": 0.714,
      "step": 50
    },
    {
      "epoch": 0.004353820477468979,
      "grad_norm": 4.71875,
      "learning_rate": 0.0003,
      "loss": 0.4849,
      "step": 60
    },
    {
      "epoch": 0.005079457223713809,
      "grad_norm": 3.828125,
      "learning_rate": 0.00035,
      "loss": 0.3671,
      "step": 70
    },
    {
      "epoch": 0.0058050939699586385,
      "grad_norm": 4.03125,
      "learning_rate": 0.0004,
      "loss": 0.5693,
      "step": 80
    },
    {
      "epoch": 0.006530730716203468,
      "grad_norm": 3.296875,
      "learning_rate": 0.00045000000000000004,
      "loss": 0.4133,
      "step": 90
    },
    {
      "epoch": 0.007256367462448298,
      "grad_norm": 2.640625,
      "learning_rate": 0.0005,
      "loss": 0.2668,
      "step": 100
    },
    {
      "epoch": 0.007982004208693128,
      "grad_norm": 1.5859375,
      "learning_rate": 0.000499999340865746,
      "loss": 0.2922,
      "step": 110
    },
    {
      "epoch": 0.008707640954937958,
      "grad_norm": 3.8125,
      "learning_rate": 0.0004999973634664594,
      "loss": 0.3996,
      "step": 120
    },
    {
      "epoch": 0.009433277701182788,
      "grad_norm": 2.0625,
      "learning_rate": 0.0004999940678125673,
      "loss": 0.2841,
      "step": 130
    },
    {
      "epoch": 0.010158914447427617,
      "grad_norm": 3.390625,
      "learning_rate": 0.000499989453921448,
      "loss": 0.3003,
      "step": 140
    },
    {
      "epoch": 0.010884551193672447,
      "grad_norm": 1.65625,
      "learning_rate": 0.0004999835218174307,
      "loss": 0.2747,
      "step": 150
    },
    {
      "epoch": 0.011610187939917277,
      "grad_norm": 1.671875,
      "learning_rate": 0.000499976271531796,
      "loss": 0.37,
      "step": 160
    },
    {
      "epoch": 0.012335824686162107,
      "grad_norm": 1.0390625,
      "learning_rate": 0.000499967703102775,
      "loss": 0.2163,
      "step": 170
    },
    {
      "epoch": 0.013061461432406937,
      "grad_norm": 2.546875,
      "learning_rate": 0.00049995781657555,
      "loss": 0.3652,
      "step": 180
    },
    {
      "epoch": 0.013787098178651766,
      "grad_norm": 3.71875,
      "learning_rate": 0.000499946612002253,
      "loss": 0.3177,
      "step": 190
    },
    {
      "epoch": 0.014512734924896596,
      "grad_norm": 2.6875,
      "learning_rate": 0.0004999340894419668,
      "loss": 0.2043,
      "step": 200
    },
    {
      "epoch": 0.015238371671141426,
      "grad_norm": 0.5,
      "learning_rate": 0.0004999202489607236,
      "loss": 0.2865,
      "step": 210
    },
    {
      "epoch": 0.015964008417386256,
      "grad_norm": 3.015625,
      "learning_rate": 0.0004999050906315055,
      "loss": 0.2039,
      "step": 220
    },
    {
      "epoch": 0.016689645163631087,
      "grad_norm": 0.99609375,
      "learning_rate": 0.0004998886145342434,
      "loss": 0.3509,
      "step": 230
    }
  ],
  "logging_steps": 10,
  "max_steps": 13781,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "total_flos": 0.0,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}