|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 50.0, |
|
"eval_steps": 500, |
|
"global_step": 118100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.353366494178772, |
|
"learning_rate": 0.0002, |
|
"loss": 3.6004, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 1.4591169357299805, |
|
"learning_rate": 0.0002, |
|
"loss": 3.3767, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 1.5558066368103027, |
|
"learning_rate": 0.0002, |
|
"loss": 3.3035, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 8.47, |
|
"grad_norm": 1.54267418384552, |
|
"learning_rate": 0.0002, |
|
"loss": 3.2643, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 10.58, |
|
"grad_norm": 1.6217669248580933, |
|
"learning_rate": 0.0002, |
|
"loss": 3.238, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 12.7, |
|
"grad_norm": 1.5397529602050781, |
|
"learning_rate": 0.0002, |
|
"loss": 3.2201, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 14.82, |
|
"grad_norm": 1.5048496723175049, |
|
"learning_rate": 0.0002, |
|
"loss": 3.2048, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 16.93, |
|
"grad_norm": 1.5197534561157227, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1947, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 19.05, |
|
"grad_norm": 1.5024417638778687, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1829, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 21.17, |
|
"grad_norm": 1.5805625915527344, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1729, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 23.29, |
|
"grad_norm": 1.5682896375656128, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1681, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 25.4, |
|
"grad_norm": 1.5147111415863037, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1619, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 27.52, |
|
"grad_norm": 1.6233525276184082, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1557, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 29.64, |
|
"grad_norm": 1.5563185214996338, |
|
"learning_rate": 0.0002, |
|
"loss": 3.153, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 31.75, |
|
"grad_norm": 1.5635435581207275, |
|
"learning_rate": 0.0002, |
|
"loss": 3.151, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 33.87, |
|
"grad_norm": 1.471053957939148, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1449, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 35.99, |
|
"grad_norm": 1.5087348222732544, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1414, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 38.1, |
|
"grad_norm": 1.6342508792877197, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1363, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 40.22, |
|
"grad_norm": 1.7150408029556274, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1327, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 42.34, |
|
"grad_norm": 1.8055483102798462, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1319, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 44.45, |
|
"grad_norm": 1.509770154953003, |
|
"learning_rate": 0.0002, |
|
"loss": 3.13, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 46.57, |
|
"grad_norm": 1.6583279371261597, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1266, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 48.69, |
|
"grad_norm": 1.7038261890411377, |
|
"learning_rate": 0.0002, |
|
"loss": 3.1273, |
|
"step": 115000 |
|
} |
|
], |
|
"logging_steps": 5000, |
|
"max_steps": 118100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 7.86717088860818e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|