{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997401022264576,
  "eval_steps": 500,
  "global_step": 2885,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "grad_norm": 22.53300666809082,
      "learning_rate": 6.71280276816609e-06,
      "loss": 3.1491,
      "step": 100
    },
    {
      "epoch": 0.07,
      "grad_norm": 36.242984771728516,
      "learning_rate": 1.356401384083045e-05,
      "loss": 2.6131,
      "step": 200
    },
    {
      "epoch": 0.1,
      "grad_norm": 25.216327667236328,
      "learning_rate": 1.999973639055537e-05,
      "loss": 2.5726,
      "step": 300
    },
    {
      "epoch": 0.14,
      "grad_norm": 14.273802757263184,
      "learning_rate": 1.9917836961775225e-05,
      "loss": 2.4989,
      "step": 400
    },
    {
      "epoch": 0.17,
      "grad_norm": 27.216812133789062,
      "learning_rate": 1.969086765436979e-05,
      "loss": 2.5906,
      "step": 500
    },
    {
      "epoch": 0.21,
      "grad_norm": 18.74100112915039,
      "learning_rate": 1.9322148386785378e-05,
      "loss": 2.4275,
      "step": 600
    },
    {
      "epoch": 0.24,
      "grad_norm": 20.627084732055664,
      "learning_rate": 1.8817072478109763e-05,
      "loss": 2.5103,
      "step": 700
    },
    {
      "epoch": 0.28,
      "grad_norm": 15.611855506896973,
      "learning_rate": 1.818302775908169e-05,
      "loss": 2.3706,
      "step": 800
    },
    {
      "epoch": 0.31,
      "grad_norm": 25.303524017333984,
      "learning_rate": 1.7429288509041197e-05,
      "loss": 2.3601,
      "step": 900
    },
    {
      "epoch": 0.35,
      "grad_norm": 20.18657875061035,
      "learning_rate": 1.6566879799477148e-05,
      "loss": 2.5054,
      "step": 1000
    },
    {
      "epoch": 0.38,
      "grad_norm": 17.65004539489746,
      "learning_rate": 1.560841622844192e-05,
      "loss": 2.3717,
      "step": 1100
    },
    {
      "epoch": 0.42,
      "grad_norm": 19.5482177734375,
      "learning_rate": 1.4578679381126853e-05,
      "loss": 2.3772,
      "step": 1200
    },
    {
      "epoch": 0.45,
      "grad_norm": 14.92688274383545,
      "learning_rate": 1.3471954275891059e-05,
      "loss": 2.2991,
      "step": 1300
    },
    {
      "epoch": 0.49,
      "grad_norm": 10.425432205200195,
      "learning_rate": 1.2314444308256605e-05,
      "loss": 2.2865,
      "step": 1400
    },
    {
      "epoch": 0.52,
      "grad_norm": 16.403301239013672,
      "learning_rate": 1.1123080572287608e-05,
      "loss": 2.2595,
      "step": 1500
    },
    {
      "epoch": 0.55,
      "grad_norm": 11.935959815979004,
      "learning_rate": 9.915289346843219e-06,
      "loss": 2.3662,
      "step": 1600
    },
    {
      "epoch": 0.59,
      "grad_norm": 18.410987854003906,
      "learning_rate": 8.708737198449509e-06,
      "loss": 2.2021,
      "step": 1700
    },
    {
      "epoch": 0.62,
      "grad_norm": 15.293601036071777,
      "learning_rate": 7.521072569442963e-06,
      "loss": 2.2545,
      "step": 1800
    },
    {
      "epoch": 0.66,
      "grad_norm": 16.34610939025879,
      "learning_rate": 6.369667631219584e-06,
      "loss": 2.3199,
      "step": 1900
    },
    {
      "epoch": 0.69,
      "grad_norm": 15.948208808898926,
      "learning_rate": 5.2713641785457504e-06,
      "loss": 2.2029,
      "step": 2000
    },
    {
      "epoch": 0.73,
      "grad_norm": 27.17706298828125,
      "learning_rate": 4.242227281777747e-06,
      "loss": 2.2861,
      "step": 2100
    },
    {
      "epoch": 0.76,
      "grad_norm": 19.407489776611328,
      "learning_rate": 3.297310300360622e-06,
      "loss": 2.2157,
      "step": 2200
    },
    {
      "epoch": 0.8,
      "grad_norm": 11.622710227966309,
      "learning_rate": 2.450434694793621e-06,
      "loss": 2.2724,
      "step": 2300
    },
    {
      "epoch": 0.83,
      "grad_norm": 16.701732635498047,
      "learning_rate": 1.7139878577898772e-06,
      "loss": 2.1622,
      "step": 2400
    },
    {
      "epoch": 0.87,
      "grad_norm": 10.720149040222168,
      "learning_rate": 1.0987419217881333e-06,
      "loss": 2.2026,
      "step": 2500
    },
    {
      "epoch": 0.9,
      "grad_norm": 14.398381233215332,
      "learning_rate": 6.136961931496943e-07,
      "loss": 2.2619,
      "step": 2600
    },
    {
      "epoch": 0.94,
      "grad_norm": 16.95086669921875,
      "learning_rate": 2.6594551778223896e-07,
      "loss": 2.2626,
      "step": 2700
    },
    {
      "epoch": 0.97,
      "grad_norm": 12.132495880126953,
      "learning_rate": 6.057650362879753e-08,
      "loss": 2.1139,
      "step": 2800
    },
    {
      "epoch": 1.0,
      "step": 2885,
      "total_flos": 1.1131515504795648e+16,
      "train_loss": 2.3624265621191913,
      "train_runtime": 6766.2575,
      "train_samples_per_second": 1.706,
      "train_steps_per_second": 0.426
    }
  ],
  "logging_steps": 100,
  "max_steps": 2885,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 1.1131515504795648e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}