|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 740, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"grad_norm": 10.65625, |
|
"learning_rate": 0.00019990989662046818, |
|
"loss": 1.0534, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 10.4765625, |
|
"learning_rate": 0.00019963974885425266, |
|
"loss": 0.9925, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"grad_norm": 12.546875, |
|
"learning_rate": 0.00019919004352588767, |
|
"loss": 0.9846, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 10.9296875, |
|
"learning_rate": 0.00019856159103477086, |
|
"loss": 0.9861, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"grad_norm": 12.125, |
|
"learning_rate": 0.00019775552389476864, |
|
"loss": 0.9614, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 11.1640625, |
|
"learning_rate": 0.0001967732946933499, |
|
"loss": 0.9254, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"grad_norm": 11.2109375, |
|
"learning_rate": 0.00019561667347392508, |
|
"loss": 0.9275, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0810810810810811, |
|
"grad_norm": 12.0703125, |
|
"learning_rate": 0.00019428774454610843, |
|
"loss": 0.9252, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2162162162162162, |
|
"grad_norm": 10.671875, |
|
"learning_rate": 0.00019278890272965096, |
|
"loss": 0.8997, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3513513513513513, |
|
"grad_norm": 10.0234375, |
|
"learning_rate": 0.0001911228490388136, |
|
"loss": 0.8997, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.4864864864864864, |
|
"grad_norm": 9.828125, |
|
"learning_rate": 0.00018929258581495685, |
|
"loss": 0.9081, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.6216216216216215, |
|
"grad_norm": 9.78125, |
|
"learning_rate": 0.00018730141131611882, |
|
"loss": 0.88, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.7567567567567568, |
|
"grad_norm": 9.625, |
|
"learning_rate": 0.00018515291377333112, |
|
"loss": 0.8677, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.8918918918918919, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 0.00018285096492438424, |
|
"loss": 0.8999, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.027027027027027, |
|
"grad_norm": 11.6640625, |
|
"learning_rate": 0.00018039971303669407, |
|
"loss": 0.9, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.1621621621621623, |
|
"grad_norm": 12.2109375, |
|
"learning_rate": 0.00017780357543184397, |
|
"loss": 0.912, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.2972972972972974, |
|
"grad_norm": 14.1640625, |
|
"learning_rate": 0.00017506723052527242, |
|
"loss": 0.8953, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.4324324324324325, |
|
"grad_norm": 15.015625, |
|
"learning_rate": 0.00017219560939545246, |
|
"loss": 0.9759, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.5675675675675675, |
|
"grad_norm": 15.5546875, |
|
"learning_rate": 0.00016919388689775464, |
|
"loss": 1.0338, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 16.5, |
|
"learning_rate": 0.00016606747233900815, |
|
"loss": 1.0389, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.8378378378378377, |
|
"grad_norm": 16.9375, |
|
"learning_rate": 0.00016282199972956425, |
|
"loss": 1.0149, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 14.21875, |
|
"learning_rate": 0.00015946331763042867, |
|
"loss": 0.9946, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.108108108108108, |
|
"grad_norm": 11.09375, |
|
"learning_rate": 0.00015599747861375955, |
|
"loss": 0.9629, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.2432432432432434, |
|
"grad_norm": 13.4921875, |
|
"learning_rate": 0.00015243072835572318, |
|
"loss": 0.9677, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.3783783783783785, |
|
"grad_norm": 13.3046875, |
|
"learning_rate": 0.00014876949438136347, |
|
"loss": 0.9327, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.5135135135135136, |
|
"grad_norm": 13.03125, |
|
"learning_rate": 0.00014502037448176734, |
|
"loss": 0.9566, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.6486486486486487, |
|
"grad_norm": 11.9921875, |
|
"learning_rate": 0.0001411901248243993, |
|
"loss": 0.913, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.7837837837837838, |
|
"grad_norm": 10.953125, |
|
"learning_rate": 0.00013728564777803088, |
|
"loss": 0.9266, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.918918918918919, |
|
"grad_norm": 12.15625, |
|
"learning_rate": 0.00013331397947420576, |
|
"loss": 0.8859, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.054054054054054, |
|
"grad_norm": 13.1171875, |
|
"learning_rate": 0.00012928227712765504, |
|
"loss": 0.8896, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.1891891891891895, |
|
"grad_norm": 11.8203125, |
|
"learning_rate": 0.00012519780613851254, |
|
"loss": 0.8317, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.324324324324325, |
|
"grad_norm": 12.53125, |
|
"learning_rate": 0.00012106792699957263, |
|
"loss": 0.8712, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.45945945945946, |
|
"grad_norm": 12.671875, |
|
"learning_rate": 0.00011690008203218493, |
|
"loss": 0.8436, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.594594594594595, |
|
"grad_norm": 15.8515625, |
|
"learning_rate": 0.00011270178197468789, |
|
"loss": 0.8538, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.72972972972973, |
|
"grad_norm": 12.46875, |
|
"learning_rate": 0.00010848059244755093, |
|
"loss": 0.8787, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.864864864864865, |
|
"grad_norm": 11.921875, |
|
"learning_rate": 0.00010424412031961484, |
|
"loss": 0.8823, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 11.8203125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8839, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.135135135135135, |
|
"grad_norm": 11.515625, |
|
"learning_rate": 9.57558796803852e-05, |
|
"loss": 0.8362, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.27027027027027, |
|
"grad_norm": 12.15625, |
|
"learning_rate": 9.151940755244912e-05, |
|
"loss": 0.8678, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.405405405405405, |
|
"grad_norm": 13.109375, |
|
"learning_rate": 8.729821802531212e-05, |
|
"loss": 0.8372, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.54054054054054, |
|
"grad_norm": 12.96875, |
|
"learning_rate": 8.309991796781511e-05, |
|
"loss": 0.8662, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 5.675675675675675, |
|
"grad_norm": 15.203125, |
|
"learning_rate": 7.89320730004274e-05, |
|
"loss": 0.9021, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.8108108108108105, |
|
"grad_norm": 12.4765625, |
|
"learning_rate": 7.48021938614875e-05, |
|
"loss": 0.8902, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.945945945945946, |
|
"grad_norm": 12.953125, |
|
"learning_rate": 7.071772287234497e-05, |
|
"loss": 0.852, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.081081081081081, |
|
"grad_norm": 14.9609375, |
|
"learning_rate": 6.668602052579424e-05, |
|
"loss": 0.8743, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.216216216216216, |
|
"grad_norm": 12.8828125, |
|
"learning_rate": 6.271435222196916e-05, |
|
"loss": 0.8844, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 6.351351351351352, |
|
"grad_norm": 13.3984375, |
|
"learning_rate": 5.880987517560075e-05, |
|
"loss": 0.8924, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 6.486486486486487, |
|
"grad_norm": 14.3046875, |
|
"learning_rate": 5.497962551823266e-05, |
|
"loss": 0.9041, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 6.621621621621622, |
|
"grad_norm": 15.421875, |
|
"learning_rate": 5.123050561863657e-05, |
|
"loss": 0.871, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 6.756756756756757, |
|
"grad_norm": 12.7109375, |
|
"learning_rate": 4.756927164427685e-05, |
|
"loss": 0.8924, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.891891891891892, |
|
"grad_norm": 13.765625, |
|
"learning_rate": 4.4002521386240466e-05, |
|
"loss": 0.9047, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.027027027027027, |
|
"grad_norm": 13.09375, |
|
"learning_rate": 4.053668236957134e-05, |
|
"loss": 0.9083, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.162162162162162, |
|
"grad_norm": 13.875, |
|
"learning_rate": 3.717800027043576e-05, |
|
"loss": 0.8721, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 7.297297297297297, |
|
"grad_norm": 12.09375, |
|
"learning_rate": 3.393252766099187e-05, |
|
"loss": 0.916, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 7.4324324324324325, |
|
"grad_norm": 12.3671875, |
|
"learning_rate": 3.080611310224539e-05, |
|
"loss": 0.8714, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 7.5675675675675675, |
|
"grad_norm": 13.2109375, |
|
"learning_rate": 2.7804390604547557e-05, |
|
"loss": 0.8843, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 7.702702702702703, |
|
"grad_norm": 13.9765625, |
|
"learning_rate": 2.493276947472756e-05, |
|
"loss": 0.8934, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 7.837837837837838, |
|
"grad_norm": 14.28125, |
|
"learning_rate": 2.2196424568156073e-05, |
|
"loss": 0.8953, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 7.972972972972973, |
|
"grad_norm": 14.8828125, |
|
"learning_rate": 1.9600286963305957e-05, |
|
"loss": 0.8728, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 8.108108108108109, |
|
"grad_norm": 13.2890625, |
|
"learning_rate": 1.7149035075615794e-05, |
|
"loss": 0.8995, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 8.243243243243244, |
|
"grad_norm": 14.7578125, |
|
"learning_rate": 1.4847086226668872e-05, |
|
"loss": 0.8685, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 8.378378378378379, |
|
"grad_norm": 12.0703125, |
|
"learning_rate": 1.2698588683881186e-05, |
|
"loss": 0.88, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 8.513513513513514, |
|
"grad_norm": 16.8125, |
|
"learning_rate": 1.0707414185043163e-05, |
|
"loss": 0.8936, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 8.64864864864865, |
|
"grad_norm": 14.0859375, |
|
"learning_rate": 8.87715096118642e-06, |
|
"loss": 0.8745, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 8.783783783783784, |
|
"grad_norm": 13.1796875, |
|
"learning_rate": 7.211097270349066e-06, |
|
"loss": 0.8849, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 8.91891891891892, |
|
"grad_norm": 12.6328125, |
|
"learning_rate": 5.71225545389158e-06, |
|
"loss": 0.8775, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 9.054054054054054, |
|
"grad_norm": 12.15625, |
|
"learning_rate": 4.383326526074916e-06, |
|
"loss": 0.8663, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 9.18918918918919, |
|
"grad_norm": 12.9140625, |
|
"learning_rate": 3.226705306650113e-06, |
|
"loss": 0.8975, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 9.324324324324325, |
|
"grad_norm": 14.265625, |
|
"learning_rate": 2.2444761052313856e-06, |
|
"loss": 0.8677, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 9.45945945945946, |
|
"grad_norm": 14.46875, |
|
"learning_rate": 1.4384089652291543e-06, |
|
"loss": 0.8565, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 9.594594594594595, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 8.099564741123166e-07, |
|
"loss": 0.8619, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 9.72972972972973, |
|
"grad_norm": 13.5859375, |
|
"learning_rate": 3.6025114574734785e-07, |
|
"loss": 0.8956, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 9.864864864864865, |
|
"grad_norm": 14.1796875, |
|
"learning_rate": 9.010337953185843e-08, |
|
"loss": 0.8906, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 13.2109375, |
|
"learning_rate": 0.0, |
|
"loss": 0.8617, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 740, |
|
"total_flos": 4.83113130000384e+16, |
|
"train_loss": 0.9043467921179694, |
|
"train_runtime": 626.4938, |
|
"train_samples_per_second": 4.725, |
|
"train_steps_per_second": 1.181 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 4.83113130000384e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|