|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 987, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.030395136778115502, |
|
"grad_norm": 16.64022370418662, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9277, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.060790273556231005, |
|
"grad_norm": 1.5999241644765967, |
|
"learning_rate": 5e-06, |
|
"loss": 0.825, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0911854103343465, |
|
"grad_norm": 0.9838744519445904, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7817, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12158054711246201, |
|
"grad_norm": 0.876398417764137, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7545, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1519756838905775, |
|
"grad_norm": 0.9156799635831374, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7358, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.182370820668693, |
|
"grad_norm": 0.808319513356986, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7226, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 0.8914678061831848, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7122, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24316109422492402, |
|
"grad_norm": 0.6828512024884075, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7016, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2735562310030395, |
|
"grad_norm": 0.6845268978851149, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6982, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.303951367781155, |
|
"grad_norm": 0.7312914857408909, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6865, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3343465045592705, |
|
"grad_norm": 0.7352540765774106, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6907, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.364741641337386, |
|
"grad_norm": 0.7102916388595696, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6824, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3951367781155015, |
|
"grad_norm": 0.751885584367901, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6879, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 0.589815976371268, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6824, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.45592705167173253, |
|
"grad_norm": 0.5807246304943029, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6785, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.48632218844984804, |
|
"grad_norm": 0.594562932961839, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6857, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5167173252279635, |
|
"grad_norm": 0.5466511252566243, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6769, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.547112462006079, |
|
"grad_norm": 0.5857763993093636, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6853, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5775075987841946, |
|
"grad_norm": 0.564419379546199, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6716, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.60790273556231, |
|
"grad_norm": 0.5133760076952175, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6726, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 0.7195067283606177, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6747, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.668693009118541, |
|
"grad_norm": 0.6535399849598605, |
|
"learning_rate": 5e-06, |
|
"loss": 0.673, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6990881458966566, |
|
"grad_norm": 0.5983112715015926, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6701, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.729483282674772, |
|
"grad_norm": 0.6315519097135667, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6699, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7598784194528876, |
|
"grad_norm": 0.752213223333162, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6649, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.790273556231003, |
|
"grad_norm": 0.5498218531784089, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6697, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8206686930091185, |
|
"grad_norm": 0.8862901715920243, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6639, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 0.5808746614523029, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6624, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8814589665653495, |
|
"grad_norm": 0.5459654532749203, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6606, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9118541033434651, |
|
"grad_norm": 0.5838459305444006, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6632, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9422492401215805, |
|
"grad_norm": 0.7036382282933219, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6576, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9726443768996961, |
|
"grad_norm": 0.5259058744593071, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6591, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.6591953635215759, |
|
"eval_runtime": 31.7982, |
|
"eval_samples_per_second": 278.664, |
|
"eval_steps_per_second": 0.566, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.0030395136778116, |
|
"grad_norm": 0.8394036975692434, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6601, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.033434650455927, |
|
"grad_norm": 0.8451859406825153, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6224, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 0.5812996686128133, |
|
"learning_rate": 5e-06, |
|
"loss": 0.615, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.094224924012158, |
|
"grad_norm": 0.5503154879132033, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6236, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1246200607902737, |
|
"grad_norm": 0.7833862166535409, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6177, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.155015197568389, |
|
"grad_norm": 0.6294679579661799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6186, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1854103343465046, |
|
"grad_norm": 0.5626057919504072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6131, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.21580547112462, |
|
"grad_norm": 0.5655023836689654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6133, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2462006079027357, |
|
"grad_norm": 0.6419841359149339, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6117, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 0.9501856765863967, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6154, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3069908814589666, |
|
"grad_norm": 0.598897751556055, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6162, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.337386018237082, |
|
"grad_norm": 0.7429134385869901, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6167, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3677811550151975, |
|
"grad_norm": 0.5294832137073032, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6169, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3981762917933132, |
|
"grad_norm": 0.8585569002266061, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6134, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.5980490230617989, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6148, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.458966565349544, |
|
"grad_norm": 0.5763371418857346, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6173, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 0.5080499060023709, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6139, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5197568389057752, |
|
"grad_norm": 0.599791892872901, |
|
"learning_rate": 5e-06, |
|
"loss": 0.62, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5501519756838906, |
|
"grad_norm": 0.561193296506177, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6151, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.580547112462006, |
|
"grad_norm": 0.55216025845368, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6142, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6109422492401215, |
|
"grad_norm": 0.5020053919351462, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6138, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.641337386018237, |
|
"grad_norm": 0.804833961940261, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6065, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6717325227963524, |
|
"grad_norm": 0.5098656593355803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6128, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"grad_norm": 0.5672353564854465, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6113, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7325227963525835, |
|
"grad_norm": 0.5373878158094728, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6165, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7629179331306992, |
|
"grad_norm": 0.6400981957262623, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6135, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7933130699088147, |
|
"grad_norm": 0.5784668286610699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6127, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8237082066869301, |
|
"grad_norm": 0.4829691384216596, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6165, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.8541033434650456, |
|
"grad_norm": 0.5607220085804383, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6142, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.884498480243161, |
|
"grad_norm": 0.554409072476674, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6108, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 0.4666111631306896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6054, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9452887537993921, |
|
"grad_norm": 0.6239667480279937, |
|
"learning_rate": 5e-06, |
|
"loss": 0.611, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9756838905775076, |
|
"grad_norm": 0.7028679873862859, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6102, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.6488233208656311, |
|
"eval_runtime": 32.2972, |
|
"eval_samples_per_second": 274.358, |
|
"eval_steps_per_second": 0.557, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 2.0060790273556233, |
|
"grad_norm": 0.7027126564788553, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6004, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.0364741641337387, |
|
"grad_norm": 0.6726728809115222, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5684, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.066869300911854, |
|
"grad_norm": 0.7037164967207233, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5618, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.0972644376899696, |
|
"grad_norm": 0.5634457537187532, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5656, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"grad_norm": 0.7190014758588847, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5669, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.1580547112462005, |
|
"grad_norm": 0.6182141475596381, |
|
"learning_rate": 5e-06, |
|
"loss": 0.567, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.188449848024316, |
|
"grad_norm": 0.6494642619820201, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5625, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.2188449848024314, |
|
"grad_norm": 0.5352624075289281, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5674, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.2492401215805473, |
|
"grad_norm": 0.5779884492140585, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5637, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.2796352583586628, |
|
"grad_norm": 0.6212702567540722, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5755, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.310030395136778, |
|
"grad_norm": 0.6675573139313373, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5715, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.3404255319148937, |
|
"grad_norm": 1.112080252593875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5673, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.370820668693009, |
|
"grad_norm": 0.5430387056330088, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5703, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.4012158054711246, |
|
"grad_norm": 0.5730573646763748, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5716, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.43161094224924, |
|
"grad_norm": 0.5782525038445755, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5673, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4620060790273555, |
|
"grad_norm": 0.5515752300271801, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5729, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.4924012158054714, |
|
"grad_norm": 0.6159973777815156, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5712, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.522796352583587, |
|
"grad_norm": 0.5694018418127859, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5667, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"grad_norm": 0.5797243112894562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5687, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.5835866261398177, |
|
"grad_norm": 0.6616634425335868, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5739, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.613981762917933, |
|
"grad_norm": 0.6133935953312176, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5731, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.6443768996960486, |
|
"grad_norm": 0.6410077762466703, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5772, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.674772036474164, |
|
"grad_norm": 0.6957749590141841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5705, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.7051671732522795, |
|
"grad_norm": 0.5103295479540869, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5718, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.735562310030395, |
|
"grad_norm": 0.5434510084681313, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5635, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.7659574468085104, |
|
"grad_norm": 0.5490760128674873, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5689, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.7963525835866263, |
|
"grad_norm": 0.5024890606168032, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5725, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.8267477203647418, |
|
"grad_norm": 0.558224951103413, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5731, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.5770328368518338, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5682, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.8875379939209727, |
|
"grad_norm": 0.5500792024748634, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5691, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.917933130699088, |
|
"grad_norm": 0.6529577507817819, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5768, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.9483282674772036, |
|
"grad_norm": 0.49823556701097355, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5671, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.978723404255319, |
|
"grad_norm": 0.5712256269882896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5707, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.6519396305084229, |
|
"eval_runtime": 30.3855, |
|
"eval_samples_per_second": 291.619, |
|
"eval_steps_per_second": 0.592, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 987, |
|
"total_flos": 1653261761249280.0, |
|
"train_loss": 0.6273607391233744, |
|
"train_runtime": 4771.414, |
|
"train_samples_per_second": 105.851, |
|
"train_steps_per_second": 0.207 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 987, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1653261761249280.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|