{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 987, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030395136778115502, "grad_norm": 16.64022370418662, "learning_rate": 5e-06, "loss": 0.9277, "step": 10 }, { "epoch": 0.060790273556231005, "grad_norm": 1.5999241644765967, "learning_rate": 5e-06, "loss": 0.825, "step": 20 }, { "epoch": 0.0911854103343465, "grad_norm": 0.9838744519445904, "learning_rate": 5e-06, "loss": 0.7817, "step": 30 }, { "epoch": 0.12158054711246201, "grad_norm": 0.876398417764137, "learning_rate": 5e-06, "loss": 0.7545, "step": 40 }, { "epoch": 0.1519756838905775, "grad_norm": 0.9156799635831374, "learning_rate": 5e-06, "loss": 0.7358, "step": 50 }, { "epoch": 0.182370820668693, "grad_norm": 0.808319513356986, "learning_rate": 5e-06, "loss": 0.7226, "step": 60 }, { "epoch": 0.2127659574468085, "grad_norm": 0.8914678061831848, "learning_rate": 5e-06, "loss": 0.7122, "step": 70 }, { "epoch": 0.24316109422492402, "grad_norm": 0.6828512024884075, "learning_rate": 5e-06, "loss": 0.7016, "step": 80 }, { "epoch": 0.2735562310030395, "grad_norm": 0.6845268978851149, "learning_rate": 5e-06, "loss": 0.6982, "step": 90 }, { "epoch": 0.303951367781155, "grad_norm": 0.7312914857408909, "learning_rate": 5e-06, "loss": 0.6865, "step": 100 }, { "epoch": 0.3343465045592705, "grad_norm": 0.7352540765774106, "learning_rate": 5e-06, "loss": 0.6907, "step": 110 }, { "epoch": 0.364741641337386, "grad_norm": 0.7102916388595696, "learning_rate": 5e-06, "loss": 0.6824, "step": 120 }, { "epoch": 0.3951367781155015, "grad_norm": 0.751885584367901, "learning_rate": 5e-06, "loss": 0.6879, "step": 130 }, { "epoch": 0.425531914893617, "grad_norm": 0.589815976371268, "learning_rate": 5e-06, "loss": 0.6824, "step": 140 }, { "epoch": 0.45592705167173253, "grad_norm": 0.5807246304943029, "learning_rate": 5e-06, "loss": 0.6785, "step": 150 }, { "epoch": 0.48632218844984804, "grad_norm": 0.594562932961839, "learning_rate": 5e-06, "loss": 0.6857, "step": 160 }, { "epoch": 0.5167173252279635, "grad_norm": 0.5466511252566243, "learning_rate": 5e-06, "loss": 0.6769, "step": 170 }, { "epoch": 0.547112462006079, "grad_norm": 0.5857763993093636, "learning_rate": 5e-06, "loss": 0.6853, "step": 180 }, { "epoch": 0.5775075987841946, "grad_norm": 0.564419379546199, "learning_rate": 5e-06, "loss": 0.6716, "step": 190 }, { "epoch": 0.60790273556231, "grad_norm": 0.5133760076952175, "learning_rate": 5e-06, "loss": 0.6726, "step": 200 }, { "epoch": 0.6382978723404256, "grad_norm": 0.7195067283606177, "learning_rate": 5e-06, "loss": 0.6747, "step": 210 }, { "epoch": 0.668693009118541, "grad_norm": 0.6535399849598605, "learning_rate": 5e-06, "loss": 0.673, "step": 220 }, { "epoch": 0.6990881458966566, "grad_norm": 0.5983112715015926, "learning_rate": 5e-06, "loss": 0.6701, "step": 230 }, { "epoch": 0.729483282674772, "grad_norm": 0.6315519097135667, "learning_rate": 5e-06, "loss": 0.6699, "step": 240 }, { "epoch": 0.7598784194528876, "grad_norm": 0.752213223333162, "learning_rate": 5e-06, "loss": 0.6649, "step": 250 }, { "epoch": 0.790273556231003, "grad_norm": 0.5498218531784089, "learning_rate": 5e-06, "loss": 0.6697, "step": 260 }, { "epoch": 0.8206686930091185, "grad_norm": 0.8862901715920243, "learning_rate": 5e-06, "loss": 0.6639, "step": 270 }, { "epoch": 0.851063829787234, "grad_norm": 0.5808746614523029, "learning_rate": 5e-06, "loss": 0.6624, "step": 280 }, { "epoch": 
0.8814589665653495, "grad_norm": 0.5459654532749203, "learning_rate": 5e-06, "loss": 0.6606, "step": 290 }, { "epoch": 0.9118541033434651, "grad_norm": 0.5838459305444006, "learning_rate": 5e-06, "loss": 0.6632, "step": 300 }, { "epoch": 0.9422492401215805, "grad_norm": 0.7036382282933219, "learning_rate": 5e-06, "loss": 0.6576, "step": 310 }, { "epoch": 0.9726443768996961, "grad_norm": 0.5259058744593071, "learning_rate": 5e-06, "loss": 0.6591, "step": 320 }, { "epoch": 1.0, "eval_loss": 0.6591953635215759, "eval_runtime": 31.7982, "eval_samples_per_second": 278.664, "eval_steps_per_second": 0.566, "step": 329 }, { "epoch": 1.0030395136778116, "grad_norm": 0.8394036975692434, "learning_rate": 5e-06, "loss": 0.6601, "step": 330 }, { "epoch": 1.033434650455927, "grad_norm": 0.8451859406825153, "learning_rate": 5e-06, "loss": 0.6224, "step": 340 }, { "epoch": 1.0638297872340425, "grad_norm": 0.5812996686128133, "learning_rate": 5e-06, "loss": 0.615, "step": 350 }, { "epoch": 1.094224924012158, "grad_norm": 0.5503154879132033, "learning_rate": 5e-06, "loss": 0.6236, "step": 360 }, { "epoch": 1.1246200607902737, "grad_norm": 0.7833862166535409, "learning_rate": 5e-06, "loss": 0.6177, "step": 370 }, { "epoch": 1.155015197568389, "grad_norm": 0.6294679579661799, "learning_rate": 5e-06, "loss": 0.6186, "step": 380 }, { "epoch": 1.1854103343465046, "grad_norm": 0.5626057919504072, "learning_rate": 5e-06, "loss": 0.6131, "step": 390 }, { "epoch": 1.21580547112462, "grad_norm": 0.5655023836689654, "learning_rate": 5e-06, "loss": 0.6133, "step": 400 }, { "epoch": 1.2462006079027357, "grad_norm": 0.6419841359149339, "learning_rate": 5e-06, "loss": 0.6117, "step": 410 }, { "epoch": 1.2765957446808511, "grad_norm": 0.9501856765863967, "learning_rate": 5e-06, "loss": 0.6154, "step": 420 }, { "epoch": 1.3069908814589666, "grad_norm": 0.598897751556055, "learning_rate": 5e-06, "loss": 0.6162, "step": 430 }, { "epoch": 1.337386018237082, "grad_norm": 0.7429134385869901, "learning_rate": 5e-06, "loss": 0.6167, "step": 440 }, { "epoch": 1.3677811550151975, "grad_norm": 0.5294832137073032, "learning_rate": 5e-06, "loss": 0.6169, "step": 450 }, { "epoch": 1.3981762917933132, "grad_norm": 0.8585569002266061, "learning_rate": 5e-06, "loss": 0.6134, "step": 460 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5980490230617989, "learning_rate": 5e-06, "loss": 0.6148, "step": 470 }, { "epoch": 1.458966565349544, "grad_norm": 0.5763371418857346, "learning_rate": 5e-06, "loss": 0.6173, "step": 480 }, { "epoch": 1.4893617021276595, "grad_norm": 0.5080499060023709, "learning_rate": 5e-06, "loss": 0.6139, "step": 490 }, { "epoch": 1.5197568389057752, "grad_norm": 0.599791892872901, "learning_rate": 5e-06, "loss": 0.62, "step": 500 }, { "epoch": 1.5501519756838906, "grad_norm": 0.561193296506177, "learning_rate": 5e-06, "loss": 0.6151, "step": 510 }, { "epoch": 1.580547112462006, "grad_norm": 0.55216025845368, "learning_rate": 5e-06, "loss": 0.6142, "step": 520 }, { "epoch": 1.6109422492401215, "grad_norm": 0.5020053919351462, "learning_rate": 5e-06, "loss": 0.6138, "step": 530 }, { "epoch": 1.641337386018237, "grad_norm": 0.804833961940261, "learning_rate": 5e-06, "loss": 0.6065, "step": 540 }, { "epoch": 1.6717325227963524, "grad_norm": 0.5098656593355803, "learning_rate": 5e-06, "loss": 0.6128, "step": 550 }, { "epoch": 1.702127659574468, "grad_norm": 0.5672353564854465, "learning_rate": 5e-06, "loss": 0.6113, "step": 560 }, { "epoch": 1.7325227963525835, "grad_norm": 0.5373878158094728, "learning_rate": 5e-06, "loss": 
0.6165, "step": 570 }, { "epoch": 1.7629179331306992, "grad_norm": 0.6400981957262623, "learning_rate": 5e-06, "loss": 0.6135, "step": 580 }, { "epoch": 1.7933130699088147, "grad_norm": 0.5784668286610699, "learning_rate": 5e-06, "loss": 0.6127, "step": 590 }, { "epoch": 1.8237082066869301, "grad_norm": 0.4829691384216596, "learning_rate": 5e-06, "loss": 0.6165, "step": 600 }, { "epoch": 1.8541033434650456, "grad_norm": 0.5607220085804383, "learning_rate": 5e-06, "loss": 0.6142, "step": 610 }, { "epoch": 1.884498480243161, "grad_norm": 0.554409072476674, "learning_rate": 5e-06, "loss": 0.6108, "step": 620 }, { "epoch": 1.9148936170212765, "grad_norm": 0.4666111631306896, "learning_rate": 5e-06, "loss": 0.6054, "step": 630 }, { "epoch": 1.9452887537993921, "grad_norm": 0.6239667480279937, "learning_rate": 5e-06, "loss": 0.611, "step": 640 }, { "epoch": 1.9756838905775076, "grad_norm": 0.7028679873862859, "learning_rate": 5e-06, "loss": 0.6102, "step": 650 }, { "epoch": 2.0, "eval_loss": 0.6488233208656311, "eval_runtime": 32.2972, "eval_samples_per_second": 274.358, "eval_steps_per_second": 0.557, "step": 658 }, { "epoch": 2.0060790273556233, "grad_norm": 0.7027126564788553, "learning_rate": 5e-06, "loss": 0.6004, "step": 660 }, { "epoch": 2.0364741641337387, "grad_norm": 0.6726728809115222, "learning_rate": 5e-06, "loss": 0.5684, "step": 670 }, { "epoch": 2.066869300911854, "grad_norm": 0.7037164967207233, "learning_rate": 5e-06, "loss": 0.5618, "step": 680 }, { "epoch": 2.0972644376899696, "grad_norm": 0.5634457537187532, "learning_rate": 5e-06, "loss": 0.5656, "step": 690 }, { "epoch": 2.127659574468085, "grad_norm": 0.7190014758588847, "learning_rate": 5e-06, "loss": 0.5669, "step": 700 }, { "epoch": 2.1580547112462005, "grad_norm": 0.6182141475596381, "learning_rate": 5e-06, "loss": 0.567, "step": 710 }, { "epoch": 2.188449848024316, "grad_norm": 0.6494642619820201, "learning_rate": 5e-06, "loss": 0.5625, "step": 720 }, { "epoch": 2.2188449848024314, "grad_norm": 0.5352624075289281, "learning_rate": 5e-06, "loss": 0.5674, "step": 730 }, { "epoch": 2.2492401215805473, "grad_norm": 0.5779884492140585, "learning_rate": 5e-06, "loss": 0.5637, "step": 740 }, { "epoch": 2.2796352583586628, "grad_norm": 0.6212702567540722, "learning_rate": 5e-06, "loss": 0.5755, "step": 750 }, { "epoch": 2.310030395136778, "grad_norm": 0.6675573139313373, "learning_rate": 5e-06, "loss": 0.5715, "step": 760 }, { "epoch": 2.3404255319148937, "grad_norm": 1.112080252593875, "learning_rate": 5e-06, "loss": 0.5673, "step": 770 }, { "epoch": 2.370820668693009, "grad_norm": 0.5430387056330088, "learning_rate": 5e-06, "loss": 0.5703, "step": 780 }, { "epoch": 2.4012158054711246, "grad_norm": 0.5730573646763748, "learning_rate": 5e-06, "loss": 0.5716, "step": 790 }, { "epoch": 2.43161094224924, "grad_norm": 0.5782525038445755, "learning_rate": 5e-06, "loss": 0.5673, "step": 800 }, { "epoch": 2.4620060790273555, "grad_norm": 0.5515752300271801, "learning_rate": 5e-06, "loss": 0.5729, "step": 810 }, { "epoch": 2.4924012158054714, "grad_norm": 0.6159973777815156, "learning_rate": 5e-06, "loss": 0.5712, "step": 820 }, { "epoch": 2.522796352583587, "grad_norm": 0.5694018418127859, "learning_rate": 5e-06, "loss": 0.5667, "step": 830 }, { "epoch": 2.5531914893617023, "grad_norm": 0.5797243112894562, "learning_rate": 5e-06, "loss": 0.5687, "step": 840 }, { "epoch": 2.5835866261398177, "grad_norm": 0.6616634425335868, "learning_rate": 5e-06, "loss": 0.5739, "step": 850 }, { "epoch": 2.613981762917933, "grad_norm": 
0.6133935953312176, "learning_rate": 5e-06, "loss": 0.5731, "step": 860 }, { "epoch": 2.6443768996960486, "grad_norm": 0.6410077762466703, "learning_rate": 5e-06, "loss": 0.5772, "step": 870 }, { "epoch": 2.674772036474164, "grad_norm": 0.6957749590141841, "learning_rate": 5e-06, "loss": 0.5705, "step": 880 }, { "epoch": 2.7051671732522795, "grad_norm": 0.5103295479540869, "learning_rate": 5e-06, "loss": 0.5718, "step": 890 }, { "epoch": 2.735562310030395, "grad_norm": 0.5434510084681313, "learning_rate": 5e-06, "loss": 0.5635, "step": 900 }, { "epoch": 2.7659574468085104, "grad_norm": 0.5490760128674873, "learning_rate": 5e-06, "loss": 0.5689, "step": 910 }, { "epoch": 2.7963525835866263, "grad_norm": 0.5024890606168032, "learning_rate": 5e-06, "loss": 0.5725, "step": 920 }, { "epoch": 2.8267477203647418, "grad_norm": 0.558224951103413, "learning_rate": 5e-06, "loss": 0.5731, "step": 930 }, { "epoch": 2.857142857142857, "grad_norm": 0.5770328368518338, "learning_rate": 5e-06, "loss": 0.5682, "step": 940 }, { "epoch": 2.8875379939209727, "grad_norm": 0.5500792024748634, "learning_rate": 5e-06, "loss": 0.5691, "step": 950 }, { "epoch": 2.917933130699088, "grad_norm": 0.6529577507817819, "learning_rate": 5e-06, "loss": 0.5768, "step": 960 }, { "epoch": 2.9483282674772036, "grad_norm": 0.49823556701097355, "learning_rate": 5e-06, "loss": 0.5671, "step": 970 }, { "epoch": 2.978723404255319, "grad_norm": 0.5712256269882896, "learning_rate": 5e-06, "loss": 0.5707, "step": 980 }, { "epoch": 3.0, "eval_loss": 0.6519396305084229, "eval_runtime": 30.3855, "eval_samples_per_second": 291.619, "eval_steps_per_second": 0.592, "step": 987 }, { "epoch": 3.0, "step": 987, "total_flos": 1653261761249280.0, "train_loss": 0.6273607391233744, "train_runtime": 4771.414, "train_samples_per_second": 105.851, "train_steps_per_second": 0.207 } ], "logging_steps": 10, "max_steps": 987, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1653261761249280.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }