{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9956108266276518, "eval_steps": 500, "global_step": 1023, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029261155815654718, "grad_norm": 2.817451946105921, "learning_rate": 5e-06, "loss": 0.7976, "step": 10 }, { "epoch": 0.058522311631309436, "grad_norm": 1.284434355820121, "learning_rate": 5e-06, "loss": 0.7389, "step": 20 }, { "epoch": 0.08778346744696415, "grad_norm": 1.2876126989168348, "learning_rate": 5e-06, "loss": 0.7071, "step": 30 }, { "epoch": 0.11704462326261887, "grad_norm": 1.094058121641866, "learning_rate": 5e-06, "loss": 0.6965, "step": 40 }, { "epoch": 0.14630577907827358, "grad_norm": 1.148621345747248, "learning_rate": 5e-06, "loss": 0.687, "step": 50 }, { "epoch": 0.1755669348939283, "grad_norm": 0.9050464163153896, "learning_rate": 5e-06, "loss": 0.6765, "step": 60 }, { "epoch": 0.20482809070958302, "grad_norm": 0.5948745622322165, "learning_rate": 5e-06, "loss": 0.6699, "step": 70 }, { "epoch": 0.23408924652523774, "grad_norm": 0.4179284598235861, "learning_rate": 5e-06, "loss": 0.6647, "step": 80 }, { "epoch": 0.26335040234089246, "grad_norm": 0.4320197149625007, "learning_rate": 5e-06, "loss": 0.6619, "step": 90 }, { "epoch": 0.29261155815654716, "grad_norm": 0.45765152029005024, "learning_rate": 5e-06, "loss": 0.6543, "step": 100 }, { "epoch": 0.3218727139722019, "grad_norm": 0.4044893755849006, "learning_rate": 5e-06, "loss": 0.6632, "step": 110 }, { "epoch": 0.3511338697878566, "grad_norm": 0.35821605431124326, "learning_rate": 5e-06, "loss": 0.668, "step": 120 }, { "epoch": 0.38039502560351135, "grad_norm": 0.3756878575578981, "learning_rate": 5e-06, "loss": 0.6518, "step": 130 }, { "epoch": 0.40965618141916604, "grad_norm": 0.3621384928557948, "learning_rate": 5e-06, "loss": 0.6525, "step": 140 }, { "epoch": 0.4389173372348208, "grad_norm": 0.3496185059505636, "learning_rate": 5e-06, "loss": 0.648, "step": 150 }, { "epoch": 0.4681784930504755, "grad_norm": 0.3872142083182003, "learning_rate": 5e-06, "loss": 0.6498, "step": 160 }, { "epoch": 0.49743964886613024, "grad_norm": 0.3658858276235106, "learning_rate": 5e-06, "loss": 0.6549, "step": 170 }, { "epoch": 0.5267008046817849, "grad_norm": 0.3357007312829614, "learning_rate": 5e-06, "loss": 0.6546, "step": 180 }, { "epoch": 0.5559619604974396, "grad_norm": 0.3562481959188732, "learning_rate": 5e-06, "loss": 0.6431, "step": 190 }, { "epoch": 0.5852231163130943, "grad_norm": 0.344934498132061, "learning_rate": 5e-06, "loss": 0.6443, "step": 200 }, { "epoch": 0.6144842721287491, "grad_norm": 0.34218669140197133, "learning_rate": 5e-06, "loss": 0.6509, "step": 210 }, { "epoch": 0.6437454279444038, "grad_norm": 0.35256592076055937, "learning_rate": 5e-06, "loss": 0.6389, "step": 220 }, { "epoch": 0.6730065837600585, "grad_norm": 0.36473082870109524, "learning_rate": 5e-06, "loss": 0.6388, "step": 230 }, { "epoch": 0.7022677395757132, "grad_norm": 0.3299100883057849, "learning_rate": 5e-06, "loss": 0.6456, "step": 240 }, { "epoch": 0.731528895391368, "grad_norm": 0.3416365052817771, "learning_rate": 5e-06, "loss": 0.644, "step": 250 }, { "epoch": 0.7607900512070227, "grad_norm": 0.34197703127811097, "learning_rate": 5e-06, "loss": 0.6466, "step": 260 }, { "epoch": 0.7900512070226774, "grad_norm": 0.37223452358809667, "learning_rate": 5e-06, "loss": 0.6489, "step": 270 }, { "epoch": 0.8193123628383321, "grad_norm": 0.33976179205185286, "learning_rate": 5e-06, "loss": 0.6477, "step": 280 }, { "epoch": 0.8485735186539868, "grad_norm": 0.3398894606792935, "learning_rate": 5e-06, "loss": 0.6399, "step": 290 }, { "epoch": 0.8778346744696416, "grad_norm": 0.3510531345907806, "learning_rate": 5e-06, "loss": 0.6318, "step": 300 }, { "epoch": 0.9070958302852963, "grad_norm": 0.3558397390544605, "learning_rate": 5e-06, "loss": 0.6449, "step": 310 }, { "epoch": 0.936356986100951, "grad_norm": 0.3356596511911474, "learning_rate": 5e-06, "loss": 0.6369, "step": 320 }, { "epoch": 0.9656181419166057, "grad_norm": 0.37733017841506045, "learning_rate": 5e-06, "loss": 0.6401, "step": 330 }, { "epoch": 0.9948792977322605, "grad_norm": 0.34951561267640846, "learning_rate": 5e-06, "loss": 0.6361, "step": 340 }, { "epoch": 0.9978054133138259, "eval_loss": 0.6406562924385071, "eval_runtime": 346.2688, "eval_samples_per_second": 26.595, "eval_steps_per_second": 0.416, "step": 341 }, { "epoch": 1.025237746891002, "grad_norm": 0.3690522380396755, "learning_rate": 5e-06, "loss": 0.6513, "step": 350 }, { "epoch": 1.054498902706657, "grad_norm": 0.34935298880477006, "learning_rate": 5e-06, "loss": 0.6055, "step": 360 }, { "epoch": 1.0837600585223117, "grad_norm": 0.4071561749341452, "learning_rate": 5e-06, "loss": 0.6089, "step": 370 }, { "epoch": 1.1130212143379663, "grad_norm": 0.3423935118081486, "learning_rate": 5e-06, "loss": 0.6063, "step": 380 }, { "epoch": 1.142282370153621, "grad_norm": 0.40033069734559046, "learning_rate": 5e-06, "loss": 0.6116, "step": 390 }, { "epoch": 1.1715435259692757, "grad_norm": 0.35662843692652024, "learning_rate": 5e-06, "loss": 0.613, "step": 400 }, { "epoch": 1.2008046817849305, "grad_norm": 0.3579810712341978, "learning_rate": 5e-06, "loss": 0.6127, "step": 410 }, { "epoch": 1.2300658376005853, "grad_norm": 0.4145283245344341, "learning_rate": 5e-06, "loss": 0.6009, "step": 420 }, { "epoch": 1.2593269934162399, "grad_norm": 0.3559214339096793, "learning_rate": 5e-06, "loss": 0.611, "step": 430 }, { "epoch": 1.2885881492318947, "grad_norm": 0.3413599580993331, "learning_rate": 5e-06, "loss": 0.6037, "step": 440 }, { "epoch": 1.3178493050475493, "grad_norm": 0.3622841783657278, "learning_rate": 5e-06, "loss": 0.6074, "step": 450 }, { "epoch": 1.347110460863204, "grad_norm": 0.3746581322669035, "learning_rate": 5e-06, "loss": 0.6051, "step": 460 }, { "epoch": 1.3763716166788589, "grad_norm": 0.31849740102906093, "learning_rate": 5e-06, "loss": 0.6024, "step": 470 }, { "epoch": 1.4056327724945135, "grad_norm": 0.3734761795611288, "learning_rate": 5e-06, "loss": 0.6033, "step": 480 }, { "epoch": 1.4348939283101683, "grad_norm": 0.34675275832571, "learning_rate": 5e-06, "loss": 0.6097, "step": 490 }, { "epoch": 1.464155084125823, "grad_norm": 0.33843498837923025, "learning_rate": 5e-06, "loss": 0.598, "step": 500 }, { "epoch": 1.4934162399414777, "grad_norm": 0.350146725219343, "learning_rate": 5e-06, "loss": 0.606, "step": 510 }, { "epoch": 1.5226773957571325, "grad_norm": 0.39166843083583675, "learning_rate": 5e-06, "loss": 0.6084, "step": 520 }, { "epoch": 1.5519385515727873, "grad_norm": 0.350003587462171, "learning_rate": 5e-06, "loss": 0.607, "step": 530 }, { "epoch": 1.5811997073884418, "grad_norm": 0.3667439111520098, "learning_rate": 5e-06, "loss": 0.6135, "step": 540 }, { "epoch": 1.6104608632040964, "grad_norm": 0.3537244934912836, "learning_rate": 5e-06, "loss": 0.6046, "step": 550 }, { "epoch": 1.6397220190197512, "grad_norm": 0.3530231160276065, "learning_rate": 5e-06, "loss": 0.6063, "step": 560 }, { "epoch": 1.668983174835406, "grad_norm": 0.323025619993113, "learning_rate": 5e-06, "loss": 0.5969, "step": 570 }, { "epoch": 1.6982443306510606, "grad_norm": 0.349358548931987, "learning_rate": 5e-06, "loss": 0.6062, "step": 580 }, { "epoch": 1.7275054864667154, "grad_norm": 0.39980408460241657, "learning_rate": 5e-06, "loss": 0.6065, "step": 590 }, { "epoch": 1.7567666422823702, "grad_norm": 0.34113313353194774, "learning_rate": 5e-06, "loss": 0.6033, "step": 600 }, { "epoch": 1.7860277980980248, "grad_norm": 0.34531388262994633, "learning_rate": 5e-06, "loss": 0.6058, "step": 610 }, { "epoch": 1.8152889539136796, "grad_norm": 0.32514799592510446, "learning_rate": 5e-06, "loss": 0.6062, "step": 620 }, { "epoch": 1.8445501097293344, "grad_norm": 0.35614080019739464, "learning_rate": 5e-06, "loss": 0.6053, "step": 630 }, { "epoch": 1.873811265544989, "grad_norm": 0.3300616433320744, "learning_rate": 5e-06, "loss": 0.6002, "step": 640 }, { "epoch": 1.9030724213606436, "grad_norm": 0.3622402238263096, "learning_rate": 5e-06, "loss": 0.6058, "step": 650 }, { "epoch": 1.9323335771762986, "grad_norm": 0.3444768451054272, "learning_rate": 5e-06, "loss": 0.6064, "step": 660 }, { "epoch": 1.9615947329919532, "grad_norm": 0.32853379599999255, "learning_rate": 5e-06, "loss": 0.5952, "step": 670 }, { "epoch": 1.9908558888076078, "grad_norm": 0.3587293884689086, "learning_rate": 5e-06, "loss": 0.6097, "step": 680 }, { "epoch": 1.9967081199707388, "eval_loss": 0.6332100629806519, "eval_runtime": 347.8152, "eval_samples_per_second": 26.477, "eval_steps_per_second": 0.414, "step": 682 }, { "epoch": 2.0212143379663496, "grad_norm": 0.3914747236499232, "learning_rate": 5e-06, "loss": 0.6197, "step": 690 }, { "epoch": 2.050475493782004, "grad_norm": 0.4451317704931522, "learning_rate": 5e-06, "loss": 0.5673, "step": 700 }, { "epoch": 2.0797366495976592, "grad_norm": 0.35310727796985897, "learning_rate": 5e-06, "loss": 0.5694, "step": 710 }, { "epoch": 2.108997805413314, "grad_norm": 0.3748176044370228, "learning_rate": 5e-06, "loss": 0.5704, "step": 720 }, { "epoch": 2.1382589612289684, "grad_norm": 0.3494354561294143, "learning_rate": 5e-06, "loss": 0.5723, "step": 730 }, { "epoch": 2.1675201170446234, "grad_norm": 0.35129264971107693, "learning_rate": 5e-06, "loss": 0.5775, "step": 740 }, { "epoch": 2.196781272860278, "grad_norm": 0.31508074197114816, "learning_rate": 5e-06, "loss": 0.5763, "step": 750 }, { "epoch": 2.2260424286759326, "grad_norm": 0.3670643138923896, "learning_rate": 5e-06, "loss": 0.5737, "step": 760 }, { "epoch": 2.255303584491587, "grad_norm": 0.3304298759476488, "learning_rate": 5e-06, "loss": 0.5725, "step": 770 }, { "epoch": 2.284564740307242, "grad_norm": 0.3388699840011243, "learning_rate": 5e-06, "loss": 0.5716, "step": 780 }, { "epoch": 2.313825896122897, "grad_norm": 0.3280155833683051, "learning_rate": 5e-06, "loss": 0.5699, "step": 790 }, { "epoch": 2.3430870519385514, "grad_norm": 0.35970533064711685, "learning_rate": 5e-06, "loss": 0.5777, "step": 800 }, { "epoch": 2.3723482077542064, "grad_norm": 0.33656864947696286, "learning_rate": 5e-06, "loss": 0.5728, "step": 810 }, { "epoch": 2.401609363569861, "grad_norm": 0.33260691432954365, "learning_rate": 5e-06, "loss": 0.5791, "step": 820 }, { "epoch": 2.4308705193855156, "grad_norm": 0.3167221056111725, "learning_rate": 5e-06, "loss": 0.5714, "step": 830 }, { "epoch": 2.4601316752011706, "grad_norm": 0.3552708228986059, "learning_rate": 5e-06, "loss": 0.5719, "step": 840 }, { "epoch": 2.489392831016825, "grad_norm": 0.3504786037960249, "learning_rate": 5e-06, "loss": 0.5741, "step": 850 }, { "epoch": 2.5186539868324798, "grad_norm": 0.35601444454617853, "learning_rate": 5e-06, "loss": 0.5671, "step": 860 }, { "epoch": 2.547915142648135, "grad_norm": 0.3992627060706797, "learning_rate": 5e-06, "loss": 0.5696, "step": 870 }, { "epoch": 2.5771762984637894, "grad_norm": 0.3511848795894042, "learning_rate": 5e-06, "loss": 0.5738, "step": 880 }, { "epoch": 2.606437454279444, "grad_norm": 0.3640553368412107, "learning_rate": 5e-06, "loss": 0.5731, "step": 890 }, { "epoch": 2.6356986100950985, "grad_norm": 0.3426464519086621, "learning_rate": 5e-06, "loss": 0.5777, "step": 900 }, { "epoch": 2.6649597659107536, "grad_norm": 0.3610877908075611, "learning_rate": 5e-06, "loss": 0.5778, "step": 910 }, { "epoch": 2.694220921726408, "grad_norm": 0.3556310393337805, "learning_rate": 5e-06, "loss": 0.5722, "step": 920 }, { "epoch": 2.723482077542063, "grad_norm": 0.3504629693430583, "learning_rate": 5e-06, "loss": 0.5758, "step": 930 }, { "epoch": 2.7527432333577178, "grad_norm": 0.3486578461965166, "learning_rate": 5e-06, "loss": 0.5748, "step": 940 }, { "epoch": 2.7820043891733723, "grad_norm": 0.3422817889043645, "learning_rate": 5e-06, "loss": 0.5735, "step": 950 }, { "epoch": 2.811265544989027, "grad_norm": 0.37588989818519764, "learning_rate": 5e-06, "loss": 0.5774, "step": 960 }, { "epoch": 2.840526700804682, "grad_norm": 0.34577188011968074, "learning_rate": 5e-06, "loss": 0.5731, "step": 970 }, { "epoch": 2.8697878566203365, "grad_norm": 0.3788340529933626, "learning_rate": 5e-06, "loss": 0.5729, "step": 980 }, { "epoch": 2.899049012435991, "grad_norm": 0.3263212597720765, "learning_rate": 5e-06, "loss": 0.5659, "step": 990 }, { "epoch": 2.928310168251646, "grad_norm": 0.32654471141276226, "learning_rate": 5e-06, "loss": 0.5737, "step": 1000 }, { "epoch": 2.9575713240673007, "grad_norm": 0.350030993954914, "learning_rate": 5e-06, "loss": 0.5654, "step": 1010 }, { "epoch": 2.9868324798829553, "grad_norm": 0.33894932153015206, "learning_rate": 5e-06, "loss": 0.5771, "step": 1020 }, { "epoch": 2.9956108266276518, "eval_loss": 0.6330604553222656, "eval_runtime": 347.7245, "eval_samples_per_second": 26.484, "eval_steps_per_second": 0.414, "step": 1023 }, { "epoch": 2.9956108266276518, "step": 1023, "total_flos": 2144987064041472.0, "train_loss": 0.6141412728925837, "train_runtime": 55205.9507, "train_samples_per_second": 9.507, "train_steps_per_second": 0.019 } ], "logging_steps": 10, "max_steps": 1023, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2144987064041472.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }