{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.77744862062581, "eval_steps": 300, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014812071838548418, "grad_norm": 0.6219717264175415, "learning_rate": 2.4630541871921186e-06, "loss": 1.1503, "step": 10 }, { "epoch": 0.029624143677096836, "grad_norm": 0.5424634218215942, "learning_rate": 4.926108374384237e-06, "loss": 1.1246, "step": 20 }, { "epoch": 0.04443621551564525, "grad_norm": 0.37799498438835144, "learning_rate": 7.3891625615763555e-06, "loss": 1.1465, "step": 30 }, { "epoch": 0.05924828735419367, "grad_norm": 0.4693998396396637, "learning_rate": 9.852216748768475e-06, "loss": 1.1335, "step": 40 }, { "epoch": 0.07406035919274208, "grad_norm": 0.8345885276794434, "learning_rate": 1.2315270935960592e-05, "loss": 1.0185, "step": 50 }, { "epoch": 0.0888724310312905, "grad_norm": 0.8352222442626953, "learning_rate": 1.4778325123152711e-05, "loss": 0.9585, "step": 60 }, { "epoch": 0.10368450286983892, "grad_norm": 0.5202337503433228, "learning_rate": 1.7241379310344828e-05, "loss": 0.6836, "step": 70 }, { "epoch": 0.11849657470838734, "grad_norm": 0.46258121728897095, "learning_rate": 1.970443349753695e-05, "loss": 0.6584, "step": 80 }, { "epoch": 0.13330864654693575, "grad_norm": 0.6236300468444824, "learning_rate": 2.2167487684729066e-05, "loss": 0.5989, "step": 90 }, { "epoch": 0.14812071838548416, "grad_norm": 0.5138072967529297, "learning_rate": 2.4630541871921184e-05, "loss": 0.5787, "step": 100 }, { "epoch": 0.1629327902240326, "grad_norm": 0.3929985761642456, "learning_rate": 2.70935960591133e-05, "loss": 0.5146, "step": 110 }, { "epoch": 0.177744862062581, "grad_norm": 0.563312828540802, "learning_rate": 2.9556650246305422e-05, "loss": 0.4704, "step": 120 }, { "epoch": 0.1925569339011294, "grad_norm": 0.5815229415893555, "learning_rate": 3.2019704433497536e-05, "loss": 0.424, "step": 130 }, { "epoch": 0.20736900573967784, "grad_norm": 0.4600641429424286, "learning_rate": 3.4482758620689657e-05, "loss": 0.4321, "step": 140 }, { "epoch": 0.22218107757822625, "grad_norm": 0.5328271389007568, "learning_rate": 3.694581280788178e-05, "loss": 0.4136, "step": 150 }, { "epoch": 0.23699314941677468, "grad_norm": 0.4838107228279114, "learning_rate": 3.94088669950739e-05, "loss": 0.4548, "step": 160 }, { "epoch": 0.25180522125532306, "grad_norm": 0.5641084909439087, "learning_rate": 4.187192118226601e-05, "loss": 0.4958, "step": 170 }, { "epoch": 0.2666172930938715, "grad_norm": 0.5724189281463623, "learning_rate": 4.433497536945813e-05, "loss": 0.4136, "step": 180 }, { "epoch": 0.28142936493241993, "grad_norm": 0.6268028020858765, "learning_rate": 4.679802955665025e-05, "loss": 0.4202, "step": 190 }, { "epoch": 0.2962414367709683, "grad_norm": 0.8757884502410889, "learning_rate": 4.926108374384237e-05, "loss": 0.3965, "step": 200 }, { "epoch": 0.31105350860951675, "grad_norm": 0.6613091826438904, "learning_rate": 4.999817902568189e-05, "loss": 0.4333, "step": 210 }, { "epoch": 0.3258655804480652, "grad_norm": 0.7051518559455872, "learning_rate": 4.9989260606397816e-05, "loss": 0.436, "step": 220 }, { "epoch": 0.34067765228661356, "grad_norm": 0.8129429221153259, "learning_rate": 4.997291292559225e-05, "loss": 0.4403, "step": 230 }, { "epoch": 0.355489724125162, "grad_norm": 0.8796783089637756, "learning_rate": 4.994914084340082e-05, "loss": 0.3911, "step": 240 }, { "epoch": 0.37030179596371043, "grad_norm": 0.9231064915657043, 
"learning_rate": 4.991795142722012e-05, "loss": 0.432, "step": 250 }, { "epoch": 0.3851138678022588, "grad_norm": 0.9661022424697876, "learning_rate": 4.987935394960661e-05, "loss": 0.3614, "step": 260 }, { "epoch": 0.39992593964080725, "grad_norm": 0.8705118894577026, "learning_rate": 4.983335988551986e-05, "loss": 0.4216, "step": 270 }, { "epoch": 0.4147380114793557, "grad_norm": 0.758140504360199, "learning_rate": 4.9779982908911095e-05, "loss": 0.3533, "step": 280 }, { "epoch": 0.4295500833179041, "grad_norm": 0.5562009215354919, "learning_rate": 4.971923888865792e-05, "loss": 0.3351, "step": 290 }, { "epoch": 0.4443621551564525, "grad_norm": 1.151169776916504, "learning_rate": 4.96511458838466e-05, "loss": 0.3569, "step": 300 }, { "epoch": 0.4443621551564525, "eval_loss": 0.4030126631259918, "eval_runtime": 51.0129, "eval_samples_per_second": 11.781, "eval_steps_per_second": 11.781, "step": 300 }, { "epoch": 0.45917422699500093, "grad_norm": 0.7866811156272888, "learning_rate": 4.957572413840302e-05, "loss": 0.3461, "step": 310 }, { "epoch": 0.47398629883354937, "grad_norm": 0.7985939383506775, "learning_rate": 4.949299607507434e-05, "loss": 0.3691, "step": 320 }, { "epoch": 0.48879837067209775, "grad_norm": 0.6339073777198792, "learning_rate": 4.940298628876261e-05, "loss": 0.3415, "step": 330 }, { "epoch": 0.5036104425106461, "grad_norm": 0.9889721870422363, "learning_rate": 4.930572153921287e-05, "loss": 0.3963, "step": 340 }, { "epoch": 0.5184225143491946, "grad_norm": 0.9128195643424988, "learning_rate": 4.9201230743057425e-05, "loss": 0.4293, "step": 350 }, { "epoch": 0.533234586187743, "grad_norm": 0.6651906371116638, "learning_rate": 4.9089544965219094e-05, "loss": 0.4184, "step": 360 }, { "epoch": 0.5480466580262914, "grad_norm": 1.1542905569076538, "learning_rate": 4.8970697409675536e-05, "loss": 0.3938, "step": 370 }, { "epoch": 0.5628587298648399, "grad_norm": 0.7983744144439697, "learning_rate": 4.884472340958791e-05, "loss": 0.2896, "step": 380 }, { "epoch": 0.5776708017033882, "grad_norm": 0.7634603977203369, "learning_rate": 4.871166041679626e-05, "loss": 0.3483, "step": 390 }, { "epoch": 0.5924828735419366, "grad_norm": 0.6266176700592041, "learning_rate": 4.8571547990685225e-05, "loss": 0.3227, "step": 400 }, { "epoch": 0.6072949453804851, "grad_norm": 1.0247130393981934, "learning_rate": 4.84244277864231e-05, "loss": 0.3696, "step": 410 }, { "epoch": 0.6221070172190335, "grad_norm": 1.0751911401748657, "learning_rate": 4.8270343542577825e-05, "loss": 0.353, "step": 420 }, { "epoch": 0.6369190890575819, "grad_norm": 1.0235852003097534, "learning_rate": 4.810934106811357e-05, "loss": 0.3833, "step": 430 }, { "epoch": 0.6517311608961304, "grad_norm": 0.8674798011779785, "learning_rate": 4.7941468228771816e-05, "loss": 0.375, "step": 440 }, { "epoch": 0.6665432327346787, "grad_norm": 0.5289329886436462, "learning_rate": 4.776677493284101e-05, "loss": 0.3759, "step": 450 }, { "epoch": 0.6813553045732271, "grad_norm": 1.1018059253692627, "learning_rate": 4.758531311631884e-05, "loss": 0.3367, "step": 460 }, { "epoch": 0.6961673764117756, "grad_norm": 0.8128293752670288, "learning_rate": 4.7397136727471833e-05, "loss": 0.3545, "step": 470 }, { "epoch": 0.710979448250324, "grad_norm": 0.8319720029830933, "learning_rate": 4.720230171079657e-05, "loss": 0.3349, "step": 480 }, { "epoch": 0.7257915200888724, "grad_norm": 0.708547830581665, "learning_rate": 4.7000865990387544e-05, "loss": 0.3238, "step": 490 }, { "epoch": 0.7406035919274209, "grad_norm": 0.9730454683303833, 
"learning_rate": 4.679288945271639e-05, "loss": 0.3195, "step": 500 }, { "epoch": 0.7554156637659692, "grad_norm": 0.5458475947380066, "learning_rate": 4.657843392882778e-05, "loss": 0.3455, "step": 510 }, { "epoch": 0.7702277356045176, "grad_norm": 0.7745851278305054, "learning_rate": 4.635756317595714e-05, "loss": 0.318, "step": 520 }, { "epoch": 0.7850398074430661, "grad_norm": 0.8679530024528503, "learning_rate": 4.6130342858575746e-05, "loss": 0.3197, "step": 530 }, { "epoch": 0.7998518792816145, "grad_norm": 0.8396092057228088, "learning_rate": 4.589684052886883e-05, "loss": 0.3854, "step": 540 }, { "epoch": 0.814663951120163, "grad_norm": 0.8567986488342285, "learning_rate": 4.5657125606652385e-05, "loss": 0.3444, "step": 550 }, { "epoch": 0.8294760229587114, "grad_norm": 0.7525165677070618, "learning_rate": 4.541126935873481e-05, "loss": 0.3416, "step": 560 }, { "epoch": 0.8442880947972597, "grad_norm": 0.788314700126648, "learning_rate": 4.515934487772942e-05, "loss": 0.3404, "step": 570 }, { "epoch": 0.8591001666358082, "grad_norm": 0.7747524380683899, "learning_rate": 4.490142706032414e-05, "loss": 0.343, "step": 580 }, { "epoch": 0.8739122384743566, "grad_norm": 0.8506792783737183, "learning_rate": 4.4637592585014844e-05, "loss": 0.3086, "step": 590 }, { "epoch": 0.888724310312905, "grad_norm": 0.9564735293388367, "learning_rate": 4.4367919889309e-05, "loss": 0.3237, "step": 600 }, { "epoch": 0.888724310312905, "eval_loss": 0.3525813817977905, "eval_runtime": 51.0572, "eval_samples_per_second": 11.771, "eval_steps_per_second": 11.771, "step": 600 }, { "epoch": 0.9035363821514535, "grad_norm": 0.9576528668403625, "learning_rate": 4.409248914640636e-05, "loss": 0.3223, "step": 610 }, { "epoch": 0.9183484539900019, "grad_norm": 0.6880804300308228, "learning_rate": 4.381138224136354e-05, "loss": 0.3404, "step": 620 }, { "epoch": 0.9331605258285502, "grad_norm": 0.7245656847953796, "learning_rate": 4.352468274674981e-05, "loss": 0.3618, "step": 630 }, { "epoch": 0.9479725976670987, "grad_norm": 1.080275297164917, "learning_rate": 4.323247589780111e-05, "loss": 0.3531, "step": 640 }, { "epoch": 0.9627846695056471, "grad_norm": 0.9100397229194641, "learning_rate": 4.293484856707974e-05, "loss": 0.3268, "step": 650 }, { "epoch": 0.9775967413441955, "grad_norm": 0.6466996669769287, "learning_rate": 4.263188923864737e-05, "loss": 0.2773, "step": 660 }, { "epoch": 0.992408813182744, "grad_norm": 0.8750647902488708, "learning_rate": 4.2323687981758796e-05, "loss": 0.2924, "step": 670 }, { "epoch": 1.0072208850212923, "grad_norm": 0.8797662258148193, "learning_rate": 4.2010336424084594e-05, "loss": 0.3101, "step": 680 }, { "epoch": 1.0220329568598407, "grad_norm": 1.079053282737732, "learning_rate": 4.1691927724470356e-05, "loss": 0.2497, "step": 690 }, { "epoch": 1.0368450286983892, "grad_norm": 0.5908809900283813, "learning_rate": 4.136855654524072e-05, "loss": 0.2835, "step": 700 }, { "epoch": 1.0516571005369375, "grad_norm": 0.8815209269523621, "learning_rate": 4.104031902405646e-05, "loss": 0.2889, "step": 710 }, { "epoch": 1.066469172375486, "grad_norm": 0.8487938642501831, "learning_rate": 4.070731274533291e-05, "loss": 0.2689, "step": 720 }, { "epoch": 1.0812812442140345, "grad_norm": 1.1524460315704346, "learning_rate": 4.0369636711228316e-05, "loss": 0.2711, "step": 730 }, { "epoch": 1.0960933160525828, "grad_norm": 1.0302244424819946, "learning_rate": 4.0027391312210664e-05, "loss": 0.2875, "step": 740 }, { "epoch": 1.1109053878911312, "grad_norm": 1.3304013013839722, 
"learning_rate": 3.968067829721177e-05, "loss": 0.3397, "step": 750 }, { "epoch": 1.1257174597296797, "grad_norm": 1.2835785150527954, "learning_rate": 3.932960074337755e-05, "loss": 0.2515, "step": 760 }, { "epoch": 1.140529531568228, "grad_norm": 1.0710926055908203, "learning_rate": 3.897426302542331e-05, "loss": 0.2724, "step": 770 }, { "epoch": 1.1553416034067765, "grad_norm": 1.0360386371612549, "learning_rate": 3.861477078460337e-05, "loss": 0.2428, "step": 780 }, { "epoch": 1.170153675245325, "grad_norm": 1.1646299362182617, "learning_rate": 3.825123089730412e-05, "loss": 0.2797, "step": 790 }, { "epoch": 1.1849657470838735, "grad_norm": 1.0502442121505737, "learning_rate": 3.788375144326985e-05, "loss": 0.2812, "step": 800 }, { "epoch": 1.1997778189224217, "grad_norm": 1.209110975265503, "learning_rate": 3.751244167347083e-05, "loss": 0.3137, "step": 810 }, { "epoch": 1.2145898907609702, "grad_norm": 1.0595998764038086, "learning_rate": 3.713741197762323e-05, "loss": 0.2929, "step": 820 }, { "epoch": 1.2294019625995185, "grad_norm": 0.9376611113548279, "learning_rate": 3.67587738513704e-05, "loss": 0.2652, "step": 830 }, { "epoch": 1.244214034438067, "grad_norm": 0.8865551948547363, "learning_rate": 3.63766398631355e-05, "loss": 0.228, "step": 840 }, { "epoch": 1.2590261062766155, "grad_norm": 1.2026184797286987, "learning_rate": 3.599112362065506e-05, "loss": 0.2501, "step": 850 }, { "epoch": 1.273838178115164, "grad_norm": 1.1819926500320435, "learning_rate": 3.5602339737203595e-05, "loss": 0.2634, "step": 860 }, { "epoch": 1.2886502499537122, "grad_norm": 0.9832616448402405, "learning_rate": 3.521040379751933e-05, "loss": 0.2827, "step": 870 }, { "epoch": 1.3034623217922607, "grad_norm": 1.0796929597854614, "learning_rate": 3.481543232344104e-05, "loss": 0.2876, "step": 880 }, { "epoch": 1.318274393630809, "grad_norm": 1.038341999053955, "learning_rate": 3.4417542739266336e-05, "loss": 0.2865, "step": 890 }, { "epoch": 1.3330864654693575, "grad_norm": 1.2865569591522217, "learning_rate": 3.401685333684164e-05, "loss": 0.253, "step": 900 }, { "epoch": 1.3330864654693575, "eval_loss": 0.3409191370010376, "eval_runtime": 51.3011, "eval_samples_per_second": 11.715, "eval_steps_per_second": 11.715, "step": 900 }, { "epoch": 1.347898537307906, "grad_norm": 1.0919033288955688, "learning_rate": 3.361348324039419e-05, "loss": 0.2605, "step": 910 }, { "epoch": 1.3627106091464545, "grad_norm": 0.8445180058479309, "learning_rate": 3.320755237111669e-05, "loss": 0.2912, "step": 920 }, { "epoch": 1.3775226809850027, "grad_norm": 1.177072525024414, "learning_rate": 3.2799181411514915e-05, "loss": 0.2847, "step": 930 }, { "epoch": 1.3923347528235512, "grad_norm": 0.8392846584320068, "learning_rate": 3.238849176952904e-05, "loss": 0.3112, "step": 940 }, { "epoch": 1.4071468246620995, "grad_norm": 0.904247522354126, "learning_rate": 3.1975605542439276e-05, "loss": 0.2908, "step": 950 }, { "epoch": 1.421958896500648, "grad_norm": 1.144309639930725, "learning_rate": 3.156064548056656e-05, "loss": 0.2437, "step": 960 }, { "epoch": 1.4367709683391965, "grad_norm": 1.3833638429641724, "learning_rate": 3.114373495077915e-05, "loss": 0.3246, "step": 970 }, { "epoch": 1.451583040177745, "grad_norm": 1.1326124668121338, "learning_rate": 3.072499789981582e-05, "loss": 0.288, "step": 980 }, { "epoch": 1.4663951120162932, "grad_norm": 0.9968894124031067, "learning_rate": 3.030455881743677e-05, "loss": 0.2311, "step": 990 }, { "epoch": 1.4812071838548417, "grad_norm": 1.0647014379501343, "learning_rate": 
2.988254269941302e-05, "loss": 0.2739, "step": 1000 }, { "epoch": 1.49601925569339, "grad_norm": 0.813777506351471, "learning_rate": 2.9459075010365405e-05, "loss": 0.2642, "step": 1010 }, { "epoch": 1.5108313275319385, "grad_norm": 1.0000636577606201, "learning_rate": 2.9034281646464194e-05, "loss": 0.2936, "step": 1020 }, { "epoch": 1.525643399370487, "grad_norm": 0.9163423180580139, "learning_rate": 2.860828889800036e-05, "loss": 0.2543, "step": 1030 }, { "epoch": 1.5404554712090355, "grad_norm": 1.0908852815628052, "learning_rate": 2.8181223411839684e-05, "loss": 0.2968, "step": 1040 }, { "epoch": 1.5552675430475837, "grad_norm": 1.1358126401901245, "learning_rate": 2.7753212153770947e-05, "loss": 0.2476, "step": 1050 }, { "epoch": 1.5700796148861322, "grad_norm": 1.0470831394195557, "learning_rate": 2.7324382370759172e-05, "loss": 0.275, "step": 1060 }, { "epoch": 1.5848916867246805, "grad_norm": 1.3022797107696533, "learning_rate": 2.6894861553115336e-05, "loss": 0.3022, "step": 1070 }, { "epoch": 1.599703758563229, "grad_norm": 1.118695855140686, "learning_rate": 2.646477739659378e-05, "loss": 0.3242, "step": 1080 }, { "epoch": 1.6145158304017775, "grad_norm": 1.1856898069381714, "learning_rate": 2.6034257764428456e-05, "loss": 0.2554, "step": 1090 }, { "epoch": 1.629327902240326, "grad_norm": 1.2559823989868164, "learning_rate": 2.560343064931941e-05, "loss": 0.2542, "step": 1100 }, { "epoch": 1.6441399740788742, "grad_norm": 0.8543432950973511, "learning_rate": 2.5172424135380817e-05, "loss": 0.2702, "step": 1110 }, { "epoch": 1.6589520459174227, "grad_norm": 0.8952140212059021, "learning_rate": 2.474136636006181e-05, "loss": 0.2951, "step": 1120 }, { "epoch": 1.673764117755971, "grad_norm": 1.4929783344268799, "learning_rate": 2.4310385476051418e-05, "loss": 0.2831, "step": 1130 }, { "epoch": 1.6885761895945195, "grad_norm": 1.0444386005401611, "learning_rate": 2.3879609613179028e-05, "loss": 0.253, "step": 1140 }, { "epoch": 1.703388261433068, "grad_norm": 0.8181633353233337, "learning_rate": 2.3449166840321627e-05, "loss": 0.2836, "step": 1150 }, { "epoch": 1.7182003332716165, "grad_norm": 0.8273675441741943, "learning_rate": 2.3019185127329144e-05, "loss": 0.2922, "step": 1160 }, { "epoch": 1.7330124051101647, "grad_norm": 1.3424749374389648, "learning_rate": 2.258979230697923e-05, "loss": 0.2787, "step": 1170 }, { "epoch": 1.7478244769487132, "grad_norm": 1.03569495677948, "learning_rate": 2.2161116036972884e-05, "loss": 0.2781, "step": 1180 }, { "epoch": 1.7626365487872615, "grad_norm": 1.2545360326766968, "learning_rate": 2.1733283761981936e-05, "loss": 0.2541, "step": 1190 }, { "epoch": 1.77744862062581, "grad_norm": 1.7133238315582275, "learning_rate": 2.130642267576008e-05, "loss": 0.2651, "step": 1200 }, { "epoch": 1.77744862062581, "eval_loss": 0.32795146107673645, "eval_runtime": 51.064, "eval_samples_per_second": 11.77, "eval_steps_per_second": 11.77, "step": 1200 } ], "logging_steps": 10, "max_steps": 2025, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3424219794582733e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }
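
For reference, a minimal sketch of how a state file like the one above can be inspected, assuming it is the trainer_state.json that the Hugging Face Trainer writes into each checkpoint directory; the file path is hypothetical, only the standard-library json module is used, and the field names ("log_history", "loss", "eval_loss", "step") are taken directly from the dump above. Training-loss entries appear every logging_steps (10) steps and eval entries every eval_steps (300) steps, which is what the two branches below separate.

# sketch: read trainer_state.json and summarize the loss curves
import json

with open("trainer_state.json") as f:   # hypothetical path to the file shown above
    state = json.load(f)

train_steps, train_loss = [], []
eval_steps, eval_loss = [], []
for entry in state["log_history"]:
    if "loss" in entry:                  # per-logging_steps training record
        train_steps.append(entry["step"])
        train_loss.append(entry["loss"])
    if "eval_loss" in entry:             # per-eval_steps evaluation record
        eval_steps.append(entry["step"])
        eval_loss.append(entry["eval_loss"])

print(f"last train loss @ step {train_steps[-1]}: {train_loss[-1]:.4f}")
print(f"last eval loss  @ step {eval_steps[-1]}: {eval_loss[-1]:.4f}")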