{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997049277072882, "eval_steps": 500, "global_step": 847, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011802891708468575, "grad_norm": 0.6382197141647339, "learning_rate": 5.294117647058824e-06, "loss": 1.7524, "step": 10 }, { "epoch": 0.02360578341693715, "grad_norm": 0.5001206994056702, "learning_rate": 1.1176470588235295e-05, "loss": 1.3315, "step": 20 }, { "epoch": 0.03540867512540572, "grad_norm": 0.41650518774986267, "learning_rate": 1.7058823529411767e-05, "loss": 1.1148, "step": 30 }, { "epoch": 0.0472115668338743, "grad_norm": 0.42574718594551086, "learning_rate": 2.235294117647059e-05, "loss": 1.0196, "step": 40 }, { "epoch": 0.05901445854234287, "grad_norm": 0.3408316373825073, "learning_rate": 2.823529411764706e-05, "loss": 0.94, "step": 50 }, { "epoch": 0.07081735025081144, "grad_norm": 0.39876773953437805, "learning_rate": 3.411764705882353e-05, "loss": 0.8918, "step": 60 }, { "epoch": 0.08262024195928003, "grad_norm": 0.32425975799560547, "learning_rate": 4e-05, "loss": 0.8412, "step": 70 }, { "epoch": 0.0944231336677486, "grad_norm": 0.40873634815216064, "learning_rate": 4.588235294117647e-05, "loss": 0.887, "step": 80 }, { "epoch": 0.10622602537621717, "grad_norm": 0.4909669756889343, "learning_rate": 4.9998087784700426e-05, "loss": 0.8888, "step": 90 }, { "epoch": 0.11802891708468574, "grad_norm": 0.3897865414619446, "learning_rate": 4.996410098317137e-05, "loss": 0.8555, "step": 100 }, { "epoch": 0.1298318087931543, "grad_norm": 0.3305865228176117, "learning_rate": 4.989723448187131e-05, "loss": 0.8424, "step": 110 }, { "epoch": 0.14163470050162288, "grad_norm": 0.3554224669933319, "learning_rate": 4.9845268462432916e-05, "loss": 0.8445, "step": 120 }, { "epoch": 0.15343759221009148, "grad_norm": 0.46097129583358765, "learning_rate": 4.970969070763177e-05, "loss": 0.8377, "step": 130 }, { "epoch": 0.16524048391856005, "grad_norm": 0.3145534098148346, "learning_rate": 4.953211814536217e-05, "loss": 0.759, "step": 140 }, { "epoch": 0.17704337562702863, "grad_norm": 0.42392656207084656, "learning_rate": 4.931285256513868e-05, "loss": 0.8121, "step": 150 }, { "epoch": 0.1888462673354972, "grad_norm": 0.4339812994003296, "learning_rate": 4.905226661492095e-05, "loss": 0.7896, "step": 160 }, { "epoch": 0.20064915904396577, "grad_norm": 0.44723227620124817, "learning_rate": 4.8750803167788136e-05, "loss": 0.8057, "step": 170 }, { "epoch": 0.21245205075243434, "grad_norm": 0.46169158816337585, "learning_rate": 4.840897456926373e-05, "loss": 0.7724, "step": 180 }, { "epoch": 0.2242549424609029, "grad_norm": 0.41829928755760193, "learning_rate": 4.8027361766570117e-05, "loss": 0.7458, "step": 190 }, { "epoch": 0.23605783416937148, "grad_norm": 0.4120149612426758, "learning_rate": 4.760661332129254e-05, "loss": 0.7686, "step": 200 }, { "epoch": 0.24786072587784008, "grad_norm": 0.3918631970882416, "learning_rate": 4.7147444307130686e-05, "loss": 0.769, "step": 210 }, { "epoch": 0.2596636175863086, "grad_norm": 0.4276711642742157, "learning_rate": 4.665063509461097e-05, "loss": 0.7574, "step": 220 }, { "epoch": 0.2714665092947772, "grad_norm": 0.42904192209243774, "learning_rate": 4.6117030024825114e-05, "loss": 0.7826, "step": 230 }, { "epoch": 0.28326940100324577, "grad_norm": 0.5145927667617798, "learning_rate": 4.554753597444896e-05, "loss": 0.7954, "step": 240 }, { "epoch": 0.29507229271171437, "grad_norm": 0.3549771010875702, 
"learning_rate": 4.494312081448029e-05, "loss": 0.7527, "step": 250 }, { "epoch": 0.30687518442018297, "grad_norm": 0.4441188871860504, "learning_rate": 4.4304811765315105e-05, "loss": 0.7321, "step": 260 }, { "epoch": 0.3186780761286515, "grad_norm": 0.3967060148715973, "learning_rate": 4.3633693650957976e-05, "loss": 0.7047, "step": 270 }, { "epoch": 0.3304809678371201, "grad_norm": 0.44348135590553284, "learning_rate": 4.293090705533342e-05, "loss": 0.7431, "step": 280 }, { "epoch": 0.34228385954558865, "grad_norm": 0.9141893982887268, "learning_rate": 4.219764638383177e-05, "loss": 0.7177, "step": 290 }, { "epoch": 0.35408675125405725, "grad_norm": 0.45525214076042175, "learning_rate": 4.1435157833383955e-05, "loss": 0.7128, "step": 300 }, { "epoch": 0.3658896429625258, "grad_norm": 0.537662148475647, "learning_rate": 4.06447372745151e-05, "loss": 0.7162, "step": 310 }, { "epoch": 0.3776925346709944, "grad_norm": 0.4020293653011322, "learning_rate": 3.982772804897649e-05, "loss": 0.7212, "step": 320 }, { "epoch": 0.389495426379463, "grad_norm": 0.6390876173973083, "learning_rate": 3.898551868669883e-05, "loss": 0.716, "step": 330 }, { "epoch": 0.40129831808793154, "grad_norm": 0.47102075815200806, "learning_rate": 3.811954054594702e-05, "loss": 0.733, "step": 340 }, { "epoch": 0.41310120979640014, "grad_norm": 0.5660268664360046, "learning_rate": 3.723126538068686e-05, "loss": 0.764, "step": 350 }, { "epoch": 0.4249041015048687, "grad_norm": 0.595162570476532, "learning_rate": 3.632220283929822e-05, "loss": 0.7302, "step": 360 }, { "epoch": 0.4367069932133373, "grad_norm": 0.5331649780273438, "learning_rate": 3.5393897898885606e-05, "loss": 0.7127, "step": 370 }, { "epoch": 0.4485098849218058, "grad_norm": 0.4248451590538025, "learning_rate": 3.444792823954651e-05, "loss": 0.6933, "step": 380 }, { "epoch": 0.4603127766302744, "grad_norm": 0.5570621490478516, "learning_rate": 3.348590156306017e-05, "loss": 0.7012, "step": 390 }, { "epoch": 0.47211566833874297, "grad_norm": 0.41210871934890747, "learning_rate": 3.25094528605536e-05, "loss": 0.7006, "step": 400 }, { "epoch": 0.48391856004721157, "grad_norm": 0.5020595788955688, "learning_rate": 3.152024163378867e-05, "loss": 0.7159, "step": 410 }, { "epoch": 0.49572145175568016, "grad_norm": 0.5407310724258423, "learning_rate": 3.051994907479265e-05, "loss": 0.7002, "step": 420 }, { "epoch": 0.5075243434641488, "grad_norm": 0.422695130109787, "learning_rate": 2.9510275208625522e-05, "loss": 0.6721, "step": 430 }, { "epoch": 0.5193272351726173, "grad_norm": 0.4953523576259613, "learning_rate": 2.849293600414002e-05, "loss": 0.6612, "step": 440 }, { "epoch": 0.5311301268810859, "grad_norm": 0.44490641355514526, "learning_rate": 2.7469660457644857e-05, "loss": 0.6786, "step": 450 }, { "epoch": 0.5429330185895545, "grad_norm": 0.3714945912361145, "learning_rate": 2.644218765442728e-05, "loss": 0.6731, "step": 460 }, { "epoch": 0.554735910298023, "grad_norm": 0.44450584053993225, "learning_rate": 2.541226381312924e-05, "loss": 0.6876, "step": 470 }, { "epoch": 0.5665388020064915, "grad_norm": 0.4537455439567566, "learning_rate": 2.4381639318000126e-05, "loss": 0.6757, "step": 480 }, { "epoch": 0.5783416937149601, "grad_norm": 0.4810272753238678, "learning_rate": 2.3352065744070072e-05, "loss": 0.7128, "step": 490 }, { "epoch": 0.5901445854234287, "grad_norm": 0.49226102232933044, "learning_rate": 2.2325292880299335e-05, "loss": 0.6928, "step": 500 }, { "epoch": 0.6019474771318973, "grad_norm": 0.46990668773651123, "learning_rate": 
2.1303065755763277e-05, "loss": 0.6482, "step": 510 }, { "epoch": 0.6137503688403659, "grad_norm": 0.43036311864852905, "learning_rate": 2.0287121673926828e-05, "loss": 0.6759, "step": 520 }, { "epoch": 0.6255532605488344, "grad_norm": 0.373436838388443, "learning_rate": 1.92791872600489e-05, "loss": 0.674, "step": 530 }, { "epoch": 0.637356152257303, "grad_norm": 0.4169735312461853, "learning_rate": 1.8280975526734657e-05, "loss": 0.6636, "step": 540 }, { "epoch": 0.6491590439657716, "grad_norm": 0.3966214060783386, "learning_rate": 1.7294182962622846e-05, "loss": 0.658, "step": 550 }, { "epoch": 0.6609619356742402, "grad_norm": 0.45455384254455566, "learning_rate": 1.632048664915622e-05, "loss": 0.6563, "step": 560 }, { "epoch": 0.6727648273827088, "grad_norm": 0.513671875, "learning_rate": 1.536154141033482e-05, "loss": 0.6481, "step": 570 }, { "epoch": 0.6845677190911773, "grad_norm": 0.4144147038459778, "learning_rate": 1.4418977000296552e-05, "loss": 0.681, "step": 580 }, { "epoch": 0.6963706107996459, "grad_norm": 0.4277999997138977, "learning_rate": 1.3494395333504622e-05, "loss": 0.655, "step": 590 }, { "epoch": 0.7081735025081145, "grad_norm": 0.4542660415172577, "learning_rate": 1.2589367762249347e-05, "loss": 0.6557, "step": 600 }, { "epoch": 0.7199763942165831, "grad_norm": 0.518882155418396, "learning_rate": 1.1705432406091085e-05, "loss": 0.6504, "step": 610 }, { "epoch": 0.7317792859250516, "grad_norm": 0.3764165937900543, "learning_rate": 1.0844091537783316e-05, "loss": 0.6509, "step": 620 }, { "epoch": 0.7435821776335202, "grad_norm": 0.40605178475379944, "learning_rate": 1.0006809030118181e-05, "loss": 0.6619, "step": 630 }, { "epoch": 0.7553850693419888, "grad_norm": 0.42034676671028137, "learning_rate": 9.195007868033933e-06, "loss": 0.6083, "step": 640 }, { "epoch": 0.7671879610504574, "grad_norm": 0.4199008345603943, "learning_rate": 8.410067730212439e-06, "loss": 0.6464, "step": 650 }, { "epoch": 0.778990852758926, "grad_norm": 0.4271228611469269, "learning_rate": 7.653322644276779e-06, "loss": 0.6342, "step": 660 }, { "epoch": 0.7907937444673945, "grad_norm": 0.49036702513694763, "learning_rate": 6.926058719574207e-06, "loss": 0.6492, "step": 670 }, { "epoch": 0.8025966361758631, "grad_norm": 0.4103890061378479, "learning_rate": 6.229511961397455e-06, "loss": 0.6294, "step": 680 }, { "epoch": 0.8143995278843317, "grad_norm": 0.38033077120780945, "learning_rate": 5.564866170359351e-06, "loss": 0.638, "step": 690 }, { "epoch": 0.8262024195928003, "grad_norm": 0.3652307987213135, "learning_rate": 4.933250930490715e-06, "loss": 0.6096, "step": 700 }, { "epoch": 0.8380053113012688, "grad_norm": 0.5351826548576355, "learning_rate": 4.335739689480778e-06, "loss": 0.6285, "step": 710 }, { "epoch": 0.8498082030097374, "grad_norm": 0.427626371383667, "learning_rate": 3.773347934323035e-06, "loss": 0.6257, "step": 720 }, { "epoch": 0.861611094718206, "grad_norm": 0.46427205204963684, "learning_rate": 3.2470314654667487e-06, "loss": 0.6142, "step": 730 }, { "epoch": 0.8734139864266746, "grad_norm": 0.5393053293228149, "learning_rate": 2.7576847724075123e-06, "loss": 0.6485, "step": 740 }, { "epoch": 0.8852168781351432, "grad_norm": 0.4637604057788849, "learning_rate": 2.3061395134774038e-06, "loss": 0.6407, "step": 750 }, { "epoch": 0.8970197698436116, "grad_norm": 0.40724095702171326, "learning_rate": 1.8931631024185327e-06, "loss": 0.6535, "step": 760 }, { "epoch": 0.9088226615520802, "grad_norm": 0.4840000569820404, "learning_rate": 1.5194574041419802e-06, "loss": 0.642, 
"step": 770 }, { "epoch": 0.9206255532605488, "grad_norm": 0.41105934977531433, "learning_rate": 1.185657541888857e-06, "loss": 0.617, "step": 780 }, { "epoch": 0.9324284449690174, "grad_norm": 0.557059645652771, "learning_rate": 8.923308178206552e-07, "loss": 0.6415, "step": 790 }, { "epoch": 0.9442313366774859, "grad_norm": 0.38617223501205444, "learning_rate": 6.39975748873431e-07, "loss": 0.6388, "step": 800 }, { "epoch": 0.9560342283859545, "grad_norm": 0.4779140055179596, "learning_rate": 4.2902121951440834e-07, "loss": 0.6366, "step": 810 }, { "epoch": 0.9678371200944231, "grad_norm": 0.4569835662841797, "learning_rate": 2.5982575284084486e-07, "loss": 0.6735, "step": 820 }, { "epoch": 0.9796400118028917, "grad_norm": 0.4118465185165405, "learning_rate": 1.3267690126008425e-07, "loss": 0.6238, "step": 830 }, { "epoch": 0.9914429035113603, "grad_norm": 0.4550204873085022, "learning_rate": 4.779075778620079e-08, "loss": 0.6613, "step": 840 }, { "epoch": 0.9997049277072882, "step": 847, "total_flos": 5.491458012295987e+18, "train_loss": 0.7367874357185229, "train_runtime": 38132.292, "train_samples_per_second": 0.711, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 847, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.491458012295987e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }