|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997049277072882, |
|
"eval_steps": 500, |
|
"global_step": 847, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011802891708468575, |
|
"grad_norm": 0.6382197141647339, |
|
"learning_rate": 5.294117647058824e-06, |
|
"loss": 1.7524, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02360578341693715, |
|
"grad_norm": 0.5001206994056702, |
|
"learning_rate": 1.1176470588235295e-05, |
|
"loss": 1.3315, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03540867512540572, |
|
"grad_norm": 0.41650518774986267, |
|
"learning_rate": 1.7058823529411767e-05, |
|
"loss": 1.1148, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0472115668338743, |
|
"grad_norm": 0.42574718594551086, |
|
"learning_rate": 2.235294117647059e-05, |
|
"loss": 1.0196, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05901445854234287, |
|
"grad_norm": 0.3408316373825073, |
|
"learning_rate": 2.823529411764706e-05, |
|
"loss": 0.94, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07081735025081144, |
|
"grad_norm": 0.39876773953437805, |
|
"learning_rate": 3.411764705882353e-05, |
|
"loss": 0.8918, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08262024195928003, |
|
"grad_norm": 0.32425975799560547, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8412, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0944231336677486, |
|
"grad_norm": 0.40873634815216064, |
|
"learning_rate": 4.588235294117647e-05, |
|
"loss": 0.887, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10622602537621717, |
|
"grad_norm": 0.4909669756889343, |
|
"learning_rate": 4.9998087784700426e-05, |
|
"loss": 0.8888, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11802891708468574, |
|
"grad_norm": 0.3897865414619446, |
|
"learning_rate": 4.996410098317137e-05, |
|
"loss": 0.8555, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1298318087931543, |
|
"grad_norm": 0.3305865228176117, |
|
"learning_rate": 4.989723448187131e-05, |
|
"loss": 0.8424, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14163470050162288, |
|
"grad_norm": 0.3554224669933319, |
|
"learning_rate": 4.9845268462432916e-05, |
|
"loss": 0.8445, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15343759221009148, |
|
"grad_norm": 0.46097129583358765, |
|
"learning_rate": 4.970969070763177e-05, |
|
"loss": 0.8377, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16524048391856005, |
|
"grad_norm": 0.3145534098148346, |
|
"learning_rate": 4.953211814536217e-05, |
|
"loss": 0.759, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17704337562702863, |
|
"grad_norm": 0.42392656207084656, |
|
"learning_rate": 4.931285256513868e-05, |
|
"loss": 0.8121, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1888462673354972, |
|
"grad_norm": 0.4339812994003296, |
|
"learning_rate": 4.905226661492095e-05, |
|
"loss": 0.7896, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20064915904396577, |
|
"grad_norm": 0.44723227620124817, |
|
"learning_rate": 4.8750803167788136e-05, |
|
"loss": 0.8057, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21245205075243434, |
|
"grad_norm": 0.46169158816337585, |
|
"learning_rate": 4.840897456926373e-05, |
|
"loss": 0.7724, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2242549424609029, |
|
"grad_norm": 0.41829928755760193, |
|
"learning_rate": 4.8027361766570117e-05, |
|
"loss": 0.7458, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23605783416937148, |
|
"grad_norm": 0.4120149612426758, |
|
"learning_rate": 4.760661332129254e-05, |
|
"loss": 0.7686, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24786072587784008, |
|
"grad_norm": 0.3918631970882416, |
|
"learning_rate": 4.7147444307130686e-05, |
|
"loss": 0.769, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2596636175863086, |
|
"grad_norm": 0.4276711642742157, |
|
"learning_rate": 4.665063509461097e-05, |
|
"loss": 0.7574, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2714665092947772, |
|
"grad_norm": 0.42904192209243774, |
|
"learning_rate": 4.6117030024825114e-05, |
|
"loss": 0.7826, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28326940100324577, |
|
"grad_norm": 0.5145927667617798, |
|
"learning_rate": 4.554753597444896e-05, |
|
"loss": 0.7954, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.29507229271171437, |
|
"grad_norm": 0.3549771010875702, |
|
"learning_rate": 4.494312081448029e-05, |
|
"loss": 0.7527, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.30687518442018297, |
|
"grad_norm": 0.4441188871860504, |
|
"learning_rate": 4.4304811765315105e-05, |
|
"loss": 0.7321, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3186780761286515, |
|
"grad_norm": 0.3967060148715973, |
|
"learning_rate": 4.3633693650957976e-05, |
|
"loss": 0.7047, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3304809678371201, |
|
"grad_norm": 0.44348135590553284, |
|
"learning_rate": 4.293090705533342e-05, |
|
"loss": 0.7431, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.34228385954558865, |
|
"grad_norm": 0.9141893982887268, |
|
"learning_rate": 4.219764638383177e-05, |
|
"loss": 0.7177, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.35408675125405725, |
|
"grad_norm": 0.45525214076042175, |
|
"learning_rate": 4.1435157833383955e-05, |
|
"loss": 0.7128, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3658896429625258, |
|
"grad_norm": 0.537662148475647, |
|
"learning_rate": 4.06447372745151e-05, |
|
"loss": 0.7162, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3776925346709944, |
|
"grad_norm": 0.4020293653011322, |
|
"learning_rate": 3.982772804897649e-05, |
|
"loss": 0.7212, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.389495426379463, |
|
"grad_norm": 0.6390876173973083, |
|
"learning_rate": 3.898551868669883e-05, |
|
"loss": 0.716, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.40129831808793154, |
|
"grad_norm": 0.47102075815200806, |
|
"learning_rate": 3.811954054594702e-05, |
|
"loss": 0.733, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.41310120979640014, |
|
"grad_norm": 0.5660268664360046, |
|
"learning_rate": 3.723126538068686e-05, |
|
"loss": 0.764, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4249041015048687, |
|
"grad_norm": 0.595162570476532, |
|
"learning_rate": 3.632220283929822e-05, |
|
"loss": 0.7302, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4367069932133373, |
|
"grad_norm": 0.5331649780273438, |
|
"learning_rate": 3.5393897898885606e-05, |
|
"loss": 0.7127, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4485098849218058, |
|
"grad_norm": 0.4248451590538025, |
|
"learning_rate": 3.444792823954651e-05, |
|
"loss": 0.6933, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4603127766302744, |
|
"grad_norm": 0.5570621490478516, |
|
"learning_rate": 3.348590156306017e-05, |
|
"loss": 0.7012, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.47211566833874297, |
|
"grad_norm": 0.41210871934890747, |
|
"learning_rate": 3.25094528605536e-05, |
|
"loss": 0.7006, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.48391856004721157, |
|
"grad_norm": 0.5020595788955688, |
|
"learning_rate": 3.152024163378867e-05, |
|
"loss": 0.7159, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.49572145175568016, |
|
"grad_norm": 0.5407310724258423, |
|
"learning_rate": 3.051994907479265e-05, |
|
"loss": 0.7002, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5075243434641488, |
|
"grad_norm": 0.422695130109787, |
|
"learning_rate": 2.9510275208625522e-05, |
|
"loss": 0.6721, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5193272351726173, |
|
"grad_norm": 0.4953523576259613, |
|
"learning_rate": 2.849293600414002e-05, |
|
"loss": 0.6612, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5311301268810859, |
|
"grad_norm": 0.44490641355514526, |
|
"learning_rate": 2.7469660457644857e-05, |
|
"loss": 0.6786, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5429330185895545, |
|
"grad_norm": 0.3714945912361145, |
|
"learning_rate": 2.644218765442728e-05, |
|
"loss": 0.6731, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.554735910298023, |
|
"grad_norm": 0.44450584053993225, |
|
"learning_rate": 2.541226381312924e-05, |
|
"loss": 0.6876, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5665388020064915, |
|
"grad_norm": 0.4537455439567566, |
|
"learning_rate": 2.4381639318000126e-05, |
|
"loss": 0.6757, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5783416937149601, |
|
"grad_norm": 0.4810272753238678, |
|
"learning_rate": 2.3352065744070072e-05, |
|
"loss": 0.7128, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5901445854234287, |
|
"grad_norm": 0.49226102232933044, |
|
"learning_rate": 2.2325292880299335e-05, |
|
"loss": 0.6928, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6019474771318973, |
|
"grad_norm": 0.46990668773651123, |
|
"learning_rate": 2.1303065755763277e-05, |
|
"loss": 0.6482, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6137503688403659, |
|
"grad_norm": 0.43036311864852905, |
|
"learning_rate": 2.0287121673926828e-05, |
|
"loss": 0.6759, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6255532605488344, |
|
"grad_norm": 0.373436838388443, |
|
"learning_rate": 1.92791872600489e-05, |
|
"loss": 0.674, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.637356152257303, |
|
"grad_norm": 0.4169735312461853, |
|
"learning_rate": 1.8280975526734657e-05, |
|
"loss": 0.6636, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6491590439657716, |
|
"grad_norm": 0.3966214060783386, |
|
"learning_rate": 1.7294182962622846e-05, |
|
"loss": 0.658, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6609619356742402, |
|
"grad_norm": 0.45455384254455566, |
|
"learning_rate": 1.632048664915622e-05, |
|
"loss": 0.6563, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6727648273827088, |
|
"grad_norm": 0.513671875, |
|
"learning_rate": 1.536154141033482e-05, |
|
"loss": 0.6481, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6845677190911773, |
|
"grad_norm": 0.4144147038459778, |
|
"learning_rate": 1.4418977000296552e-05, |
|
"loss": 0.681, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6963706107996459, |
|
"grad_norm": 0.4277999997138977, |
|
"learning_rate": 1.3494395333504622e-05, |
|
"loss": 0.655, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7081735025081145, |
|
"grad_norm": 0.4542660415172577, |
|
"learning_rate": 1.2589367762249347e-05, |
|
"loss": 0.6557, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7199763942165831, |
|
"grad_norm": 0.518882155418396, |
|
"learning_rate": 1.1705432406091085e-05, |
|
"loss": 0.6504, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7317792859250516, |
|
"grad_norm": 0.3764165937900543, |
|
"learning_rate": 1.0844091537783316e-05, |
|
"loss": 0.6509, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7435821776335202, |
|
"grad_norm": 0.40605178475379944, |
|
"learning_rate": 1.0006809030118181e-05, |
|
"loss": 0.6619, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7553850693419888, |
|
"grad_norm": 0.42034676671028137, |
|
"learning_rate": 9.195007868033933e-06, |
|
"loss": 0.6083, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7671879610504574, |
|
"grad_norm": 0.4199008345603943, |
|
"learning_rate": 8.410067730212439e-06, |
|
"loss": 0.6464, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.778990852758926, |
|
"grad_norm": 0.4271228611469269, |
|
"learning_rate": 7.653322644276779e-06, |
|
"loss": 0.6342, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7907937444673945, |
|
"grad_norm": 0.49036702513694763, |
|
"learning_rate": 6.926058719574207e-06, |
|
"loss": 0.6492, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8025966361758631, |
|
"grad_norm": 0.4103890061378479, |
|
"learning_rate": 6.229511961397455e-06, |
|
"loss": 0.6294, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8143995278843317, |
|
"grad_norm": 0.38033077120780945, |
|
"learning_rate": 5.564866170359351e-06, |
|
"loss": 0.638, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8262024195928003, |
|
"grad_norm": 0.3652307987213135, |
|
"learning_rate": 4.933250930490715e-06, |
|
"loss": 0.6096, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8380053113012688, |
|
"grad_norm": 0.5351826548576355, |
|
"learning_rate": 4.335739689480778e-06, |
|
"loss": 0.6285, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8498082030097374, |
|
"grad_norm": 0.427626371383667, |
|
"learning_rate": 3.773347934323035e-06, |
|
"loss": 0.6257, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.861611094718206, |
|
"grad_norm": 0.46427205204963684, |
|
"learning_rate": 3.2470314654667487e-06, |
|
"loss": 0.6142, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8734139864266746, |
|
"grad_norm": 0.5393053293228149, |
|
"learning_rate": 2.7576847724075123e-06, |
|
"loss": 0.6485, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8852168781351432, |
|
"grad_norm": 0.4637604057788849, |
|
"learning_rate": 2.3061395134774038e-06, |
|
"loss": 0.6407, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8970197698436116, |
|
"grad_norm": 0.40724095702171326, |
|
"learning_rate": 1.8931631024185327e-06, |
|
"loss": 0.6535, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9088226615520802, |
|
"grad_norm": 0.4840000569820404, |
|
"learning_rate": 1.5194574041419802e-06, |
|
"loss": 0.642, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9206255532605488, |
|
"grad_norm": 0.41105934977531433, |
|
"learning_rate": 1.185657541888857e-06, |
|
"loss": 0.617, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9324284449690174, |
|
"grad_norm": 0.557059645652771, |
|
"learning_rate": 8.923308178206552e-07, |
|
"loss": 0.6415, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9442313366774859, |
|
"grad_norm": 0.38617223501205444, |
|
"learning_rate": 6.39975748873431e-07, |
|
"loss": 0.6388, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9560342283859545, |
|
"grad_norm": 0.4779140055179596, |
|
"learning_rate": 4.2902121951440834e-07, |
|
"loss": 0.6366, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9678371200944231, |
|
"grad_norm": 0.4569835662841797, |
|
"learning_rate": 2.5982575284084486e-07, |
|
"loss": 0.6735, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9796400118028917, |
|
"grad_norm": 0.4118465185165405, |
|
"learning_rate": 1.3267690126008425e-07, |
|
"loss": 0.6238, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9914429035113603, |
|
"grad_norm": 0.4550204873085022, |
|
"learning_rate": 4.779075778620079e-08, |
|
"loss": 0.6613, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9997049277072882, |
|
"step": 847, |
|
"total_flos": 5.491458012295987e+18, |
|
"train_loss": 0.7367874357185229, |
|
"train_runtime": 38132.292, |
|
"train_samples_per_second": 0.711, |
|
"train_steps_per_second": 0.022 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 847, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.491458012295987e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|