Boffl's picture
Upload trainer_state.json with huggingface_hub
f8ffd7a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997049277072882,
"eval_steps": 500,
"global_step": 847,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011802891708468575,
"grad_norm": 0.6382197141647339,
"learning_rate": 5.294117647058824e-06,
"loss": 1.7524,
"step": 10
},
{
"epoch": 0.02360578341693715,
"grad_norm": 0.5001206994056702,
"learning_rate": 1.1176470588235295e-05,
"loss": 1.3315,
"step": 20
},
{
"epoch": 0.03540867512540572,
"grad_norm": 0.41650518774986267,
"learning_rate": 1.7058823529411767e-05,
"loss": 1.1148,
"step": 30
},
{
"epoch": 0.0472115668338743,
"grad_norm": 0.42574718594551086,
"learning_rate": 2.235294117647059e-05,
"loss": 1.0196,
"step": 40
},
{
"epoch": 0.05901445854234287,
"grad_norm": 0.3408316373825073,
"learning_rate": 2.823529411764706e-05,
"loss": 0.94,
"step": 50
},
{
"epoch": 0.07081735025081144,
"grad_norm": 0.39876773953437805,
"learning_rate": 3.411764705882353e-05,
"loss": 0.8918,
"step": 60
},
{
"epoch": 0.08262024195928003,
"grad_norm": 0.32425975799560547,
"learning_rate": 4e-05,
"loss": 0.8412,
"step": 70
},
{
"epoch": 0.0944231336677486,
"grad_norm": 0.40873634815216064,
"learning_rate": 4.588235294117647e-05,
"loss": 0.887,
"step": 80
},
{
"epoch": 0.10622602537621717,
"grad_norm": 0.4909669756889343,
"learning_rate": 4.9998087784700426e-05,
"loss": 0.8888,
"step": 90
},
{
"epoch": 0.11802891708468574,
"grad_norm": 0.3897865414619446,
"learning_rate": 4.996410098317137e-05,
"loss": 0.8555,
"step": 100
},
{
"epoch": 0.1298318087931543,
"grad_norm": 0.3305865228176117,
"learning_rate": 4.989723448187131e-05,
"loss": 0.8424,
"step": 110
},
{
"epoch": 0.14163470050162288,
"grad_norm": 0.3554224669933319,
"learning_rate": 4.9845268462432916e-05,
"loss": 0.8445,
"step": 120
},
{
"epoch": 0.15343759221009148,
"grad_norm": 0.46097129583358765,
"learning_rate": 4.970969070763177e-05,
"loss": 0.8377,
"step": 130
},
{
"epoch": 0.16524048391856005,
"grad_norm": 0.3145534098148346,
"learning_rate": 4.953211814536217e-05,
"loss": 0.759,
"step": 140
},
{
"epoch": 0.17704337562702863,
"grad_norm": 0.42392656207084656,
"learning_rate": 4.931285256513868e-05,
"loss": 0.8121,
"step": 150
},
{
"epoch": 0.1888462673354972,
"grad_norm": 0.4339812994003296,
"learning_rate": 4.905226661492095e-05,
"loss": 0.7896,
"step": 160
},
{
"epoch": 0.20064915904396577,
"grad_norm": 0.44723227620124817,
"learning_rate": 4.8750803167788136e-05,
"loss": 0.8057,
"step": 170
},
{
"epoch": 0.21245205075243434,
"grad_norm": 0.46169158816337585,
"learning_rate": 4.840897456926373e-05,
"loss": 0.7724,
"step": 180
},
{
"epoch": 0.2242549424609029,
"grad_norm": 0.41829928755760193,
"learning_rate": 4.8027361766570117e-05,
"loss": 0.7458,
"step": 190
},
{
"epoch": 0.23605783416937148,
"grad_norm": 0.4120149612426758,
"learning_rate": 4.760661332129254e-05,
"loss": 0.7686,
"step": 200
},
{
"epoch": 0.24786072587784008,
"grad_norm": 0.3918631970882416,
"learning_rate": 4.7147444307130686e-05,
"loss": 0.769,
"step": 210
},
{
"epoch": 0.2596636175863086,
"grad_norm": 0.4276711642742157,
"learning_rate": 4.665063509461097e-05,
"loss": 0.7574,
"step": 220
},
{
"epoch": 0.2714665092947772,
"grad_norm": 0.42904192209243774,
"learning_rate": 4.6117030024825114e-05,
"loss": 0.7826,
"step": 230
},
{
"epoch": 0.28326940100324577,
"grad_norm": 0.5145927667617798,
"learning_rate": 4.554753597444896e-05,
"loss": 0.7954,
"step": 240
},
{
"epoch": 0.29507229271171437,
"grad_norm": 0.3549771010875702,
"learning_rate": 4.494312081448029e-05,
"loss": 0.7527,
"step": 250
},
{
"epoch": 0.30687518442018297,
"grad_norm": 0.4441188871860504,
"learning_rate": 4.4304811765315105e-05,
"loss": 0.7321,
"step": 260
},
{
"epoch": 0.3186780761286515,
"grad_norm": 0.3967060148715973,
"learning_rate": 4.3633693650957976e-05,
"loss": 0.7047,
"step": 270
},
{
"epoch": 0.3304809678371201,
"grad_norm": 0.44348135590553284,
"learning_rate": 4.293090705533342e-05,
"loss": 0.7431,
"step": 280
},
{
"epoch": 0.34228385954558865,
"grad_norm": 0.9141893982887268,
"learning_rate": 4.219764638383177e-05,
"loss": 0.7177,
"step": 290
},
{
"epoch": 0.35408675125405725,
"grad_norm": 0.45525214076042175,
"learning_rate": 4.1435157833383955e-05,
"loss": 0.7128,
"step": 300
},
{
"epoch": 0.3658896429625258,
"grad_norm": 0.537662148475647,
"learning_rate": 4.06447372745151e-05,
"loss": 0.7162,
"step": 310
},
{
"epoch": 0.3776925346709944,
"grad_norm": 0.4020293653011322,
"learning_rate": 3.982772804897649e-05,
"loss": 0.7212,
"step": 320
},
{
"epoch": 0.389495426379463,
"grad_norm": 0.6390876173973083,
"learning_rate": 3.898551868669883e-05,
"loss": 0.716,
"step": 330
},
{
"epoch": 0.40129831808793154,
"grad_norm": 0.47102075815200806,
"learning_rate": 3.811954054594702e-05,
"loss": 0.733,
"step": 340
},
{
"epoch": 0.41310120979640014,
"grad_norm": 0.5660268664360046,
"learning_rate": 3.723126538068686e-05,
"loss": 0.764,
"step": 350
},
{
"epoch": 0.4249041015048687,
"grad_norm": 0.595162570476532,
"learning_rate": 3.632220283929822e-05,
"loss": 0.7302,
"step": 360
},
{
"epoch": 0.4367069932133373,
"grad_norm": 0.5331649780273438,
"learning_rate": 3.5393897898885606e-05,
"loss": 0.7127,
"step": 370
},
{
"epoch": 0.4485098849218058,
"grad_norm": 0.4248451590538025,
"learning_rate": 3.444792823954651e-05,
"loss": 0.6933,
"step": 380
},
{
"epoch": 0.4603127766302744,
"grad_norm": 0.5570621490478516,
"learning_rate": 3.348590156306017e-05,
"loss": 0.7012,
"step": 390
},
{
"epoch": 0.47211566833874297,
"grad_norm": 0.41210871934890747,
"learning_rate": 3.25094528605536e-05,
"loss": 0.7006,
"step": 400
},
{
"epoch": 0.48391856004721157,
"grad_norm": 0.5020595788955688,
"learning_rate": 3.152024163378867e-05,
"loss": 0.7159,
"step": 410
},
{
"epoch": 0.49572145175568016,
"grad_norm": 0.5407310724258423,
"learning_rate": 3.051994907479265e-05,
"loss": 0.7002,
"step": 420
},
{
"epoch": 0.5075243434641488,
"grad_norm": 0.422695130109787,
"learning_rate": 2.9510275208625522e-05,
"loss": 0.6721,
"step": 430
},
{
"epoch": 0.5193272351726173,
"grad_norm": 0.4953523576259613,
"learning_rate": 2.849293600414002e-05,
"loss": 0.6612,
"step": 440
},
{
"epoch": 0.5311301268810859,
"grad_norm": 0.44490641355514526,
"learning_rate": 2.7469660457644857e-05,
"loss": 0.6786,
"step": 450
},
{
"epoch": 0.5429330185895545,
"grad_norm": 0.3714945912361145,
"learning_rate": 2.644218765442728e-05,
"loss": 0.6731,
"step": 460
},
{
"epoch": 0.554735910298023,
"grad_norm": 0.44450584053993225,
"learning_rate": 2.541226381312924e-05,
"loss": 0.6876,
"step": 470
},
{
"epoch": 0.5665388020064915,
"grad_norm": 0.4537455439567566,
"learning_rate": 2.4381639318000126e-05,
"loss": 0.6757,
"step": 480
},
{
"epoch": 0.5783416937149601,
"grad_norm": 0.4810272753238678,
"learning_rate": 2.3352065744070072e-05,
"loss": 0.7128,
"step": 490
},
{
"epoch": 0.5901445854234287,
"grad_norm": 0.49226102232933044,
"learning_rate": 2.2325292880299335e-05,
"loss": 0.6928,
"step": 500
},
{
"epoch": 0.6019474771318973,
"grad_norm": 0.46990668773651123,
"learning_rate": 2.1303065755763277e-05,
"loss": 0.6482,
"step": 510
},
{
"epoch": 0.6137503688403659,
"grad_norm": 0.43036311864852905,
"learning_rate": 2.0287121673926828e-05,
"loss": 0.6759,
"step": 520
},
{
"epoch": 0.6255532605488344,
"grad_norm": 0.373436838388443,
"learning_rate": 1.92791872600489e-05,
"loss": 0.674,
"step": 530
},
{
"epoch": 0.637356152257303,
"grad_norm": 0.4169735312461853,
"learning_rate": 1.8280975526734657e-05,
"loss": 0.6636,
"step": 540
},
{
"epoch": 0.6491590439657716,
"grad_norm": 0.3966214060783386,
"learning_rate": 1.7294182962622846e-05,
"loss": 0.658,
"step": 550
},
{
"epoch": 0.6609619356742402,
"grad_norm": 0.45455384254455566,
"learning_rate": 1.632048664915622e-05,
"loss": 0.6563,
"step": 560
},
{
"epoch": 0.6727648273827088,
"grad_norm": 0.513671875,
"learning_rate": 1.536154141033482e-05,
"loss": 0.6481,
"step": 570
},
{
"epoch": 0.6845677190911773,
"grad_norm": 0.4144147038459778,
"learning_rate": 1.4418977000296552e-05,
"loss": 0.681,
"step": 580
},
{
"epoch": 0.6963706107996459,
"grad_norm": 0.4277999997138977,
"learning_rate": 1.3494395333504622e-05,
"loss": 0.655,
"step": 590
},
{
"epoch": 0.7081735025081145,
"grad_norm": 0.4542660415172577,
"learning_rate": 1.2589367762249347e-05,
"loss": 0.6557,
"step": 600
},
{
"epoch": 0.7199763942165831,
"grad_norm": 0.518882155418396,
"learning_rate": 1.1705432406091085e-05,
"loss": 0.6504,
"step": 610
},
{
"epoch": 0.7317792859250516,
"grad_norm": 0.3764165937900543,
"learning_rate": 1.0844091537783316e-05,
"loss": 0.6509,
"step": 620
},
{
"epoch": 0.7435821776335202,
"grad_norm": 0.40605178475379944,
"learning_rate": 1.0006809030118181e-05,
"loss": 0.6619,
"step": 630
},
{
"epoch": 0.7553850693419888,
"grad_norm": 0.42034676671028137,
"learning_rate": 9.195007868033933e-06,
"loss": 0.6083,
"step": 640
},
{
"epoch": 0.7671879610504574,
"grad_norm": 0.4199008345603943,
"learning_rate": 8.410067730212439e-06,
"loss": 0.6464,
"step": 650
},
{
"epoch": 0.778990852758926,
"grad_norm": 0.4271228611469269,
"learning_rate": 7.653322644276779e-06,
"loss": 0.6342,
"step": 660
},
{
"epoch": 0.7907937444673945,
"grad_norm": 0.49036702513694763,
"learning_rate": 6.926058719574207e-06,
"loss": 0.6492,
"step": 670
},
{
"epoch": 0.8025966361758631,
"grad_norm": 0.4103890061378479,
"learning_rate": 6.229511961397455e-06,
"loss": 0.6294,
"step": 680
},
{
"epoch": 0.8143995278843317,
"grad_norm": 0.38033077120780945,
"learning_rate": 5.564866170359351e-06,
"loss": 0.638,
"step": 690
},
{
"epoch": 0.8262024195928003,
"grad_norm": 0.3652307987213135,
"learning_rate": 4.933250930490715e-06,
"loss": 0.6096,
"step": 700
},
{
"epoch": 0.8380053113012688,
"grad_norm": 0.5351826548576355,
"learning_rate": 4.335739689480778e-06,
"loss": 0.6285,
"step": 710
},
{
"epoch": 0.8498082030097374,
"grad_norm": 0.427626371383667,
"learning_rate": 3.773347934323035e-06,
"loss": 0.6257,
"step": 720
},
{
"epoch": 0.861611094718206,
"grad_norm": 0.46427205204963684,
"learning_rate": 3.2470314654667487e-06,
"loss": 0.6142,
"step": 730
},
{
"epoch": 0.8734139864266746,
"grad_norm": 0.5393053293228149,
"learning_rate": 2.7576847724075123e-06,
"loss": 0.6485,
"step": 740
},
{
"epoch": 0.8852168781351432,
"grad_norm": 0.4637604057788849,
"learning_rate": 2.3061395134774038e-06,
"loss": 0.6407,
"step": 750
},
{
"epoch": 0.8970197698436116,
"grad_norm": 0.40724095702171326,
"learning_rate": 1.8931631024185327e-06,
"loss": 0.6535,
"step": 760
},
{
"epoch": 0.9088226615520802,
"grad_norm": 0.4840000569820404,
"learning_rate": 1.5194574041419802e-06,
"loss": 0.642,
"step": 770
},
{
"epoch": 0.9206255532605488,
"grad_norm": 0.41105934977531433,
"learning_rate": 1.185657541888857e-06,
"loss": 0.617,
"step": 780
},
{
"epoch": 0.9324284449690174,
"grad_norm": 0.557059645652771,
"learning_rate": 8.923308178206552e-07,
"loss": 0.6415,
"step": 790
},
{
"epoch": 0.9442313366774859,
"grad_norm": 0.38617223501205444,
"learning_rate": 6.39975748873431e-07,
"loss": 0.6388,
"step": 800
},
{
"epoch": 0.9560342283859545,
"grad_norm": 0.4779140055179596,
"learning_rate": 4.2902121951440834e-07,
"loss": 0.6366,
"step": 810
},
{
"epoch": 0.9678371200944231,
"grad_norm": 0.4569835662841797,
"learning_rate": 2.5982575284084486e-07,
"loss": 0.6735,
"step": 820
},
{
"epoch": 0.9796400118028917,
"grad_norm": 0.4118465185165405,
"learning_rate": 1.3267690126008425e-07,
"loss": 0.6238,
"step": 830
},
{
"epoch": 0.9914429035113603,
"grad_norm": 0.4550204873085022,
"learning_rate": 4.779075778620079e-08,
"loss": 0.6613,
"step": 840
},
{
"epoch": 0.9997049277072882,
"step": 847,
"total_flos": 5.491458012295987e+18,
"train_loss": 0.7367874357185229,
"train_runtime": 38132.292,
"train_samples_per_second": 0.711,
"train_steps_per_second": 0.022
}
],
"logging_steps": 10,
"max_steps": 847,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.491458012295987e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}