Boffl
/

BullingerLM-llama3.1-70B-instruct-qa

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997049277072882,
+  "eval_steps": 500,
+  "global_step": 847,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011802891708468575,
+      "grad_norm": 0.6382197141647339,
+      "learning_rate": 5.294117647058824e-06,
+      "loss": 1.7524,
+      "step": 10
+    },
+    {
+      "epoch": 0.02360578341693715,
+      "grad_norm": 0.5001206994056702,
+      "learning_rate": 1.1176470588235295e-05,
+      "loss": 1.3315,
+      "step": 20
+    },
+    {
+      "epoch": 0.03540867512540572,
+      "grad_norm": 0.41650518774986267,
+      "learning_rate": 1.7058823529411767e-05,
+      "loss": 1.1148,
+      "step": 30
+    },
+    {
+      "epoch": 0.0472115668338743,
+      "grad_norm": 0.42574718594551086,
+      "learning_rate": 2.235294117647059e-05,
+      "loss": 1.0196,
+      "step": 40
+    },
+    {
+      "epoch": 0.05901445854234287,
+      "grad_norm": 0.3408316373825073,
+      "learning_rate": 2.823529411764706e-05,
+      "loss": 0.94,
+      "step": 50
+    },
+    {
+      "epoch": 0.07081735025081144,
+      "grad_norm": 0.39876773953437805,
+      "learning_rate": 3.411764705882353e-05,
+      "loss": 0.8918,
+      "step": 60
+    },
+    {
+      "epoch": 0.08262024195928003,
+      "grad_norm": 0.32425975799560547,
+      "learning_rate": 4e-05,
+      "loss": 0.8412,
+      "step": 70
+    },
+    {
+      "epoch": 0.0944231336677486,
+      "grad_norm": 0.40873634815216064,
+      "learning_rate": 4.588235294117647e-05,
+      "loss": 0.887,
+      "step": 80
+    },
+    {
+      "epoch": 0.10622602537621717,
+      "grad_norm": 0.4909669756889343,
+      "learning_rate": 4.9998087784700426e-05,
+      "loss": 0.8888,
+      "step": 90
+    },
+    {
+      "epoch": 0.11802891708468574,
+      "grad_norm": 0.3897865414619446,
+      "learning_rate": 4.996410098317137e-05,
+      "loss": 0.8555,
+      "step": 100
+    },
+    {
+      "epoch": 0.1298318087931543,
+      "grad_norm": 0.3305865228176117,
+      "learning_rate": 4.989723448187131e-05,
+      "loss": 0.8424,
+      "step": 110
+    },
+    {
+      "epoch": 0.14163470050162288,
+      "grad_norm": 0.3554224669933319,
+      "learning_rate": 4.9845268462432916e-05,
+      "loss": 0.8445,
+      "step": 120
+    },
+    {
+      "epoch": 0.15343759221009148,
+      "grad_norm": 0.46097129583358765,
+      "learning_rate": 4.970969070763177e-05,
+      "loss": 0.8377,
+      "step": 130
+    },
+    {
+      "epoch": 0.16524048391856005,
+      "grad_norm": 0.3145534098148346,
+      "learning_rate": 4.953211814536217e-05,
+      "loss": 0.759,
+      "step": 140
+    },
+    {
+      "epoch": 0.17704337562702863,
+      "grad_norm": 0.42392656207084656,
+      "learning_rate": 4.931285256513868e-05,
+      "loss": 0.8121,
+      "step": 150
+    },
+    {
+      "epoch": 0.1888462673354972,
+      "grad_norm": 0.4339812994003296,
+      "learning_rate": 4.905226661492095e-05,
+      "loss": 0.7896,
+      "step": 160
+    },
+    {
+      "epoch": 0.20064915904396577,
+      "grad_norm": 0.44723227620124817,
+      "learning_rate": 4.8750803167788136e-05,
+      "loss": 0.8057,
+      "step": 170
+    },
+    {
+      "epoch": 0.21245205075243434,
+      "grad_norm": 0.46169158816337585,
+      "learning_rate": 4.840897456926373e-05,
+      "loss": 0.7724,
+      "step": 180
+    },
+    {
+      "epoch": 0.2242549424609029,
+      "grad_norm": 0.41829928755760193,
+      "learning_rate": 4.8027361766570117e-05,
+      "loss": 0.7458,
+      "step": 190
+    },
+    {
+      "epoch": 0.23605783416937148,
+      "grad_norm": 0.4120149612426758,
+      "learning_rate": 4.760661332129254e-05,
+      "loss": 0.7686,
+      "step": 200
+    },
+    {
+      "epoch": 0.24786072587784008,
+      "grad_norm": 0.3918631970882416,
+      "learning_rate": 4.7147444307130686e-05,
+      "loss": 0.769,
+      "step": 210
+    },
+    {
+      "epoch": 0.2596636175863086,
+      "grad_norm": 0.4276711642742157,
+      "learning_rate": 4.665063509461097e-05,
+      "loss": 0.7574,
+      "step": 220
+    },
+    {
+      "epoch": 0.2714665092947772,
+      "grad_norm": 0.42904192209243774,
+      "learning_rate": 4.6117030024825114e-05,
+      "loss": 0.7826,
+      "step": 230
+    },
+    {
+      "epoch": 0.28326940100324577,
+      "grad_norm": 0.5145927667617798,
+      "learning_rate": 4.554753597444896e-05,
+      "loss": 0.7954,
+      "step": 240
+    },
+    {
+      "epoch": 0.29507229271171437,
+      "grad_norm": 0.3549771010875702,
+      "learning_rate": 4.494312081448029e-05,
+      "loss": 0.7527,
+      "step": 250
+    },
+    {
+      "epoch": 0.30687518442018297,
+      "grad_norm": 0.4441188871860504,
+      "learning_rate": 4.4304811765315105e-05,
+      "loss": 0.7321,
+      "step": 260
+    },
+    {
+      "epoch": 0.3186780761286515,
+      "grad_norm": 0.3967060148715973,
+      "learning_rate": 4.3633693650957976e-05,
+      "loss": 0.7047,
+      "step": 270
+    },
+    {
+      "epoch": 0.3304809678371201,
+      "grad_norm": 0.44348135590553284,
+      "learning_rate": 4.293090705533342e-05,
+      "loss": 0.7431,
+      "step": 280
+    },
+    {
+      "epoch": 0.34228385954558865,
+      "grad_norm": 0.9141893982887268,
+      "learning_rate": 4.219764638383177e-05,
+      "loss": 0.7177,
+      "step": 290
+    },
+    {
+      "epoch": 0.35408675125405725,
+      "grad_norm": 0.45525214076042175,
+      "learning_rate": 4.1435157833383955e-05,
+      "loss": 0.7128,
+      "step": 300
+    },
+    {
+      "epoch": 0.3658896429625258,
+      "grad_norm": 0.537662148475647,
+      "learning_rate": 4.06447372745151e-05,
+      "loss": 0.7162,
+      "step": 310
+    },
+    {
+      "epoch": 0.3776925346709944,
+      "grad_norm": 0.4020293653011322,
+      "learning_rate": 3.982772804897649e-05,
+      "loss": 0.7212,
+      "step": 320
+    },
+    {
+      "epoch": 0.389495426379463,
+      "grad_norm": 0.6390876173973083,
+      "learning_rate": 3.898551868669883e-05,
+      "loss": 0.716,
+      "step": 330
+    },
+    {
+      "epoch": 0.40129831808793154,
+      "grad_norm": 0.47102075815200806,
+      "learning_rate": 3.811954054594702e-05,
+      "loss": 0.733,
+      "step": 340
+    },
+    {
+      "epoch": 0.41310120979640014,
+      "grad_norm": 0.5660268664360046,
+      "learning_rate": 3.723126538068686e-05,
+      "loss": 0.764,
+      "step": 350
+    },
+    {
+      "epoch": 0.4249041015048687,
+      "grad_norm": 0.595162570476532,
+      "learning_rate": 3.632220283929822e-05,
+      "loss": 0.7302,
+      "step": 360
+    },
+    {
+      "epoch": 0.4367069932133373,
+      "grad_norm": 0.5331649780273438,
+      "learning_rate": 3.5393897898885606e-05,
+      "loss": 0.7127,
+      "step": 370
+    },
+    {
+      "epoch": 0.4485098849218058,
+      "grad_norm": 0.4248451590538025,
+      "learning_rate": 3.444792823954651e-05,
+      "loss": 0.6933,
+      "step": 380
+    },
+    {
+      "epoch": 0.4603127766302744,
+      "grad_norm": 0.5570621490478516,
+      "learning_rate": 3.348590156306017e-05,
+      "loss": 0.7012,
+      "step": 390
+    },
+    {
+      "epoch": 0.47211566833874297,
+      "grad_norm": 0.41210871934890747,
+      "learning_rate": 3.25094528605536e-05,
+      "loss": 0.7006,
+      "step": 400
+    },
+    {
+      "epoch": 0.48391856004721157,
+      "grad_norm": 0.5020595788955688,
+      "learning_rate": 3.152024163378867e-05,
+      "loss": 0.7159,
+      "step": 410
+    },
+    {
+      "epoch": 0.49572145175568016,
+      "grad_norm": 0.5407310724258423,
+      "learning_rate": 3.051994907479265e-05,
+      "loss": 0.7002,
+      "step": 420
+    },
+    {
+      "epoch": 0.5075243434641488,
+      "grad_norm": 0.422695130109787,
+      "learning_rate": 2.9510275208625522e-05,
+      "loss": 0.6721,
+      "step": 430
+    },
+    {
+      "epoch": 0.5193272351726173,
+      "grad_norm": 0.4953523576259613,
+      "learning_rate": 2.849293600414002e-05,
+      "loss": 0.6612,
+      "step": 440
+    },
+    {
+      "epoch": 0.5311301268810859,
+      "grad_norm": 0.44490641355514526,
+      "learning_rate": 2.7469660457644857e-05,
+      "loss": 0.6786,
+      "step": 450
+    },
+    {
+      "epoch": 0.5429330185895545,
+      "grad_norm": 0.3714945912361145,
+      "learning_rate": 2.644218765442728e-05,
+      "loss": 0.6731,
+      "step": 460
+    },
+    {
+      "epoch": 0.554735910298023,
+      "grad_norm": 0.44450584053993225,
+      "learning_rate": 2.541226381312924e-05,
+      "loss": 0.6876,
+      "step": 470
+    },
+    {
+      "epoch": 0.5665388020064915,
+      "grad_norm": 0.4537455439567566,
+      "learning_rate": 2.4381639318000126e-05,
+      "loss": 0.6757,
+      "step": 480
+    },
+    {
+      "epoch": 0.5783416937149601,
+      "grad_norm": 0.4810272753238678,
+      "learning_rate": 2.3352065744070072e-05,
+      "loss": 0.7128,
+      "step": 490
+    },
+    {
+      "epoch": 0.5901445854234287,
+      "grad_norm": 0.49226102232933044,
+      "learning_rate": 2.2325292880299335e-05,
+      "loss": 0.6928,
+      "step": 500
+    },
+    {
+      "epoch": 0.6019474771318973,
+      "grad_norm": 0.46990668773651123,
+      "learning_rate": 2.1303065755763277e-05,
+      "loss": 0.6482,
+      "step": 510
+    },
+    {
+      "epoch": 0.6137503688403659,
+      "grad_norm": 0.43036311864852905,
+      "learning_rate": 2.0287121673926828e-05,
+      "loss": 0.6759,
+      "step": 520
+    },
+    {
+      "epoch": 0.6255532605488344,
+      "grad_norm": 0.373436838388443,
+      "learning_rate": 1.92791872600489e-05,
+      "loss": 0.674,
+      "step": 530
+    },
+    {
+      "epoch": 0.637356152257303,
+      "grad_norm": 0.4169735312461853,
+      "learning_rate": 1.8280975526734657e-05,
+      "loss": 0.6636,
+      "step": 540
+    },
+    {
+      "epoch": 0.6491590439657716,
+      "grad_norm": 0.3966214060783386,
+      "learning_rate": 1.7294182962622846e-05,
+      "loss": 0.658,
+      "step": 550
+    },
+    {
+      "epoch": 0.6609619356742402,
+      "grad_norm": 0.45455384254455566,
+      "learning_rate": 1.632048664915622e-05,
+      "loss": 0.6563,
+      "step": 560
+    },
+    {
+      "epoch": 0.6727648273827088,
+      "grad_norm": 0.513671875,
+      "learning_rate": 1.536154141033482e-05,
+      "loss": 0.6481,
+      "step": 570
+    },
+    {
+      "epoch": 0.6845677190911773,
+      "grad_norm": 0.4144147038459778,
+      "learning_rate": 1.4418977000296552e-05,
+      "loss": 0.681,
+      "step": 580
+    },
+    {
+      "epoch": 0.6963706107996459,
+      "grad_norm": 0.4277999997138977,
+      "learning_rate": 1.3494395333504622e-05,
+      "loss": 0.655,
+      "step": 590
+    },
+    {
+      "epoch": 0.7081735025081145,
+      "grad_norm": 0.4542660415172577,
+      "learning_rate": 1.2589367762249347e-05,
+      "loss": 0.6557,
+      "step": 600
+    },
+    {
+      "epoch": 0.7199763942165831,
+      "grad_norm": 0.518882155418396,
+      "learning_rate": 1.1705432406091085e-05,
+      "loss": 0.6504,
+      "step": 610
+    },
+    {
+      "epoch": 0.7317792859250516,
+      "grad_norm": 0.3764165937900543,
+      "learning_rate": 1.0844091537783316e-05,
+      "loss": 0.6509,
+      "step": 620
+    },
+    {
+      "epoch": 0.7435821776335202,
+      "grad_norm": 0.40605178475379944,
+      "learning_rate": 1.0006809030118181e-05,
+      "loss": 0.6619,
+      "step": 630
+    },
+    {
+      "epoch": 0.7553850693419888,
+      "grad_norm": 0.42034676671028137,
+      "learning_rate": 9.195007868033933e-06,
+      "loss": 0.6083,
+      "step": 640
+    },
+    {
+      "epoch": 0.7671879610504574,
+      "grad_norm": 0.4199008345603943,
+      "learning_rate": 8.410067730212439e-06,
+      "loss": 0.6464,
+      "step": 650
+    },
+    {
+      "epoch": 0.778990852758926,
+      "grad_norm": 0.4271228611469269,
+      "learning_rate": 7.653322644276779e-06,
+      "loss": 0.6342,
+      "step": 660
+    },
+    {
+      "epoch": 0.7907937444673945,
+      "grad_norm": 0.49036702513694763,
+      "learning_rate": 6.926058719574207e-06,
+      "loss": 0.6492,
+      "step": 670
+    },
+    {
+      "epoch": 0.8025966361758631,
+      "grad_norm": 0.4103890061378479,
+      "learning_rate": 6.229511961397455e-06,
+      "loss": 0.6294,
+      "step": 680
+    },
+    {
+      "epoch": 0.8143995278843317,
+      "grad_norm": 0.38033077120780945,
+      "learning_rate": 5.564866170359351e-06,
+      "loss": 0.638,
+      "step": 690
+    },
+    {
+      "epoch": 0.8262024195928003,
+      "grad_norm": 0.3652307987213135,
+      "learning_rate": 4.933250930490715e-06,
+      "loss": 0.6096,
+      "step": 700
+    },
+    {
+      "epoch": 0.8380053113012688,
+      "grad_norm": 0.5351826548576355,
+      "learning_rate": 4.335739689480778e-06,
+      "loss": 0.6285,
+      "step": 710
+    },
+    {
+      "epoch": 0.8498082030097374,
+      "grad_norm": 0.427626371383667,
+      "learning_rate": 3.773347934323035e-06,
+      "loss": 0.6257,
+      "step": 720
+    },
+    {
+      "epoch": 0.861611094718206,
+      "grad_norm": 0.46427205204963684,
+      "learning_rate": 3.2470314654667487e-06,
+      "loss": 0.6142,
+      "step": 730
+    },
+    {
+      "epoch": 0.8734139864266746,
+      "grad_norm": 0.5393053293228149,
+      "learning_rate": 2.7576847724075123e-06,
+      "loss": 0.6485,
+      "step": 740
+    },
+    {
+      "epoch": 0.8852168781351432,
+      "grad_norm": 0.4637604057788849,
+      "learning_rate": 2.3061395134774038e-06,
+      "loss": 0.6407,
+      "step": 750
+    },
+    {
+      "epoch": 0.8970197698436116,
+      "grad_norm": 0.40724095702171326,
+      "learning_rate": 1.8931631024185327e-06,
+      "loss": 0.6535,
+      "step": 760
+    },
+    {
+      "epoch": 0.9088226615520802,
+      "grad_norm": 0.4840000569820404,
+      "learning_rate": 1.5194574041419802e-06,
+      "loss": 0.642,
+      "step": 770
+    },
+    {
+      "epoch": 0.9206255532605488,
+      "grad_norm": 0.41105934977531433,
+      "learning_rate": 1.185657541888857e-06,
+      "loss": 0.617,
+      "step": 780
+    },
+    {
+      "epoch": 0.9324284449690174,
+      "grad_norm": 0.557059645652771,
+      "learning_rate": 8.923308178206552e-07,
+      "loss": 0.6415,
+      "step": 790
+    },
+    {
+      "epoch": 0.9442313366774859,
+      "grad_norm": 0.38617223501205444,
+      "learning_rate": 6.39975748873431e-07,
+      "loss": 0.6388,
+      "step": 800
+    },
+    {
+      "epoch": 0.9560342283859545,
+      "grad_norm": 0.4779140055179596,
+      "learning_rate": 4.2902121951440834e-07,
+      "loss": 0.6366,
+      "step": 810
+    },
+    {
+      "epoch": 0.9678371200944231,
+      "grad_norm": 0.4569835662841797,
+      "learning_rate": 2.5982575284084486e-07,
+      "loss": 0.6735,
+      "step": 820
+    },
+    {
+      "epoch": 0.9796400118028917,
+      "grad_norm": 0.4118465185165405,
+      "learning_rate": 1.3267690126008425e-07,
+      "loss": 0.6238,
+      "step": 830
+    },
+    {
+      "epoch": 0.9914429035113603,
+      "grad_norm": 0.4550204873085022,
+      "learning_rate": 4.779075778620079e-08,
+      "loss": 0.6613,
+      "step": 840
+    },
+    {
+      "epoch": 0.9997049277072882,
+      "step": 847,
+      "total_flos": 5.491458012295987e+18,
+      "train_loss": 0.7367874357185229,
+      "train_runtime": 38132.292,
+      "train_samples_per_second": 0.711,
+      "train_steps_per_second": 0.022
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 847,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.491458012295987e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}