End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +836 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: mistralai/Mistral-7B-v0.1
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: hp_ablations_mistral_adambeta1_0.92_dcftv1.2
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # hp_ablations_mistral_adambeta1_0.92_dcftv1.2
-This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.0765

 base_model: mistralai/Mistral-7B-v0.1
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: hp_ablations_mistral_adambeta1_0.92_dcftv1.2
 # hp_ablations_mistral_adambeta1_0.92_dcftv1.2
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the mlfoundations-dev/oh-dcft-v1.2_no-curation_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.0765

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.99695843190267,
+    "eval_loss": 0.07649821043014526,
+    "eval_runtime": 384.4835,
+    "eval_samples_per_second": 25.918,
+    "eval_steps_per_second": 0.406,
+    "total_flos": 1854056851046400.0,
+    "train_loss": 0.49201587243670264,
+    "train_runtime": 63342.9422,
+    "train_samples_per_second": 8.967,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.99695843190267,
+    "eval_loss": 0.07649821043014526,
+    "eval_runtime": 384.4835,
+    "eval_samples_per_second": 25.918,
+    "eval_steps_per_second": 0.406
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.99695843190267,
+    "total_flos": 1854056851046400.0,
+    "train_loss": 0.49201587243670264,
+    "train_runtime": 63342.9422,
+    "train_samples_per_second": 8.967,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,836 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.99695843190267,
+  "eval_steps": 500,
+  "global_step": 1107,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.027036160865157147,
+      "grad_norm": 3.3627634794530086,
+      "learning_rate": 5e-06,
+      "loss": 0.8839,
+      "step": 10
+    },
+    {
+      "epoch": 0.054072321730314295,
+      "grad_norm": 1.9703511972746102,
+      "learning_rate": 5e-06,
+      "loss": 0.6886,
+      "step": 20
+    },
+    {
+      "epoch": 0.08110848259547145,
+      "grad_norm": 1.895618934030067,
+      "learning_rate": 5e-06,
+      "loss": 0.649,
+      "step": 30
+    },
+    {
+      "epoch": 0.10814464346062859,
+      "grad_norm": 3.018653004689794,
+      "learning_rate": 5e-06,
+      "loss": 0.6375,
+      "step": 40
+    },
+    {
+      "epoch": 0.13518080432578575,
+      "grad_norm": 2.253174703767421,
+      "learning_rate": 5e-06,
+      "loss": 0.6233,
+      "step": 50
+    },
+    {
+      "epoch": 0.1622169651909429,
+      "grad_norm": 2.5770804247981793,
+      "learning_rate": 5e-06,
+      "loss": 0.6164,
+      "step": 60
+    },
+    {
+      "epoch": 0.18925312605610004,
+      "grad_norm": 1.9685443465813035,
+      "learning_rate": 5e-06,
+      "loss": 0.6127,
+      "step": 70
+    },
+    {
+      "epoch": 0.21628928692125718,
+      "grad_norm": 2.1325268775254034,
+      "learning_rate": 5e-06,
+      "loss": 0.6039,
+      "step": 80
+    },
+    {
+      "epoch": 0.24332544778641432,
+      "grad_norm": 1.4745840951249662,
+      "learning_rate": 5e-06,
+      "loss": 0.601,
+      "step": 90
+    },
+    {
+      "epoch": 0.2703616086515715,
+      "grad_norm": 1.971048371029162,
+      "learning_rate": 5e-06,
+      "loss": 0.6032,
+      "step": 100
+    },
+    {
+      "epoch": 0.29739776951672864,
+      "grad_norm": 1.6569004063408272,
+      "learning_rate": 5e-06,
+      "loss": 0.6012,
+      "step": 110
+    },
+    {
+      "epoch": 0.3244339303818858,
+      "grad_norm": 2.057401724353791,
+      "learning_rate": 5e-06,
+      "loss": 0.5971,
+      "step": 120
+    },
+    {
+      "epoch": 0.3514700912470429,
+      "grad_norm": 1.525579699451329,
+      "learning_rate": 5e-06,
+      "loss": 0.5993,
+      "step": 130
+    },
+    {
+      "epoch": 0.37850625211220007,
+      "grad_norm": 1.60308792852181,
+      "learning_rate": 5e-06,
+      "loss": 0.5985,
+      "step": 140
+    },
+    {
+      "epoch": 0.4055424129773572,
+      "grad_norm": 1.6398129211146073,
+      "learning_rate": 5e-06,
+      "loss": 0.5895,
+      "step": 150
+    },
+    {
+      "epoch": 0.43257857384251436,
+      "grad_norm": 1.9310976828981994,
+      "learning_rate": 5e-06,
+      "loss": 0.5918,
+      "step": 160
+    },
+    {
+      "epoch": 0.4596147347076715,
+      "grad_norm": 1.385584059931609,
+      "learning_rate": 5e-06,
+      "loss": 0.593,
+      "step": 170
+    },
+    {
+      "epoch": 0.48665089557282865,
+      "grad_norm": 1.2861159852159507,
+      "learning_rate": 5e-06,
+      "loss": 0.5918,
+      "step": 180
+    },
+    {
+      "epoch": 0.5136870564379858,
+      "grad_norm": 1.444176832285908,
+      "learning_rate": 5e-06,
+      "loss": 0.5885,
+      "step": 190
+    },
+    {
+      "epoch": 0.540723217303143,
+      "grad_norm": 1.4862672671337698,
+      "learning_rate": 5e-06,
+      "loss": 0.5867,
+      "step": 200
+    },
+    {
+      "epoch": 0.5677593781683001,
+      "grad_norm": 1.5075706410261351,
+      "learning_rate": 5e-06,
+      "loss": 0.5885,
+      "step": 210
+    },
+    {
+      "epoch": 0.5947955390334573,
+      "grad_norm": 1.2766576640698686,
+      "learning_rate": 5e-06,
+      "loss": 0.5831,
+      "step": 220
+    },
+    {
+      "epoch": 0.6218316998986144,
+      "grad_norm": 1.3449220128756942,
+      "learning_rate": 5e-06,
+      "loss": 0.5853,
+      "step": 230
+    },
+    {
+      "epoch": 0.6488678607637716,
+      "grad_norm": 1.2820825044433828,
+      "learning_rate": 5e-06,
+      "loss": 0.5816,
+      "step": 240
+    },
+    {
+      "epoch": 0.6759040216289287,
+      "grad_norm": 2.0436410705486927,
+      "learning_rate": 5e-06,
+      "loss": 0.575,
+      "step": 250
+    },
+    {
+      "epoch": 0.7029401824940859,
+      "grad_norm": 1.5111257680075043,
+      "learning_rate": 5e-06,
+      "loss": 0.5781,
+      "step": 260
+    },
+    {
+      "epoch": 0.729976343359243,
+      "grad_norm": 1.4284810364538454,
+      "learning_rate": 5e-06,
+      "loss": 0.5805,
+      "step": 270
+    },
+    {
+      "epoch": 0.7570125042244001,
+      "grad_norm": 1.2912410275065467,
+      "learning_rate": 5e-06,
+      "loss": 0.5753,
+      "step": 280
+    },
+    {
+      "epoch": 0.7840486650895573,
+      "grad_norm": 1.3589332122580937,
+      "learning_rate": 5e-06,
+      "loss": 0.5757,
+      "step": 290
+    },
+    {
+      "epoch": 0.8110848259547144,
+      "grad_norm": 1.4270507706028406,
+      "learning_rate": 5e-06,
+      "loss": 0.5759,
+      "step": 300
+    },
+    {
+      "epoch": 0.8381209868198716,
+      "grad_norm": 1.4167431698605149,
+      "learning_rate": 5e-06,
+      "loss": 0.5728,
+      "step": 310
+    },
+    {
+      "epoch": 0.8651571476850287,
+      "grad_norm": 1.476127949628171,
+      "learning_rate": 5e-06,
+      "loss": 0.5711,
+      "step": 320
+    },
+    {
+      "epoch": 0.8921933085501859,
+      "grad_norm": 1.3615337414773585,
+      "learning_rate": 5e-06,
+      "loss": 0.5729,
+      "step": 330
+    },
+    {
+      "epoch": 0.919229469415343,
+      "grad_norm": 1.3530496841079478,
+      "learning_rate": 5e-06,
+      "loss": 0.5797,
+      "step": 340
+    },
+    {
+      "epoch": 0.9462656302805001,
+      "grad_norm": 1.6939163932161898,
+      "learning_rate": 5e-06,
+      "loss": 0.5713,
+      "step": 350
+    },
+    {
+      "epoch": 0.9733017911456573,
+      "grad_norm": 1.2503611827765622,
+      "learning_rate": 5e-06,
+      "loss": 0.5711,
+      "step": 360
+    },
+    {
+      "epoch": 0.9976343359242987,
+      "eval_loss": 0.07110526412725449,
+      "eval_runtime": 379.7453,
+      "eval_samples_per_second": 26.241,
+      "eval_steps_per_second": 0.411,
+      "step": 369
+    },
+    {
+      "epoch": 1.0023656640757013,
+      "grad_norm": 2.951009083063046,
+      "learning_rate": 5e-06,
+      "loss": 0.5659,
+      "step": 370
+    },
+    {
+      "epoch": 1.0294018249408583,
+      "grad_norm": 2.275911553065513,
+      "learning_rate": 5e-06,
+      "loss": 0.4791,
+      "step": 380
+    },
+    {
+      "epoch": 1.0564379858060156,
+      "grad_norm": 1.9146887520712665,
+      "learning_rate": 5e-06,
+      "loss": 0.4742,
+      "step": 390
+    },
+    {
+      "epoch": 1.0834741466711728,
+      "grad_norm": 1.7060151561064856,
+      "learning_rate": 5e-06,
+      "loss": 0.4754,
+      "step": 400
+    },
+    {
+      "epoch": 1.1105103075363298,
+      "grad_norm": 1.4307333510541713,
+      "learning_rate": 5e-06,
+      "loss": 0.4723,
+      "step": 410
+    },
+    {
+      "epoch": 1.1375464684014869,
+      "grad_norm": 1.3914100165165024,
+      "learning_rate": 5e-06,
+      "loss": 0.4744,
+      "step": 420
+    },
+    {
+      "epoch": 1.1645826292666441,
+      "grad_norm": 1.7655561621577454,
+      "learning_rate": 5e-06,
+      "loss": 0.4823,
+      "step": 430
+    },
+    {
+      "epoch": 1.1916187901318014,
+      "grad_norm": 1.750541560120252,
+      "learning_rate": 5e-06,
+      "loss": 0.4746,
+      "step": 440
+    },
+    {
+      "epoch": 1.2186549509969584,
+      "grad_norm": 1.3542180298546558,
+      "learning_rate": 5e-06,
+      "loss": 0.4795,
+      "step": 450
+    },
+    {
+      "epoch": 1.2456911118621157,
+      "grad_norm": 1.3487709786995525,
+      "learning_rate": 5e-06,
+      "loss": 0.4811,
+      "step": 460
+    },
+    {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 1.4890081594482487,
+      "learning_rate": 5e-06,
+      "loss": 0.4838,
+      "step": 470
+    },
+    {
+      "epoch": 1.29976343359243,
+      "grad_norm": 1.6710110403178111,
+      "learning_rate": 5e-06,
+      "loss": 0.4803,
+      "step": 480
+    },
+    {
+      "epoch": 1.326799594457587,
+      "grad_norm": 1.3946918887630642,
+      "learning_rate": 5e-06,
+      "loss": 0.4814,
+      "step": 490
+    },
+    {
+      "epoch": 1.3538357553227442,
+      "grad_norm": 1.349529389304425,
+      "learning_rate": 5e-06,
+      "loss": 0.4838,
+      "step": 500
+    },
+    {
+      "epoch": 1.3808719161879013,
+      "grad_norm": 1.7196111346917198,
+      "learning_rate": 5e-06,
+      "loss": 0.4834,
+      "step": 510
+    },
+    {
+      "epoch": 1.4079080770530585,
+      "grad_norm": 1.3671419685233817,
+      "learning_rate": 5e-06,
+      "loss": 0.4868,
+      "step": 520
+    },
+    {
+      "epoch": 1.4349442379182156,
+      "grad_norm": 1.6390806735333066,
+      "learning_rate": 5e-06,
+      "loss": 0.4797,
+      "step": 530
+    },
+    {
+      "epoch": 1.4619803987833728,
+      "grad_norm": 2.033732375443223,
+      "learning_rate": 5e-06,
+      "loss": 0.4838,
+      "step": 540
+    },
+    {
+      "epoch": 1.4890165596485299,
+      "grad_norm": 1.6780750098228978,
+      "learning_rate": 5e-06,
+      "loss": 0.4828,
+      "step": 550
+    },
+    {
+      "epoch": 1.5160527205136871,
+      "grad_norm": 1.4650147086655332,
+      "learning_rate": 5e-06,
+      "loss": 0.4833,
+      "step": 560
+    },
+    {
+      "epoch": 1.5430888813788441,
+      "grad_norm": 1.6474963909234748,
+      "learning_rate": 5e-06,
+      "loss": 0.4801,
+      "step": 570
+    },
+    {
+      "epoch": 1.5701250422440014,
+      "grad_norm": 1.2672048074445312,
+      "learning_rate": 5e-06,
+      "loss": 0.4821,
+      "step": 580
+    },
+    {
+      "epoch": 1.5971612031091587,
+      "grad_norm": 1.2455105330473952,
+      "learning_rate": 5e-06,
+      "loss": 0.4858,
+      "step": 590
+    },
+    {
+      "epoch": 1.6241973639743157,
+      "grad_norm": 1.3365218119955355,
+      "learning_rate": 5e-06,
+      "loss": 0.4826,
+      "step": 600
+    },
+    {
+      "epoch": 1.6512335248394727,
+      "grad_norm": 1.453771936609652,
+      "learning_rate": 5e-06,
+      "loss": 0.4824,
+      "step": 610
+    },
+    {
+      "epoch": 1.67826968570463,
+      "grad_norm": 1.3826705153584662,
+      "learning_rate": 5e-06,
+      "loss": 0.4872,
+      "step": 620
+    },
+    {
+      "epoch": 1.7053058465697872,
+      "grad_norm": 1.3948399069255963,
+      "learning_rate": 5e-06,
+      "loss": 0.4923,
+      "step": 630
+    },
+    {
+      "epoch": 1.7323420074349443,
+      "grad_norm": 1.3015399889788772,
+      "learning_rate": 5e-06,
+      "loss": 0.4897,
+      "step": 640
+    },
+    {
+      "epoch": 1.7593781683001013,
+      "grad_norm": 1.5649098361174691,
+      "learning_rate": 5e-06,
+      "loss": 0.4882,
+      "step": 650
+    },
+    {
+      "epoch": 1.7864143291652586,
+      "grad_norm": 1.4369334673977943,
+      "learning_rate": 5e-06,
+      "loss": 0.4856,
+      "step": 660
+    },
+    {
+      "epoch": 1.8134504900304158,
+      "grad_norm": 1.3582444869164498,
+      "learning_rate": 5e-06,
+      "loss": 0.4872,
+      "step": 670
+    },
+    {
+      "epoch": 1.8404866508955728,
+      "grad_norm": 1.4410245166819187,
+      "learning_rate": 5e-06,
+      "loss": 0.4902,
+      "step": 680
+    },
+    {
+      "epoch": 1.8675228117607299,
+      "grad_norm": 1.2401548016118424,
+      "learning_rate": 5e-06,
+      "loss": 0.4876,
+      "step": 690
+    },
+    {
+      "epoch": 1.8945589726258871,
+      "grad_norm": 1.3435104700539477,
+      "learning_rate": 5e-06,
+      "loss": 0.4906,
+      "step": 700
+    },
+    {
+      "epoch": 1.9215951334910444,
+      "grad_norm": 1.4535634930233825,
+      "learning_rate": 5e-06,
+      "loss": 0.4955,
+      "step": 710
+    },
+    {
+      "epoch": 1.9486312943562014,
+      "grad_norm": 1.267760090624259,
+      "learning_rate": 5e-06,
+      "loss": 0.4887,
+      "step": 720
+    },
+    {
+      "epoch": 1.9756674552213584,
+      "grad_norm": 1.2488776839475728,
+      "learning_rate": 5e-06,
+      "loss": 0.4845,
+      "step": 730
+    },
+    {
+      "epoch": 1.9972963839134843,
+      "eval_loss": 0.07172359526157379,
+      "eval_runtime": 380.951,
+      "eval_samples_per_second": 26.158,
+      "eval_steps_per_second": 0.41,
+      "step": 738
+    },
+    {
+      "epoch": 2.0047313281514025,
+      "grad_norm": 3.0892791498740326,
+      "learning_rate": 5e-06,
+      "loss": 0.4728,
+      "step": 740
+    },
+    {
+      "epoch": 2.0317674890165596,
+      "grad_norm": 2.0695271423861037,
+      "learning_rate": 5e-06,
+      "loss": 0.3816,
+      "step": 750
+    },
+    {
+      "epoch": 2.0588036498817166,
+      "grad_norm": 1.694338271769957,
+      "learning_rate": 5e-06,
+      "loss": 0.3823,
+      "step": 760
+    },
+    {
+      "epoch": 2.085839810746874,
+      "grad_norm": 1.492703018660841,
+      "learning_rate": 5e-06,
+      "loss": 0.3768,
+      "step": 770
+    },
+    {
+      "epoch": 2.112875971612031,
+      "grad_norm": 1.6552377461262013,
+      "learning_rate": 5e-06,
+      "loss": 0.3769,
+      "step": 780
+    },
+    {
+      "epoch": 2.139912132477188,
+      "grad_norm": 1.9420361419361558,
+      "learning_rate": 5e-06,
+      "loss": 0.3761,
+      "step": 790
+    },
+    {
+      "epoch": 2.1669482933423456,
+      "grad_norm": 1.5563624108859666,
+      "learning_rate": 5e-06,
+      "loss": 0.3826,
+      "step": 800
+    },
+    {
+      "epoch": 2.1939844542075027,
+      "grad_norm": 1.7915429380933352,
+      "learning_rate": 5e-06,
+      "loss": 0.3804,
+      "step": 810
+    },
+    {
+      "epoch": 2.2210206150726597,
+      "grad_norm": 1.5584683923881884,
+      "learning_rate": 5e-06,
+      "loss": 0.3816,
+      "step": 820
+    },
+    {
+      "epoch": 2.2480567759378167,
+      "grad_norm": 1.60097536516568,
+      "learning_rate": 5e-06,
+      "loss": 0.3844,
+      "step": 830
+    },
+    {
+      "epoch": 2.2750929368029738,
+      "grad_norm": 1.6548064908062865,
+      "learning_rate": 5e-06,
+      "loss": 0.3865,
+      "step": 840
+    },
+    {
+      "epoch": 2.3021290976681312,
+      "grad_norm": 1.7027619140998314,
+      "learning_rate": 5e-06,
+      "loss": 0.3818,
+      "step": 850
+    },
+    {
+      "epoch": 2.3291652585332883,
+      "grad_norm": 1.6016849568444829,
+      "learning_rate": 5e-06,
+      "loss": 0.3847,
+      "step": 860
+    },
+    {
+      "epoch": 2.3562014193984453,
+      "grad_norm": 1.8796231385046944,
+      "learning_rate": 5e-06,
+      "loss": 0.39,
+      "step": 870
+    },
+    {
+      "epoch": 2.3832375802636028,
+      "grad_norm": 1.5319470307978418,
+      "learning_rate": 5e-06,
+      "loss": 0.3892,
+      "step": 880
+    },
+    {
+      "epoch": 2.41027374112876,
+      "grad_norm": 1.7017719120193255,
+      "learning_rate": 5e-06,
+      "loss": 0.3881,
+      "step": 890
+    },
+    {
+      "epoch": 2.437309901993917,
+      "grad_norm": 1.5344718368107968,
+      "learning_rate": 5e-06,
+      "loss": 0.3873,
+      "step": 900
+    },
+    {
+      "epoch": 2.464346062859074,
+      "grad_norm": 1.6102507634771308,
+      "learning_rate": 5e-06,
+      "loss": 0.3854,
+      "step": 910
+    },
+    {
+      "epoch": 2.4913822237242313,
+      "grad_norm": 1.6690069949519504,
+      "learning_rate": 5e-06,
+      "loss": 0.3872,
+      "step": 920
+    },
+    {
+      "epoch": 2.5184183845893884,
+      "grad_norm": 1.5743935677314018,
+      "learning_rate": 5e-06,
+      "loss": 0.3867,
+      "step": 930
+    },
+    {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 1.5720807503966818,
+      "learning_rate": 5e-06,
+      "loss": 0.394,
+      "step": 940
+    },
+    {
+      "epoch": 2.5724907063197024,
+      "grad_norm": 1.4596744684339498,
+      "learning_rate": 5e-06,
+      "loss": 0.3896,
+      "step": 950
+    },
+    {
+      "epoch": 2.59952686718486,
+      "grad_norm": 1.4774112887538513,
+      "learning_rate": 5e-06,
+      "loss": 0.3959,
+      "step": 960
+    },
+    {
+      "epoch": 2.626563028050017,
+      "grad_norm": 1.6927054304904465,
+      "learning_rate": 5e-06,
+      "loss": 0.3946,
+      "step": 970
+    },
+    {
+      "epoch": 2.653599188915174,
+      "grad_norm": 1.6990634986226298,
+      "learning_rate": 5e-06,
+      "loss": 0.399,
+      "step": 980
+    },
+    {
+      "epoch": 2.6806353497803315,
+      "grad_norm": 1.5811069605653503,
+      "learning_rate": 5e-06,
+      "loss": 0.3968,
+      "step": 990
+    },
+    {
+      "epoch": 2.7076715106454885,
+      "grad_norm": 1.929742002611046,
+      "learning_rate": 5e-06,
+      "loss": 0.3906,
+      "step": 1000
+    },
+    {
+      "epoch": 2.7347076715106455,
+      "grad_norm": 1.4332871535309044,
+      "learning_rate": 5e-06,
+      "loss": 0.3984,
+      "step": 1010
+    },
+    {
+      "epoch": 2.7617438323758026,
+      "grad_norm": 1.6711055842838813,
+      "learning_rate": 5e-06,
+      "loss": 0.4002,
+      "step": 1020
+    },
+    {
+      "epoch": 2.7887799932409596,
+      "grad_norm": 1.6261611517040526,
+      "learning_rate": 5e-06,
+      "loss": 0.3984,
+      "step": 1030
+    },
+    {
+      "epoch": 2.815816154106117,
+      "grad_norm": 1.4326621075330992,
+      "learning_rate": 5e-06,
+      "loss": 0.3972,
+      "step": 1040
+    },
+    {
+      "epoch": 2.842852314971274,
+      "grad_norm": 1.4683518261050355,
+      "learning_rate": 5e-06,
+      "loss": 0.399,
+      "step": 1050
+    },
+    {
+      "epoch": 2.869888475836431,
+      "grad_norm": 1.4432147424830148,
+      "learning_rate": 5e-06,
+      "loss": 0.3953,
+      "step": 1060
+    },
+    {
+      "epoch": 2.8969246367015886,
+      "grad_norm": 1.4795447507798194,
+      "learning_rate": 5e-06,
+      "loss": 0.4029,
+      "step": 1070
+    },
+    {
+      "epoch": 2.9239607975667457,
+      "grad_norm": 1.54599126728265,
+      "learning_rate": 5e-06,
+      "loss": 0.3982,
+      "step": 1080
+    },
+    {
+      "epoch": 2.9509969584319027,
+      "grad_norm": 1.4383101466258315,
+      "learning_rate": 5e-06,
+      "loss": 0.4018,
+      "step": 1090
+    },
+    {
+      "epoch": 2.9780331192970597,
+      "grad_norm": 1.3572001471611468,
+      "learning_rate": 5e-06,
+      "loss": 0.401,
+      "step": 1100
+    },
+    {
+      "epoch": 2.99695843190267,
+      "eval_loss": 0.07649821043014526,
+      "eval_runtime": 382.4679,
+      "eval_samples_per_second": 26.054,
+      "eval_steps_per_second": 0.408,
+      "step": 1107
+    },
+    {
+      "epoch": 2.99695843190267,
+      "step": 1107,
+      "total_flos": 1854056851046400.0,
+      "train_loss": 0.49201587243670264,
+      "train_runtime": 63342.9422,
+      "train_samples_per_second": 8.967,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1107,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1854056851046400.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed