End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +836 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: mistralai/Mistral-7B-v0.1
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: hp_ablations_mistral_scheduler_cosine_warmup0.05_minlr1e-7_dcftv1.2
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # hp_ablations_mistral_scheduler_cosine_warmup0.05_minlr1e-7_dcftv1.2
-This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.0734

 base_model: mistralai/Mistral-7B-v0.1
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: hp_ablations_mistral_scheduler_cosine_warmup0.05_minlr1e-7_dcftv1.2
 # hp_ablations_mistral_scheduler_cosine_warmup0.05_minlr1e-7_dcftv1.2
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the mlfoundations-dev/oh-dcft-v1.2_no-curation_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.0734

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.99695843190267,
+    "eval_loss": 0.07343952357769012,
+    "eval_runtime": 380.6676,
+    "eval_samples_per_second": 26.178,
+    "eval_steps_per_second": 0.41,
+    "total_flos": 1854056851046400.0,
+    "train_loss": 0.4996785878489011,
+    "train_runtime": 63355.7173,
+    "train_samples_per_second": 8.965,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.99695843190267,
+    "eval_loss": 0.07343952357769012,
+    "eval_runtime": 380.6676,
+    "eval_samples_per_second": 26.178,
+    "eval_steps_per_second": 0.41
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.99695843190267,
+    "total_flos": 1854056851046400.0,
+    "train_loss": 0.4996785878489011,
+    "train_runtime": 63355.7173,
+    "train_samples_per_second": 8.965,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,836 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.99695843190267,
+  "eval_steps": 500,
+  "global_step": 1107,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.027036160865157147,
+      "grad_norm": 5.9832664062841445,
+      "learning_rate": 8.928571428571429e-07,
+      "loss": 0.8856,
+      "step": 10
+    },
+    {
+      "epoch": 0.054072321730314295,
+      "grad_norm": 2.422574640898435,
+      "learning_rate": 1.7857142857142859e-06,
+      "loss": 0.7436,
+      "step": 20
+    },
+    {
+      "epoch": 0.08110848259547145,
+      "grad_norm": 2.306483898812809,
+      "learning_rate": 2.6785714285714285e-06,
+      "loss": 0.6792,
+      "step": 30
+    },
+    {
+      "epoch": 0.10814464346062859,
+      "grad_norm": 1.737436000738722,
+      "learning_rate": 3.5714285714285718e-06,
+      "loss": 0.655,
+      "step": 40
+    },
+    {
+      "epoch": 0.13518080432578575,
+      "grad_norm": 1.823323318308892,
+      "learning_rate": 4.464285714285715e-06,
+      "loss": 0.6334,
+      "step": 50
+    },
+    {
+      "epoch": 0.1622169651909429,
+      "grad_norm": 3.6157460204989493,
+      "learning_rate": 4.999824876184517e-06,
+      "loss": 0.6258,
+      "step": 60
+    },
+    {
+      "epoch": 0.18925312605610004,
+      "grad_norm": 2.21572570092998,
+      "learning_rate": 4.997855020764802e-06,
+      "loss": 0.62,
+      "step": 70
+    },
+    {
+      "epoch": 0.21628928692125718,
+      "grad_norm": 2.4335006932811805,
+      "learning_rate": 4.99369817095752e-06,
+      "loss": 0.6104,
+      "step": 80
+    },
+    {
+      "epoch": 0.24332544778641432,
+      "grad_norm": 3.0492932227575755,
+      "learning_rate": 4.98735804062935e-06,
+      "loss": 0.6067,
+      "step": 90
+    },
+    {
+      "epoch": 0.2703616086515715,
+      "grad_norm": 2.148281590957587,
+      "learning_rate": 4.978840294261777e-06,
+      "loss": 0.6084,
+      "step": 100
+    },
+    {
+      "epoch": 0.29739776951672864,
+      "grad_norm": 1.7664347921328134,
+      "learning_rate": 4.968152541890256e-06,
+      "loss": 0.6054,
+      "step": 110
+    },
+    {
+      "epoch": 0.3244339303818858,
+      "grad_norm": 1.8560219332271684,
+      "learning_rate": 4.955304332305161e-06,
+      "loss": 0.6012,
+      "step": 120
+    },
+    {
+      "epoch": 0.3514700912470429,
+      "grad_norm": 1.8079805894465493,
+      "learning_rate": 4.9403071445205725e-06,
+      "loss": 0.6028,
+      "step": 130
+    },
+    {
+      "epoch": 0.37850625211220007,
+      "grad_norm": 1.8246347697698218,
+      "learning_rate": 4.923174377518552e-06,
+      "loss": 0.6014,
+      "step": 140
+    },
+    {
+      "epoch": 0.4055424129773572,
+      "grad_norm": 2.749005509317375,
+      "learning_rate": 4.903921338278051e-06,
+      "loss": 0.5932,
+      "step": 150
+    },
+    {
+      "epoch": 0.43257857384251436,
+      "grad_norm": 2.244874423316534,
+      "learning_rate": 4.882565228099148e-06,
+      "loss": 0.5958,
+      "step": 160
+    },
+    {
+      "epoch": 0.4596147347076715,
+      "grad_norm": 1.8552147310211213,
+      "learning_rate": 4.859125127234842e-06,
+      "loss": 0.5946,
+      "step": 170
+    },
+    {
+      "epoch": 0.48665089557282865,
+      "grad_norm": 2.058882586288013,
+      "learning_rate": 4.833621977844127e-06,
+      "loss": 0.5931,
+      "step": 180
+    },
+    {
+      "epoch": 0.5136870564379858,
+      "grad_norm": 1.7633755533968514,
+      "learning_rate": 4.80607856528157e-06,
+      "loss": 0.5905,
+      "step": 190
+    },
+    {
+      "epoch": 0.540723217303143,
+      "grad_norm": 1.5062385913475653,
+      "learning_rate": 4.776519497740133e-06,
+      "loss": 0.5881,
+      "step": 200
+    },
+    {
+      "epoch": 0.5677593781683001,
+      "grad_norm": 1.649775437241844,
+      "learning_rate": 4.744971184265394e-06,
+      "loss": 0.5898,
+      "step": 210
+    },
+    {
+      "epoch": 0.5947955390334573,
+      "grad_norm": 1.4372383743256916,
+      "learning_rate": 4.711461811160839e-06,
+      "loss": 0.5852,
+      "step": 220
+    },
+    {
+      "epoch": 0.6218316998986144,
+      "grad_norm": 1.6374054025057205,
+      "learning_rate": 4.676021316805291e-06,
+      "loss": 0.5864,
+      "step": 230
+    },
+    {
+      "epoch": 0.6488678607637716,
+      "grad_norm": 1.3705672684359966,
+      "learning_rate": 4.638681364904967e-06,
+      "loss": 0.5828,
+      "step": 240
+    },
+    {
+      "epoch": 0.6759040216289287,
+      "grad_norm": 1.4497378255708682,
+      "learning_rate": 4.599475316204093e-06,
+      "loss": 0.576,
+      "step": 250
+    },
+    {
+      "epoch": 0.7029401824940859,
+      "grad_norm": 1.4901529664738118,
+      "learning_rate": 4.558438198679298e-06,
+      "loss": 0.5789,
+      "step": 260
+    },
+    {
+      "epoch": 0.729976343359243,
+      "grad_norm": 1.509487695379131,
+      "learning_rate": 4.515606676244479e-06,
+      "loss": 0.5823,
+      "step": 270
+    },
+    {
+      "epoch": 0.7570125042244001,
+      "grad_norm": 1.4142815759753038,
+      "learning_rate": 4.471019015994042e-06,
+      "loss": 0.5763,
+      "step": 280
+    },
+    {
+      "epoch": 0.7840486650895573,
+      "grad_norm": 1.391721023396763,
+      "learning_rate": 4.424715054013825e-06,
+      "loss": 0.5763,
+      "step": 290
+    },
+    {
+      "epoch": 0.8110848259547144,
+      "grad_norm": 2.036811255977957,
+      "learning_rate": 4.376736159790221e-06,
+      "loss": 0.5768,
+      "step": 300
+    },
+    {
+      "epoch": 0.8381209868198716,
+      "grad_norm": 1.8743584001202067,
+      "learning_rate": 4.327125199249313e-06,
+      "loss": 0.5742,
+      "step": 310
+    },
+    {
+      "epoch": 0.8651571476850287,
+      "grad_norm": 1.7592149703550843,
+      "learning_rate": 4.275926496459052e-06,
+      "loss": 0.5721,
+      "step": 320
+    },
+    {
+      "epoch": 0.8921933085501859,
+      "grad_norm": 1.1705725093085595,
+      "learning_rate": 4.223185794028659e-06,
+      "loss": 0.5711,
+      "step": 330
+    },
+    {
+      "epoch": 0.919229469415343,
+      "grad_norm": 1.9309874457184006,
+      "learning_rate": 4.168950212240682e-06,
+      "loss": 0.5782,
+      "step": 340
+    },
+    {
+      "epoch": 0.9462656302805001,
+      "grad_norm": 2.440179989180274,
+      "learning_rate": 4.113268206952177e-06,
+      "loss": 0.57,
+      "step": 350
+    },
+    {
+      "epoch": 0.9733017911456573,
+      "grad_norm": 1.7341492741288718,
+      "learning_rate": 4.056189526302645e-06,
+      "loss": 0.5697,
+      "step": 360
+    },
+    {
+      "epoch": 0.9976343359242987,
+      "eval_loss": 0.07097312808036804,
+      "eval_runtime": 379.122,
+      "eval_samples_per_second": 26.284,
+      "eval_steps_per_second": 0.411,
+      "step": 369
+    },
+    {
+      "epoch": 1.0023656640757013,
+      "grad_norm": 3.541884342240149,
+      "learning_rate": 3.997765166267408e-06,
+      "loss": 0.5655,
+      "step": 370
+    },
+    {
+      "epoch": 1.0294018249408583,
+      "grad_norm": 2.144287711298856,
+      "learning_rate": 3.9380473250961195e-06,
+      "loss": 0.4883,
+      "step": 380
+    },
+    {
+      "epoch": 1.0564379858060156,
+      "grad_norm": 1.6382074544301437,
+      "learning_rate": 3.877089356677135e-06,
+      "loss": 0.4839,
+      "step": 390
+    },
+    {
+      "epoch": 1.0834741466711728,
+      "grad_norm": 1.5188972286145213,
+      "learning_rate": 3.814945722869378e-06,
+      "loss": 0.4853,
+      "step": 400
+    },
+    {
+      "epoch": 1.1105103075363298,
+      "grad_norm": 1.3764586999097552,
+      "learning_rate": 3.7516719448443244e-06,
+      "loss": 0.485,
+      "step": 410
+    },
+    {
+      "epoch": 1.1375464684014869,
+      "grad_norm": 1.3804469321724957,
+      "learning_rate": 3.6873245534815626e-06,
+      "loss": 0.4873,
+      "step": 420
+    },
+    {
+      "epoch": 1.1645826292666441,
+      "grad_norm": 1.4074948363242845,
+      "learning_rate": 3.621961038862231e-06,
+      "loss": 0.495,
+      "step": 430
+    },
+    {
+      "epoch": 1.1916187901318014,
+      "grad_norm": 1.3672614070984725,
+      "learning_rate": 3.5556397989054944e-06,
+      "loss": 0.4861,
+      "step": 440
+    },
+    {
+      "epoch": 1.2186549509969584,
+      "grad_norm": 1.5256951985176181,
+      "learning_rate": 3.4884200871939088e-06,
+      "loss": 0.4904,
+      "step": 450
+    },
+    {
+      "epoch": 1.2456911118621157,
+      "grad_norm": 1.8390241289827158,
+      "learning_rate": 3.4203619600343226e-06,
+      "loss": 0.4923,
+      "step": 460
+    },
+    {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 1.4516571737107622,
+      "learning_rate": 3.35152622280159e-06,
+      "loss": 0.4932,
+      "step": 470
+    },
+    {
+      "epoch": 1.29976343359243,
+      "grad_norm": 1.6389751646228208,
+      "learning_rate": 3.281974375613045e-06,
+      "loss": 0.4882,
+      "step": 480
+    },
+    {
+      "epoch": 1.326799594457587,
+      "grad_norm": 1.4400847542447044,
+      "learning_rate": 3.2117685583822672e-06,
+      "loss": 0.4877,
+      "step": 490
+    },
+    {
+      "epoch": 1.3538357553227442,
+      "grad_norm": 1.3579190420582792,
+      "learning_rate": 3.1409714953012355e-06,
+      "loss": 0.4897,
+      "step": 500
+    },
+    {
+      "epoch": 1.3808719161879013,
+      "grad_norm": 1.9092857680970101,
+      "learning_rate": 3.069646438800466e-06,
+      "loss": 0.4905,
+      "step": 510
+    },
+    {
+      "epoch": 1.4079080770530585,
+      "grad_norm": 1.3055916098302385,
+      "learning_rate": 2.9978571130372045e-06,
+      "loss": 0.4928,
+      "step": 520
+    },
+    {
+      "epoch": 1.4349442379182156,
+      "grad_norm": 1.4382290860035074,
+      "learning_rate": 2.925667656962165e-06,
+      "loss": 0.485,
+      "step": 530
+    },
+    {
+      "epoch": 1.4619803987833728,
+      "grad_norm": 1.522057711526301,
+      "learning_rate": 2.8531425670156767e-06,
+      "loss": 0.4897,
+      "step": 540
+    },
+    {
+      "epoch": 1.4890165596485299,
+      "grad_norm": 1.4339512710367517,
+      "learning_rate": 2.7803466395044337e-06,
+      "loss": 0.4876,
+      "step": 550
+    },
+    {
+      "epoch": 1.5160527205136871,
+      "grad_norm": 1.3073643288950498,
+      "learning_rate": 2.707344912710342e-06,
+      "loss": 0.4891,
+      "step": 560
+    },
+    {
+      "epoch": 1.5430888813788441,
+      "grad_norm": 1.369774353076571,
+      "learning_rate": 2.634202608783171e-06,
+      "loss": 0.4845,
+      "step": 570
+    },
+    {
+      "epoch": 1.5701250422440014,
+      "grad_norm": 1.2255590052602647,
+      "learning_rate": 2.5609850754689393e-06,
+      "loss": 0.486,
+      "step": 580
+    },
+    {
+      "epoch": 1.5971612031091587,
+      "grad_norm": 1.282149554684781,
+      "learning_rate": 2.4877577277260773e-06,
+      "loss": 0.489,
+      "step": 590
+    },
+    {
+      "epoch": 1.6241973639743157,
+      "grad_norm": 1.2721046491548977,
+      "learning_rate": 2.4145859892815512e-06,
+      "loss": 0.4858,
+      "step": 600
+    },
+    {
+      "epoch": 1.6512335248394727,
+      "grad_norm": 1.3827702644219955,
+      "learning_rate": 2.3415352341791467e-06,
+      "loss": 0.4837,
+      "step": 610
+    },
+    {
+      "epoch": 1.67826968570463,
+      "grad_norm": 1.327575194327116,
+      "learning_rate": 2.2686707283721406e-06,
+      "loss": 0.4877,
+      "step": 620
+    },
+    {
+      "epoch": 1.7053058465697872,
+      "grad_norm": 1.3568828598993092,
+      "learning_rate": 2.19605757141255e-06,
+      "loss": 0.4926,
+      "step": 630
+    },
+    {
+      "epoch": 1.7323420074349443,
+      "grad_norm": 1.2509722338502092,
+      "learning_rate": 2.123760638289043e-06,
+      "loss": 0.4893,
+      "step": 640
+    },
+    {
+      "epoch": 1.7593781683001013,
+      "grad_norm": 1.621567247889154,
+      "learning_rate": 2.0518445214654866e-06,
+      "loss": 0.487,
+      "step": 650
+    },
+    {
+      "epoch": 1.7864143291652586,
+      "grad_norm": 1.269029986877882,
+      "learning_rate": 1.980373473171907e-06,
+      "loss": 0.4835,
+      "step": 660
+    },
+    {
+      "epoch": 1.8134504900304158,
+      "grad_norm": 1.1955677021994464,
+      "learning_rate": 1.9094113479994346e-06,
+      "loss": 0.4849,
+      "step": 670
+    },
+    {
+      "epoch": 1.8404866508955728,
+      "grad_norm": 1.2345433836895534,
+      "learning_rate": 1.839021545850499e-06,
+      "loss": 0.4875,
+      "step": 680
+    },
+    {
+      "epoch": 1.8675228117607299,
+      "grad_norm": 1.2564382135038328,
+      "learning_rate": 1.7692669552952723e-06,
+      "loss": 0.4858,
+      "step": 690
+    },
+    {
+      "epoch": 1.8945589726258871,
+      "grad_norm": 1.256899424662725,
+      "learning_rate": 1.7002098973849416e-06,
+      "loss": 0.4872,
+      "step": 700
+    },
+    {
+      "epoch": 1.9215951334910444,
+      "grad_norm": 1.357653334253869,
+      "learning_rate": 1.631912069972033e-06,
+      "loss": 0.4916,
+      "step": 710
+    },
+    {
+      "epoch": 1.9486312943562014,
+      "grad_norm": 1.2314684983309747,
+      "learning_rate": 1.5644344925875097e-06,
+      "loss": 0.4836,
+      "step": 720
+    },
+    {
+      "epoch": 1.9756674552213584,
+      "grad_norm": 1.1478454732004184,
+      "learning_rate": 1.4978374519239131e-06,
+      "loss": 0.4788,
+      "step": 730
+    },
+    {
+      "epoch": 1.9972963839134843,
+      "eval_loss": 0.07031858712434769,
+      "eval_runtime": 378.0397,
+      "eval_samples_per_second": 26.36,
+      "eval_steps_per_second": 0.413,
+      "step": 738
+    },
+    {
+      "epoch": 2.0047313281514025,
+      "grad_norm": 3.2511739123964976,
+      "learning_rate": 1.432180447973249e-06,
+      "loss": 0.4716,
+      "step": 740
+    },
+    {
+      "epoch": 2.0317674890165596,
+      "grad_norm": 1.722212954142693,
+      "learning_rate": 1.3675221408677241e-06,
+      "loss": 0.4027,
+      "step": 750
+    },
+    {
+      "epoch": 2.0588036498817166,
+      "grad_norm": 1.5303304387352163,
+      "learning_rate": 1.303920298470856e-06,
+      "loss": 0.4068,
+      "step": 760
+    },
+    {
+      "epoch": 2.085839810746874,
+      "grad_norm": 1.3702186369869054,
+      "learning_rate": 1.2414317447657564e-06,
+      "loss": 0.4028,
+      "step": 770
+    },
+    {
+      "epoch": 2.112875971612031,
+      "grad_norm": 1.3334591253881896,
+      "learning_rate": 1.1801123090867129e-06,
+      "loss": 0.4024,
+      "step": 780
+    },
+    {
+      "epoch": 2.139912132477188,
+      "grad_norm": 1.370776472523979,
+      "learning_rate": 1.1200167762394187e-06,
+      "loss": 0.4016,
+      "step": 790
+    },
+    {
+      "epoch": 2.1669482933423456,
+      "grad_norm": 1.3101835210391692,
+      "learning_rate": 1.0611988375544214e-06,
+      "loss": 0.4072,
+      "step": 800
+    },
+    {
+      "epoch": 2.1939844542075027,
+      "grad_norm": 1.2890483237347024,
+      "learning_rate": 1.0037110429175214e-06,
+      "loss": 0.4043,
+      "step": 810
+    },
+    {
+      "epoch": 2.2210206150726597,
+      "grad_norm": 1.2394879407914554,
+      "learning_rate": 9.47604753819966e-07,
+      "loss": 0.4049,
+      "step": 820
+    },
+    {
+      "epoch": 2.2480567759378167,
+      "grad_norm": 1.2626807074989126,
+      "learning_rate": 8.929300974704016e-07,
+      "loss": 0.407,
+      "step": 830
+    },
+    {
+      "epoch": 2.2750929368029738,
+      "grad_norm": 1.2526670795727008,
+      "learning_rate": 8.397359220095739e-07,
+      "loss": 0.407,
+      "step": 840
+    },
+    {
+      "epoch": 2.3021290976681312,
+      "grad_norm": 1.2912703090742628,
+      "learning_rate": 7.88069752867787e-07,
+      "loss": 0.4012,
+      "step": 850
+    },
+    {
+      "epoch": 2.3291652585332883,
+      "grad_norm": 1.2248414426266379,
+      "learning_rate": 7.379777503041166e-07,
+      "loss": 0.4042,
+      "step": 860
+    },
+    {
+      "epoch": 2.3562014193984453,
+      "grad_norm": 1.245734138527781,
+      "learning_rate": 6.89504668165316e-07,
+      "loss": 0.4083,
+      "step": 870
+    },
+    {
+      "epoch": 2.3832375802636028,
+      "grad_norm": 1.2703147157094865,
+      "learning_rate": 6.426938139012476e-07,
+      "loss": 0.4068,
+      "step": 880
+    },
+    {
+      "epoch": 2.41027374112876,
+      "grad_norm": 1.26460233866951,
+      "learning_rate": 5.975870098725839e-07,
+      "loss": 0.4052,
+      "step": 890
+    },
+    {
+      "epoch": 2.437309901993917,
+      "grad_norm": 1.203035775818215,
+      "learning_rate": 5.542245559853314e-07,
+      "loss": 0.4043,
+      "step": 900
+    },
+    {
+      "epoch": 2.464346062859074,
+      "grad_norm": 1.2801464316323534,
+      "learning_rate": 5.12645193685563e-07,
+      "loss": 0.4026,
+      "step": 910
+    },
+    {
+      "epoch": 2.4913822237242313,
+      "grad_norm": 1.207211407952489,
+      "learning_rate": 4.728860713465348e-07,
+      "loss": 0.4018,
+      "step": 920
+    },
+    {
+      "epoch": 2.5184183845893884,
+      "grad_norm": 1.2978323617108884,
+      "learning_rate": 4.3498271107910423e-07,
+      "loss": 0.4006,
+      "step": 930
+    },
+    {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 1.2419638574449388,
+      "learning_rate": 3.989689769951037e-07,
+      "loss": 0.4072,
+      "step": 940
+    },
+    {
+      "epoch": 2.5724907063197024,
+      "grad_norm": 1.2184809628304476,
+      "learning_rate": 3.6487704495203003e-07,
+      "loss": 0.4012,
+      "step": 950
+    },
+    {
+      "epoch": 2.59952686718486,
+      "grad_norm": 1.2408761979548522,
+      "learning_rate": 3.3273737380606715e-07,
+      "loss": 0.407,
+      "step": 960
+    },
+    {
+      "epoch": 2.626563028050017,
+      "grad_norm": 1.2318951856879983,
+      "learning_rate": 3.02578678199144e-07,
+      "loss": 0.4042,
+      "step": 970
+    },
+    {
+      "epoch": 2.653599188915174,
+      "grad_norm": 1.2341509325385753,
+      "learning_rate": 2.74427902904325e-07,
+      "loss": 0.4074,
+      "step": 980
+    },
+    {
+      "epoch": 2.6806353497803315,
+      "grad_norm": 1.2021221404178184,
+      "learning_rate": 2.483101987524612e-07,
+      "loss": 0.4048,
+      "step": 990
+    },
+    {
+      "epoch": 2.7076715106454885,
+      "grad_norm": 1.2738693365179738,
+      "learning_rate": 2.2424890016160911e-07,
+      "loss": 0.3986,
+      "step": 1000
+    },
+    {
+      "epoch": 2.7347076715106455,
+      "grad_norm": 1.2692209511322345,
+      "learning_rate": 2.022655042892919e-07,
+      "loss": 0.4058,
+      "step": 1010
+    },
+    {
+      "epoch": 2.7617438323758026,
+      "grad_norm": 1.1771824503326054,
+      "learning_rate": 1.823796518262309e-07,
+      "loss": 0.4071,
+      "step": 1020
+    },
+    {
+      "epoch": 2.7887799932409596,
+      "grad_norm": 1.223679689500554,
+      "learning_rate": 1.6460910944870367e-07,
+      "loss": 0.4053,
+      "step": 1030
+    },
+    {
+      "epoch": 2.815816154106117,
+      "grad_norm": 1.1639416605117765,
+      "learning_rate": 1.489697539452106e-07,
+      "loss": 0.403,
+      "step": 1040
+    },
+    {
+      "epoch": 2.842852314971274,
+      "grad_norm": 1.191061950274548,
+      "learning_rate": 1.354755580316274e-07,
+      "loss": 0.4043,
+      "step": 1050
+    },
+    {
+      "epoch": 2.869888475836431,
+      "grad_norm": 1.1905738035063456,
+      "learning_rate": 1.2413857786752143e-07,
+      "loss": 0.399,
+      "step": 1060
+    },
+    {
+      "epoch": 2.8969246367015886,
+      "grad_norm": 1.1956160773000035,
+      "learning_rate": 1.1496894228478066e-07,
+      "loss": 0.4072,
+      "step": 1070
+    },
+    {
+      "epoch": 2.9239607975667457,
+      "grad_norm": 1.2066740723044425,
+      "learning_rate": 1.0797484373818065e-07,
+      "loss": 0.4014,
+      "step": 1080
+    },
+    {
+      "epoch": 2.9509969584319027,
+      "grad_norm": 1.1778515542586203,
+      "learning_rate": 1.0316253098597634e-07,
+      "loss": 0.4045,
+      "step": 1090
+    },
+    {
+      "epoch": 2.9780331192970597,
+      "grad_norm": 1.1602330819105369,
+      "learning_rate": 1.0053630350705497e-07,
+      "loss": 0.4034,
+      "step": 1100
+    },
+    {
+      "epoch": 2.99695843190267,
+      "eval_loss": 0.07343952357769012,
+      "eval_runtime": 380.9855,
+      "eval_samples_per_second": 26.156,
+      "eval_steps_per_second": 0.409,
+      "step": 1107
+    },
+    {
+      "epoch": 2.99695843190267,
+      "step": 1107,
+      "total_flos": 1854056851046400.0,
+      "train_loss": 0.4996785878489011,
+      "train_runtime": 63355.7173,
+      "train_samples_per_second": 8.965,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1107,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1854056851046400.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed