End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +927 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: oh_v1_w_v3_evol_instruct
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # oh_v1_w_v3_evol_instruct
-This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5306

 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh_v1_w_v3_evol_instruct
 # oh_v1_w_v3_evol_instruct
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/oh_v1_w_v3_evol_instruct dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5306

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.994548758328286,
+    "eval_loss": 0.530586838722229,
+    "eval_runtime": 221.2267,
+    "eval_samples_per_second": 50.27,
+    "eval_steps_per_second": 0.393,
+    "total_flos": 2069927276052480.0,
+    "train_loss": 0.5379253575716976,
+    "train_runtime": 37041.7776,
+    "train_samples_per_second": 17.113,
+    "train_steps_per_second": 0.033
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.994548758328286,
+    "eval_loss": 0.530586838722229,
+    "eval_runtime": 221.2267,
+    "eval_samples_per_second": 50.27,
+    "eval_steps_per_second": 0.393
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.994548758328286,
+    "total_flos": 2069927276052480.0,
+    "train_loss": 0.5379253575716976,
+    "train_runtime": 37041.7776,
+    "train_samples_per_second": 17.113,
+    "train_steps_per_second": 0.033
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,927 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.994548758328286,
+  "eval_steps": 500,
+  "global_step": 1236,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.024227740763173834,
+      "grad_norm": 3.9645989545939098,
+      "learning_rate": 5e-06,
+      "loss": 0.9024,
+      "step": 10
+    },
+    {
+      "epoch": 0.04845548152634767,
+      "grad_norm": 2.6091656491728203,
+      "learning_rate": 5e-06,
+      "loss": 0.7644,
+      "step": 20
+    },
+    {
+      "epoch": 0.0726832222895215,
+      "grad_norm": 1.6703261434511345,
+      "learning_rate": 5e-06,
+      "loss": 0.7238,
+      "step": 30
+    },
+    {
+      "epoch": 0.09691096305269534,
+      "grad_norm": 1.461094134640432,
+      "learning_rate": 5e-06,
+      "loss": 0.6977,
+      "step": 40
+    },
+    {
+      "epoch": 0.12113870381586916,
+      "grad_norm": 0.8067370661660782,
+      "learning_rate": 5e-06,
+      "loss": 0.6812,
+      "step": 50
+    },
+    {
+      "epoch": 0.145366444579043,
+      "grad_norm": 1.4104496160894404,
+      "learning_rate": 5e-06,
+      "loss": 0.6629,
+      "step": 60
+    },
+    {
+      "epoch": 0.16959418534221685,
+      "grad_norm": 1.2867495210279274,
+      "learning_rate": 5e-06,
+      "loss": 0.6521,
+      "step": 70
+    },
+    {
+      "epoch": 0.19382192610539067,
+      "grad_norm": 0.9866240288945327,
+      "learning_rate": 5e-06,
+      "loss": 0.6341,
+      "step": 80
+    },
+    {
+      "epoch": 0.2180496668685645,
+      "grad_norm": 0.7803813977069882,
+      "learning_rate": 5e-06,
+      "loss": 0.6285,
+      "step": 90
+    },
+    {
+      "epoch": 0.24227740763173833,
+      "grad_norm": 0.5198909215148648,
+      "learning_rate": 5e-06,
+      "loss": 0.6244,
+      "step": 100
+    },
+    {
+      "epoch": 0.2665051483949122,
+      "grad_norm": 0.5514332756156626,
+      "learning_rate": 5e-06,
+      "loss": 0.6184,
+      "step": 110
+    },
+    {
+      "epoch": 0.290732889158086,
+      "grad_norm": 1.2655246772691944,
+      "learning_rate": 5e-06,
+      "loss": 0.6167,
+      "step": 120
+    },
+    {
+      "epoch": 0.31496062992125984,
+      "grad_norm": 0.5276778250168523,
+      "learning_rate": 5e-06,
+      "loss": 0.6077,
+      "step": 130
+    },
+    {
+      "epoch": 0.3391883706844337,
+      "grad_norm": 0.5180368157026216,
+      "learning_rate": 5e-06,
+      "loss": 0.6025,
+      "step": 140
+    },
+    {
+      "epoch": 0.3634161114476075,
+      "grad_norm": 0.5457057952002654,
+      "learning_rate": 5e-06,
+      "loss": 0.6051,
+      "step": 150
+    },
+    {
+      "epoch": 0.38764385221078135,
+      "grad_norm": 0.6644602782199635,
+      "learning_rate": 5e-06,
+      "loss": 0.6054,
+      "step": 160
+    },
+    {
+      "epoch": 0.4118715929739552,
+      "grad_norm": 0.5187189241493725,
+      "learning_rate": 5e-06,
+      "loss": 0.6004,
+      "step": 170
+    },
+    {
+      "epoch": 0.436099333737129,
+      "grad_norm": 0.5056757936675704,
+      "learning_rate": 5e-06,
+      "loss": 0.5938,
+      "step": 180
+    },
+    {
+      "epoch": 0.46032707450030286,
+      "grad_norm": 0.5710728558545756,
+      "learning_rate": 5e-06,
+      "loss": 0.598,
+      "step": 190
+    },
+    {
+      "epoch": 0.48455481526347666,
+      "grad_norm": 0.5927928756195961,
+      "learning_rate": 5e-06,
+      "loss": 0.5983,
+      "step": 200
+    },
+    {
+      "epoch": 0.5087825560266506,
+      "grad_norm": 0.48351208074872803,
+      "learning_rate": 5e-06,
+      "loss": 0.59,
+      "step": 210
+    },
+    {
+      "epoch": 0.5330102967898244,
+      "grad_norm": 0.5309314168696151,
+      "learning_rate": 5e-06,
+      "loss": 0.5882,
+      "step": 220
+    },
+    {
+      "epoch": 0.5572380375529982,
+      "grad_norm": 0.5313229865997172,
+      "learning_rate": 5e-06,
+      "loss": 0.5926,
+      "step": 230
+    },
+    {
+      "epoch": 0.581465778316172,
+      "grad_norm": 0.5147829777258456,
+      "learning_rate": 5e-06,
+      "loss": 0.5851,
+      "step": 240
+    },
+    {
+      "epoch": 0.6056935190793459,
+      "grad_norm": 0.8359724219646064,
+      "learning_rate": 5e-06,
+      "loss": 0.5863,
+      "step": 250
+    },
+    {
+      "epoch": 0.6299212598425197,
+      "grad_norm": 0.4652543495079898,
+      "learning_rate": 5e-06,
+      "loss": 0.5813,
+      "step": 260
+    },
+    {
+      "epoch": 0.6541490006056935,
+      "grad_norm": 0.9372041174809007,
+      "learning_rate": 5e-06,
+      "loss": 0.5794,
+      "step": 270
+    },
+    {
+      "epoch": 0.6783767413688674,
+      "grad_norm": 0.5998442061578241,
+      "learning_rate": 5e-06,
+      "loss": 0.5828,
+      "step": 280
+    },
+    {
+      "epoch": 0.7026044821320412,
+      "grad_norm": 0.6092016010381666,
+      "learning_rate": 5e-06,
+      "loss": 0.5777,
+      "step": 290
+    },
+    {
+      "epoch": 0.726832222895215,
+      "grad_norm": 0.5895950017891558,
+      "learning_rate": 5e-06,
+      "loss": 0.5753,
+      "step": 300
+    },
+    {
+      "epoch": 0.7510599636583889,
+      "grad_norm": 0.45074902625510205,
+      "learning_rate": 5e-06,
+      "loss": 0.569,
+      "step": 310
+    },
+    {
+      "epoch": 0.7752877044215627,
+      "grad_norm": 0.6565760889840895,
+      "learning_rate": 5e-06,
+      "loss": 0.578,
+      "step": 320
+    },
+    {
+      "epoch": 0.7995154451847365,
+      "grad_norm": 0.6113007537505482,
+      "learning_rate": 5e-06,
+      "loss": 0.5792,
+      "step": 330
+    },
+    {
+      "epoch": 0.8237431859479104,
+      "grad_norm": 0.49731635826965837,
+      "learning_rate": 5e-06,
+      "loss": 0.5811,
+      "step": 340
+    },
+    {
+      "epoch": 0.8479709267110842,
+      "grad_norm": 0.5433055892052826,
+      "learning_rate": 5e-06,
+      "loss": 0.5727,
+      "step": 350
+    },
+    {
+      "epoch": 0.872198667474258,
+      "grad_norm": 0.5791498183091245,
+      "learning_rate": 5e-06,
+      "loss": 0.5722,
+      "step": 360
+    },
+    {
+      "epoch": 0.8964264082374318,
+      "grad_norm": 0.6563753601996463,
+      "learning_rate": 5e-06,
+      "loss": 0.5716,
+      "step": 370
+    },
+    {
+      "epoch": 0.9206541490006057,
+      "grad_norm": 0.5662298833739237,
+      "learning_rate": 5e-06,
+      "loss": 0.5673,
+      "step": 380
+    },
+    {
+      "epoch": 0.9448818897637795,
+      "grad_norm": 0.5441406943018641,
+      "learning_rate": 5e-06,
+      "loss": 0.5708,
+      "step": 390
+    },
+    {
+      "epoch": 0.9691096305269533,
+      "grad_norm": 0.6030868477920374,
+      "learning_rate": 5e-06,
+      "loss": 0.5707,
+      "step": 400
+    },
+    {
+      "epoch": 0.9933373712901272,
+      "grad_norm": 0.6009876519163959,
+      "learning_rate": 5e-06,
+      "loss": 0.5624,
+      "step": 410
+    },
+    {
+      "epoch": 0.9981829194427619,
+      "eval_loss": 0.5625064373016357,
+      "eval_runtime": 220.8608,
+      "eval_samples_per_second": 50.353,
+      "eval_steps_per_second": 0.394,
+      "step": 412
+    },
+    {
+      "epoch": 1.0175651120533011,
+      "grad_norm": 0.7010348009420115,
+      "learning_rate": 5e-06,
+      "loss": 0.542,
+      "step": 420
+    },
+    {
+      "epoch": 1.0417928528164748,
+      "grad_norm": 0.48384852790918653,
+      "learning_rate": 5e-06,
+      "loss": 0.5311,
+      "step": 430
+    },
+    {
+      "epoch": 1.0660205935796487,
+      "grad_norm": 0.7302174149962929,
+      "learning_rate": 5e-06,
+      "loss": 0.5346,
+      "step": 440
+    },
+    {
+      "epoch": 1.0902483343428226,
+      "grad_norm": 0.6089180089825021,
+      "learning_rate": 5e-06,
+      "loss": 0.5341,
+      "step": 450
+    },
+    {
+      "epoch": 1.1144760751059963,
+      "grad_norm": 0.6370901566175011,
+      "learning_rate": 5e-06,
+      "loss": 0.5278,
+      "step": 460
+    },
+    {
+      "epoch": 1.1387038158691702,
+      "grad_norm": 1.0810998793516953,
+      "learning_rate": 5e-06,
+      "loss": 0.5282,
+      "step": 470
+    },
+    {
+      "epoch": 1.1629315566323442,
+      "grad_norm": 0.5546290586428997,
+      "learning_rate": 5e-06,
+      "loss": 0.5294,
+      "step": 480
+    },
+    {
+      "epoch": 1.1871592973955178,
+      "grad_norm": 0.5393162492751344,
+      "learning_rate": 5e-06,
+      "loss": 0.5312,
+      "step": 490
+    },
+    {
+      "epoch": 1.2113870381586918,
+      "grad_norm": 0.539293775840461,
+      "learning_rate": 5e-06,
+      "loss": 0.5327,
+      "step": 500
+    },
+    {
+      "epoch": 1.2356147789218654,
+      "grad_norm": 0.5510862357568564,
+      "learning_rate": 5e-06,
+      "loss": 0.5292,
+      "step": 510
+    },
+    {
+      "epoch": 1.2598425196850394,
+      "grad_norm": 0.7420323514260083,
+      "learning_rate": 5e-06,
+      "loss": 0.5282,
+      "step": 520
+    },
+    {
+      "epoch": 1.2840702604482133,
+      "grad_norm": 0.7139637253621622,
+      "learning_rate": 5e-06,
+      "loss": 0.5305,
+      "step": 530
+    },
+    {
+      "epoch": 1.3082980012113872,
+      "grad_norm": 0.5530723883454477,
+      "learning_rate": 5e-06,
+      "loss": 0.5262,
+      "step": 540
+    },
+    {
+      "epoch": 1.3325257419745609,
+      "grad_norm": 0.5575001199886374,
+      "learning_rate": 5e-06,
+      "loss": 0.5219,
+      "step": 550
+    },
+    {
+      "epoch": 1.3567534827377348,
+      "grad_norm": 0.5598952872055329,
+      "learning_rate": 5e-06,
+      "loss": 0.5238,
+      "step": 560
+    },
+    {
+      "epoch": 1.3809812235009085,
+      "grad_norm": 0.541421977102063,
+      "learning_rate": 5e-06,
+      "loss": 0.5277,
+      "step": 570
+    },
+    {
+      "epoch": 1.4052089642640824,
+      "grad_norm": 0.4779799353516479,
+      "learning_rate": 5e-06,
+      "loss": 0.5233,
+      "step": 580
+    },
+    {
+      "epoch": 1.4294367050272563,
+      "grad_norm": 0.4967009823362501,
+      "learning_rate": 5e-06,
+      "loss": 0.5259,
+      "step": 590
+    },
+    {
+      "epoch": 1.45366444579043,
+      "grad_norm": 0.671567804923544,
+      "learning_rate": 5e-06,
+      "loss": 0.52,
+      "step": 600
+    },
+    {
+      "epoch": 1.4778921865536039,
+      "grad_norm": 0.5142557306648593,
+      "learning_rate": 5e-06,
+      "loss": 0.5219,
+      "step": 610
+    },
+    {
+      "epoch": 1.5021199273167776,
+      "grad_norm": 0.48133107485569593,
+      "learning_rate": 5e-06,
+      "loss": 0.5227,
+      "step": 620
+    },
+    {
+      "epoch": 1.5263476680799517,
+      "grad_norm": 0.4824602880697514,
+      "learning_rate": 5e-06,
+      "loss": 0.5254,
+      "step": 630
+    },
+    {
+      "epoch": 1.5505754088431254,
+      "grad_norm": 0.5818160150007355,
+      "learning_rate": 5e-06,
+      "loss": 0.5167,
+      "step": 640
+    },
+    {
+      "epoch": 1.574803149606299,
+      "grad_norm": 0.5210634892581837,
+      "learning_rate": 5e-06,
+      "loss": 0.5188,
+      "step": 650
+    },
+    {
+      "epoch": 1.5990308903694732,
+      "grad_norm": 0.5785709705676149,
+      "learning_rate": 5e-06,
+      "loss": 0.5247,
+      "step": 660
+    },
+    {
+      "epoch": 1.623258631132647,
+      "grad_norm": 0.49452413290015934,
+      "learning_rate": 5e-06,
+      "loss": 0.5163,
+      "step": 670
+    },
+    {
+      "epoch": 1.6474863718958206,
+      "grad_norm": 0.548802848702529,
+      "learning_rate": 5e-06,
+      "loss": 0.5156,
+      "step": 680
+    },
+    {
+      "epoch": 1.6717141126589945,
+      "grad_norm": 0.49091799711876205,
+      "learning_rate": 5e-06,
+      "loss": 0.519,
+      "step": 690
+    },
+    {
+      "epoch": 1.6959418534221684,
+      "grad_norm": 0.49801605722138825,
+      "learning_rate": 5e-06,
+      "loss": 0.5168,
+      "step": 700
+    },
+    {
+      "epoch": 1.720169594185342,
+      "grad_norm": 0.5325592423676614,
+      "learning_rate": 5e-06,
+      "loss": 0.5159,
+      "step": 710
+    },
+    {
+      "epoch": 1.744397334948516,
+      "grad_norm": 0.5638908149569352,
+      "learning_rate": 5e-06,
+      "loss": 0.5178,
+      "step": 720
+    },
+    {
+      "epoch": 1.76862507571169,
+      "grad_norm": 0.5592361495425977,
+      "learning_rate": 5e-06,
+      "loss": 0.5201,
+      "step": 730
+    },
+    {
+      "epoch": 1.7928528164748636,
+      "grad_norm": 0.6123941159516728,
+      "learning_rate": 5e-06,
+      "loss": 0.5158,
+      "step": 740
+    },
+    {
+      "epoch": 1.8170805572380375,
+      "grad_norm": 0.6140218865131343,
+      "learning_rate": 5e-06,
+      "loss": 0.5212,
+      "step": 750
+    },
+    {
+      "epoch": 1.8413082980012114,
+      "grad_norm": 0.5120361752286156,
+      "learning_rate": 5e-06,
+      "loss": 0.5165,
+      "step": 760
+    },
+    {
+      "epoch": 1.8655360387643851,
+      "grad_norm": 0.6257809652633152,
+      "learning_rate": 5e-06,
+      "loss": 0.5154,
+      "step": 770
+    },
+    {
+      "epoch": 1.889763779527559,
+      "grad_norm": 0.5517067441775361,
+      "learning_rate": 5e-06,
+      "loss": 0.5149,
+      "step": 780
+    },
+    {
+      "epoch": 1.913991520290733,
+      "grad_norm": 0.6184341957543669,
+      "learning_rate": 5e-06,
+      "loss": 0.5161,
+      "step": 790
+    },
+    {
+      "epoch": 1.9382192610539066,
+      "grad_norm": 0.45820885449224846,
+      "learning_rate": 5e-06,
+      "loss": 0.5163,
+      "step": 800
+    },
+    {
+      "epoch": 1.9624470018170805,
+      "grad_norm": 0.47230402419753703,
+      "learning_rate": 5e-06,
+      "loss": 0.5097,
+      "step": 810
+    },
+    {
+      "epoch": 1.9866747425802544,
+      "grad_norm": 0.5117535404655704,
+      "learning_rate": 5e-06,
+      "loss": 0.5104,
+      "step": 820
+    },
+    {
+      "epoch": 1.9987886129618413,
+      "eval_loss": 0.5373826026916504,
+      "eval_runtime": 221.5646,
+      "eval_samples_per_second": 50.193,
+      "eval_steps_per_second": 0.393,
+      "step": 825
+    },
+    {
+      "epoch": 2.010902483343428,
+      "grad_norm": 0.7236309822644943,
+      "learning_rate": 5e-06,
+      "loss": 0.4992,
+      "step": 830
+    },
+    {
+      "epoch": 2.0351302241066023,
+      "grad_norm": 0.51924831256788,
+      "learning_rate": 5e-06,
+      "loss": 0.4661,
+      "step": 840
+    },
+    {
+      "epoch": 2.059357964869776,
+      "grad_norm": 0.6287431376712824,
+      "learning_rate": 5e-06,
+      "loss": 0.4782,
+      "step": 850
+    },
+    {
+      "epoch": 2.0835857056329496,
+      "grad_norm": 0.5560237407779974,
+      "learning_rate": 5e-06,
+      "loss": 0.4752,
+      "step": 860
+    },
+    {
+      "epoch": 2.107813446396124,
+      "grad_norm": 0.4951419799289088,
+      "learning_rate": 5e-06,
+      "loss": 0.4783,
+      "step": 870
+    },
+    {
+      "epoch": 2.1320411871592975,
+      "grad_norm": 0.6200927154763336,
+      "learning_rate": 5e-06,
+      "loss": 0.4744,
+      "step": 880
+    },
+    {
+      "epoch": 2.156268927922471,
+      "grad_norm": 0.5434006657342179,
+      "learning_rate": 5e-06,
+      "loss": 0.4805,
+      "step": 890
+    },
+    {
+      "epoch": 2.1804966686856453,
+      "grad_norm": 0.5223425161148432,
+      "learning_rate": 5e-06,
+      "loss": 0.4754,
+      "step": 900
+    },
+    {
+      "epoch": 2.204724409448819,
+      "grad_norm": 0.6825783856408546,
+      "learning_rate": 5e-06,
+      "loss": 0.4758,
+      "step": 910
+    },
+    {
+      "epoch": 2.2289521502119927,
+      "grad_norm": 0.7441341191332547,
+      "learning_rate": 5e-06,
+      "loss": 0.4745,
+      "step": 920
+    },
+    {
+      "epoch": 2.253179890975167,
+      "grad_norm": 0.5442102121881107,
+      "learning_rate": 5e-06,
+      "loss": 0.4784,
+      "step": 930
+    },
+    {
+      "epoch": 2.2774076317383405,
+      "grad_norm": 0.6283995286149499,
+      "learning_rate": 5e-06,
+      "loss": 0.4804,
+      "step": 940
+    },
+    {
+      "epoch": 2.301635372501514,
+      "grad_norm": 0.5816653838824051,
+      "learning_rate": 5e-06,
+      "loss": 0.4778,
+      "step": 950
+    },
+    {
+      "epoch": 2.3258631132646883,
+      "grad_norm": 0.5549081081953725,
+      "learning_rate": 5e-06,
+      "loss": 0.4769,
+      "step": 960
+    },
+    {
+      "epoch": 2.350090854027862,
+      "grad_norm": 0.5279973698132591,
+      "learning_rate": 5e-06,
+      "loss": 0.4784,
+      "step": 970
+    },
+    {
+      "epoch": 2.3743185947910357,
+      "grad_norm": 0.6086532168897438,
+      "learning_rate": 5e-06,
+      "loss": 0.4859,
+      "step": 980
+    },
+    {
+      "epoch": 2.39854633555421,
+      "grad_norm": 0.5428630520508304,
+      "learning_rate": 5e-06,
+      "loss": 0.4797,
+      "step": 990
+    },
+    {
+      "epoch": 2.4227740763173835,
+      "grad_norm": 0.5159093369572579,
+      "learning_rate": 5e-06,
+      "loss": 0.4767,
+      "step": 1000
+    },
+    {
+      "epoch": 2.447001817080557,
+      "grad_norm": 0.6376648956193381,
+      "learning_rate": 5e-06,
+      "loss": 0.4797,
+      "step": 1010
+    },
+    {
+      "epoch": 2.471229557843731,
+      "grad_norm": 0.5547640740107239,
+      "learning_rate": 5e-06,
+      "loss": 0.4747,
+      "step": 1020
+    },
+    {
+      "epoch": 2.495457298606905,
+      "grad_norm": 0.4712927664253549,
+      "learning_rate": 5e-06,
+      "loss": 0.483,
+      "step": 1030
+    },
+    {
+      "epoch": 2.5196850393700787,
+      "grad_norm": 0.5246957682925482,
+      "learning_rate": 5e-06,
+      "loss": 0.4831,
+      "step": 1040
+    },
+    {
+      "epoch": 2.543912780133253,
+      "grad_norm": 0.49350610935172784,
+      "learning_rate": 5e-06,
+      "loss": 0.4761,
+      "step": 1050
+    },
+    {
+      "epoch": 2.5681405208964265,
+      "grad_norm": 0.5385640165749516,
+      "learning_rate": 5e-06,
+      "loss": 0.4781,
+      "step": 1060
+    },
+    {
+      "epoch": 2.5923682616596,
+      "grad_norm": 0.5103595421050182,
+      "learning_rate": 5e-06,
+      "loss": 0.4745,
+      "step": 1070
+    },
+    {
+      "epoch": 2.6165960024227743,
+      "grad_norm": 0.4917765675984819,
+      "learning_rate": 5e-06,
+      "loss": 0.4744,
+      "step": 1080
+    },
+    {
+      "epoch": 2.640823743185948,
+      "grad_norm": 0.5057838998647487,
+      "learning_rate": 5e-06,
+      "loss": 0.4812,
+      "step": 1090
+    },
+    {
+      "epoch": 2.6650514839491217,
+      "grad_norm": 0.5132155115777407,
+      "learning_rate": 5e-06,
+      "loss": 0.4714,
+      "step": 1100
+    },
+    {
+      "epoch": 2.6892792247122954,
+      "grad_norm": 0.5268065417493973,
+      "learning_rate": 5e-06,
+      "loss": 0.4761,
+      "step": 1110
+    },
+    {
+      "epoch": 2.7135069654754695,
+      "grad_norm": 0.552687045729179,
+      "learning_rate": 5e-06,
+      "loss": 0.4752,
+      "step": 1120
+    },
+    {
+      "epoch": 2.7377347062386432,
+      "grad_norm": 0.5709091633003863,
+      "learning_rate": 5e-06,
+      "loss": 0.4761,
+      "step": 1130
+    },
+    {
+      "epoch": 2.761962447001817,
+      "grad_norm": 0.4924939447065247,
+      "learning_rate": 5e-06,
+      "loss": 0.4817,
+      "step": 1140
+    },
+    {
+      "epoch": 2.786190187764991,
+      "grad_norm": 0.613145736032828,
+      "learning_rate": 5e-06,
+      "loss": 0.4758,
+      "step": 1150
+    },
+    {
+      "epoch": 2.8104179285281647,
+      "grad_norm": 0.5517385387267716,
+      "learning_rate": 5e-06,
+      "loss": 0.4732,
+      "step": 1160
+    },
+    {
+      "epoch": 2.8346456692913384,
+      "grad_norm": 0.5952199661627586,
+      "learning_rate": 5e-06,
+      "loss": 0.4832,
+      "step": 1170
+    },
+    {
+      "epoch": 2.8588734100545126,
+      "grad_norm": 0.6646202856678104,
+      "learning_rate": 5e-06,
+      "loss": 0.4767,
+      "step": 1180
+    },
+    {
+      "epoch": 2.8831011508176863,
+      "grad_norm": 0.7488420183599482,
+      "learning_rate": 5e-06,
+      "loss": 0.4773,
+      "step": 1190
+    },
+    {
+      "epoch": 2.90732889158086,
+      "grad_norm": 0.634712672540643,
+      "learning_rate": 5e-06,
+      "loss": 0.4803,
+      "step": 1200
+    },
+    {
+      "epoch": 2.931556632344034,
+      "grad_norm": 0.479455288955399,
+      "learning_rate": 5e-06,
+      "loss": 0.476,
+      "step": 1210
+    },
+    {
+      "epoch": 2.9557843731072078,
+      "grad_norm": 0.4788695949163765,
+      "learning_rate": 5e-06,
+      "loss": 0.4792,
+      "step": 1220
+    },
+    {
+      "epoch": 2.9800121138703815,
+      "grad_norm": 0.5724033736756394,
+      "learning_rate": 5e-06,
+      "loss": 0.4815,
+      "step": 1230
+    },
+    {
+      "epoch": 2.994548758328286,
+      "eval_loss": 0.530586838722229,
+      "eval_runtime": 221.8544,
+      "eval_samples_per_second": 50.127,
+      "eval_steps_per_second": 0.392,
+      "step": 1236
+    },
+    {
+      "epoch": 2.994548758328286,
+      "step": 1236,
+      "total_flos": 2069927276052480.0,
+      "train_loss": 0.5379253575716976,
+      "train_runtime": 37041.7776,
+      "train_samples_per_second": 17.113,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1236,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2069927276052480.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed