End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +906 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: oh_v1_w_v3_camel_chemistry_gpt-4o-mini
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # oh_v1_w_v3_camel_chemistry_gpt-4o-mini
-This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5540

 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh_v1_w_v3_camel_chemistry_gpt-4o-mini
 # oh_v1_w_v3_camel_chemistry_gpt-4o-mini
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/oh_v1_w_v3_camel_chemistry_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5540

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.5540264248847961,
+    "eval_runtime": 215.0054,
+    "eval_samples_per_second": 50.124,
+    "eval_steps_per_second": 0.395,
+    "total_flos": 2009625935216640.0,
+    "train_loss": 0.5581841540336608,
+    "train_runtime": 35639.5638,
+    "train_samples_per_second": 17.235,
+    "train_steps_per_second": 0.034
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.5540264248847961,
+    "eval_runtime": 215.0054,
+    "eval_samples_per_second": 50.124,
+    "eval_steps_per_second": 0.395
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2009625935216640.0,
+    "train_loss": 0.5581841540336608,
+    "train_runtime": 35639.5638,
+    "train_samples_per_second": 17.235,
+    "train_steps_per_second": 0.034
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,906 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.025,
+      "grad_norm": 6.2283793562421,
+      "learning_rate": 5e-06,
+      "loss": 0.9226,
+      "step": 10
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.073730252333159,
+      "learning_rate": 5e-06,
+      "loss": 0.7903,
+      "step": 20
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 1.0084608728414968,
+      "learning_rate": 5e-06,
+      "loss": 0.7361,
+      "step": 30
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.9477824152833885,
+      "learning_rate": 5e-06,
+      "loss": 0.7108,
+      "step": 40
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.8625762666467,
+      "learning_rate": 5e-06,
+      "loss": 0.686,
+      "step": 50
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.761040541226219,
+      "learning_rate": 5e-06,
+      "loss": 0.6788,
+      "step": 60
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.7440724149702675,
+      "learning_rate": 5e-06,
+      "loss": 0.665,
+      "step": 70
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6683283661638493,
+      "learning_rate": 5e-06,
+      "loss": 0.6647,
+      "step": 80
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.6752395142547716,
+      "learning_rate": 5e-06,
+      "loss": 0.6486,
+      "step": 90
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.6932666577646985,
+      "learning_rate": 5e-06,
+      "loss": 0.6394,
+      "step": 100
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.5563073495123759,
+      "learning_rate": 5e-06,
+      "loss": 0.6382,
+      "step": 110
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5732770966128271,
+      "learning_rate": 5e-06,
+      "loss": 0.6351,
+      "step": 120
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.6075711947587346,
+      "learning_rate": 5e-06,
+      "loss": 0.6318,
+      "step": 130
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.755814081800888,
+      "learning_rate": 5e-06,
+      "loss": 0.6267,
+      "step": 140
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.67881977859985,
+      "learning_rate": 5e-06,
+      "loss": 0.6292,
+      "step": 150
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.9073276911461459,
+      "learning_rate": 5e-06,
+      "loss": 0.6223,
+      "step": 160
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.6556934680429933,
+      "learning_rate": 5e-06,
+      "loss": 0.6181,
+      "step": 170
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5268985559340588,
+      "learning_rate": 5e-06,
+      "loss": 0.6189,
+      "step": 180
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.500078943711909,
+      "learning_rate": 5e-06,
+      "loss": 0.612,
+      "step": 190
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.5364000910049038,
+      "learning_rate": 5e-06,
+      "loss": 0.6113,
+      "step": 200
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.5290306126723202,
+      "learning_rate": 5e-06,
+      "loss": 0.6153,
+      "step": 210
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.4770992636352269,
+      "learning_rate": 5e-06,
+      "loss": 0.6104,
+      "step": 220
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.6289416522292515,
+      "learning_rate": 5e-06,
+      "loss": 0.6103,
+      "step": 230
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5398977888752696,
+      "learning_rate": 5e-06,
+      "loss": 0.6076,
+      "step": 240
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.5310488552193566,
+      "learning_rate": 5e-06,
+      "loss": 0.6096,
+      "step": 250
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.6110323317115457,
+      "learning_rate": 5e-06,
+      "loss": 0.605,
+      "step": 260
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.6068134600036437,
+      "learning_rate": 5e-06,
+      "loss": 0.6071,
+      "step": 270
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.5634137177645002,
+      "learning_rate": 5e-06,
+      "loss": 0.6,
+      "step": 280
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.7693981650465631,
+      "learning_rate": 5e-06,
+      "loss": 0.6053,
+      "step": 290
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.594391360805154,
+      "learning_rate": 5e-06,
+      "loss": 0.6032,
+      "step": 300
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.6029344366979934,
+      "learning_rate": 5e-06,
+      "loss": 0.6006,
+      "step": 310
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5077693887980811,
+      "learning_rate": 5e-06,
+      "loss": 0.5978,
+      "step": 320
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 0.5013009473527608,
+      "learning_rate": 5e-06,
+      "loss": 0.5939,
+      "step": 330
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.6898923986358316,
+      "learning_rate": 5e-06,
+      "loss": 0.5941,
+      "step": 340
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 0.6455192038734223,
+      "learning_rate": 5e-06,
+      "loss": 0.5948,
+      "step": 350
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.5846403091528135,
+      "learning_rate": 5e-06,
+      "loss": 0.5925,
+      "step": 360
+    },
+    {
+      "epoch": 0.925,
+      "grad_norm": 0.603873049442878,
+      "learning_rate": 5e-06,
+      "loss": 0.5885,
+      "step": 370
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.5089869229340043,
+      "learning_rate": 5e-06,
+      "loss": 0.587,
+      "step": 380
+    },
+    {
+      "epoch": 0.975,
+      "grad_norm": 0.46202950175215546,
+      "learning_rate": 5e-06,
+      "loss": 0.5866,
+      "step": 390
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5546489111159046,
+      "learning_rate": 5e-06,
+      "loss": 0.5865,
+      "step": 400
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.5859289765357971,
+      "eval_runtime": 214.5478,
+      "eval_samples_per_second": 50.231,
+      "eval_steps_per_second": 0.396,
+      "step": 400
+    },
+    {
+      "epoch": 1.025,
+      "grad_norm": 0.6920192241789218,
+      "learning_rate": 5e-06,
+      "loss": 0.5496,
+      "step": 410
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.6541721391004665,
+      "learning_rate": 5e-06,
+      "loss": 0.5509,
+      "step": 420
+    },
+    {
+      "epoch": 1.075,
+      "grad_norm": 0.614260313358043,
+      "learning_rate": 5e-06,
+      "loss": 0.558,
+      "step": 430
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.5447908907434397,
+      "learning_rate": 5e-06,
+      "loss": 0.5524,
+      "step": 440
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 0.5958987826147251,
+      "learning_rate": 5e-06,
+      "loss": 0.5501,
+      "step": 450
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.5389526586794362,
+      "learning_rate": 5e-06,
+      "loss": 0.5497,
+      "step": 460
+    },
+    {
+      "epoch": 1.175,
+      "grad_norm": 0.6252863504479338,
+      "learning_rate": 5e-06,
+      "loss": 0.548,
+      "step": 470
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.47836275145239177,
+      "learning_rate": 5e-06,
+      "loss": 0.5516,
+      "step": 480
+    },
+    {
+      "epoch": 1.225,
+      "grad_norm": 0.4969967566952363,
+      "learning_rate": 5e-06,
+      "loss": 0.5456,
+      "step": 490
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.5401134954649838,
+      "learning_rate": 5e-06,
+      "loss": 0.5559,
+      "step": 500
+    },
+    {
+      "epoch": 1.275,
+      "grad_norm": 0.5052946204256015,
+      "learning_rate": 5e-06,
+      "loss": 0.5463,
+      "step": 510
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.5464783361936272,
+      "learning_rate": 5e-06,
+      "loss": 0.5508,
+      "step": 520
+    },
+    {
+      "epoch": 1.325,
+      "grad_norm": 0.7005683495135656,
+      "learning_rate": 5e-06,
+      "loss": 0.551,
+      "step": 530
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.8637480119643226,
+      "learning_rate": 5e-06,
+      "loss": 0.5447,
+      "step": 540
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 0.6198566675351206,
+      "learning_rate": 5e-06,
+      "loss": 0.5414,
+      "step": 550
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.5676940292369252,
+      "learning_rate": 5e-06,
+      "loss": 0.5461,
+      "step": 560
+    },
+    {
+      "epoch": 1.425,
+      "grad_norm": 0.46615761940342565,
+      "learning_rate": 5e-06,
+      "loss": 0.5391,
+      "step": 570
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.5299833857615388,
+      "learning_rate": 5e-06,
+      "loss": 0.544,
+      "step": 580
+    },
+    {
+      "epoch": 1.475,
+      "grad_norm": 0.5350523878143963,
+      "learning_rate": 5e-06,
+      "loss": 0.5458,
+      "step": 590
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.5367249564898715,
+      "learning_rate": 5e-06,
+      "loss": 0.5462,
+      "step": 600
+    },
+    {
+      "epoch": 1.525,
+      "grad_norm": 0.7827798795419177,
+      "learning_rate": 5e-06,
+      "loss": 0.5463,
+      "step": 610
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.6823097374119125,
+      "learning_rate": 5e-06,
+      "loss": 0.5501,
+      "step": 620
+    },
+    {
+      "epoch": 1.575,
+      "grad_norm": 0.6572832849543329,
+      "learning_rate": 5e-06,
+      "loss": 0.5437,
+      "step": 630
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.5914218329646164,
+      "learning_rate": 5e-06,
+      "loss": 0.5345,
+      "step": 640
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": 0.5060878083849677,
+      "learning_rate": 5e-06,
+      "loss": 0.5336,
+      "step": 650
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.5427787163346001,
+      "learning_rate": 5e-06,
+      "loss": 0.5361,
+      "step": 660
+    },
+    {
+      "epoch": 1.675,
+      "grad_norm": 0.6015169228068791,
+      "learning_rate": 5e-06,
+      "loss": 0.5395,
+      "step": 670
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.522909609070953,
+      "learning_rate": 5e-06,
+      "loss": 0.5407,
+      "step": 680
+    },
+    {
+      "epoch": 1.725,
+      "grad_norm": 0.5648959450008263,
+      "learning_rate": 5e-06,
+      "loss": 0.5384,
+      "step": 690
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.5073930020348113,
+      "learning_rate": 5e-06,
+      "loss": 0.5385,
+      "step": 700
+    },
+    {
+      "epoch": 1.775,
+      "grad_norm": 0.5344270884192877,
+      "learning_rate": 5e-06,
+      "loss": 0.5389,
+      "step": 710
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.5387282660221612,
+      "learning_rate": 5e-06,
+      "loss": 0.5384,
+      "step": 720
+    },
+    {
+      "epoch": 1.825,
+      "grad_norm": 0.630517812852184,
+      "learning_rate": 5e-06,
+      "loss": 0.5378,
+      "step": 730
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.528770579788001,
+      "learning_rate": 5e-06,
+      "loss": 0.5373,
+      "step": 740
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": 0.46867857755871645,
+      "learning_rate": 5e-06,
+      "loss": 0.5302,
+      "step": 750
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.5407357139497844,
+      "learning_rate": 5e-06,
+      "loss": 0.5327,
+      "step": 760
+    },
+    {
+      "epoch": 1.925,
+      "grad_norm": 0.5955017346639638,
+      "learning_rate": 5e-06,
+      "loss": 0.5357,
+      "step": 770
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.4917679827974974,
+      "learning_rate": 5e-06,
+      "loss": 0.5313,
+      "step": 780
+    },
+    {
+      "epoch": 1.975,
+      "grad_norm": 0.5426327036736968,
+      "learning_rate": 5e-06,
+      "loss": 0.5362,
+      "step": 790
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.5570351037553893,
+      "learning_rate": 5e-06,
+      "loss": 0.5346,
+      "step": 800
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.5597097873687744,
+      "eval_runtime": 215.4672,
+      "eval_samples_per_second": 50.017,
+      "eval_steps_per_second": 0.394,
+      "step": 800
+    },
+    {
+      "epoch": 2.025,
+      "grad_norm": 0.6612509870643555,
+      "learning_rate": 5e-06,
+      "loss": 0.4939,
+      "step": 810
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 0.5438913454843807,
+      "learning_rate": 5e-06,
+      "loss": 0.4901,
+      "step": 820
+    },
+    {
+      "epoch": 2.075,
+      "grad_norm": 0.7056566708123541,
+      "learning_rate": 5e-06,
+      "loss": 0.4961,
+      "step": 830
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.5098170694120924,
+      "learning_rate": 5e-06,
+      "loss": 0.4971,
+      "step": 840
+    },
+    {
+      "epoch": 2.125,
+      "grad_norm": 0.6070617882857331,
+      "learning_rate": 5e-06,
+      "loss": 0.4981,
+      "step": 850
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 0.5998341137122876,
+      "learning_rate": 5e-06,
+      "loss": 0.4977,
+      "step": 860
+    },
+    {
+      "epoch": 2.175,
+      "grad_norm": 0.58734738292625,
+      "learning_rate": 5e-06,
+      "loss": 0.5015,
+      "step": 870
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.6197078930251222,
+      "learning_rate": 5e-06,
+      "loss": 0.4979,
+      "step": 880
+    },
+    {
+      "epoch": 2.225,
+      "grad_norm": 0.6086748900409549,
+      "learning_rate": 5e-06,
+      "loss": 0.4969,
+      "step": 890
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.546234904601564,
+      "learning_rate": 5e-06,
+      "loss": 0.4975,
+      "step": 900
+    },
+    {
+      "epoch": 2.275,
+      "grad_norm": 0.5160078517376208,
+      "learning_rate": 5e-06,
+      "loss": 0.498,
+      "step": 910
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.5415276924291007,
+      "learning_rate": 5e-06,
+      "loss": 0.4943,
+      "step": 920
+    },
+    {
+      "epoch": 2.325,
+      "grad_norm": 0.5686966271920224,
+      "learning_rate": 5e-06,
+      "loss": 0.4934,
+      "step": 930
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 0.5936539945494198,
+      "learning_rate": 5e-06,
+      "loss": 0.4992,
+      "step": 940
+    },
+    {
+      "epoch": 2.375,
+      "grad_norm": 0.5548802299834517,
+      "learning_rate": 5e-06,
+      "loss": 0.5051,
+      "step": 950
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.5103322725061038,
+      "learning_rate": 5e-06,
+      "loss": 0.4955,
+      "step": 960
+    },
+    {
+      "epoch": 2.425,
+      "grad_norm": 0.5211482899619925,
+      "learning_rate": 5e-06,
+      "loss": 0.4988,
+      "step": 970
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 0.4896499548762498,
+      "learning_rate": 5e-06,
+      "loss": 0.5012,
+      "step": 980
+    },
+    {
+      "epoch": 2.475,
+      "grad_norm": 0.6075465454296445,
+      "learning_rate": 5e-06,
+      "loss": 0.4921,
+      "step": 990
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.588232935912865,
+      "learning_rate": 5e-06,
+      "loss": 0.4967,
+      "step": 1000
+    },
+    {
+      "epoch": 2.525,
+      "grad_norm": 0.533857697833421,
+      "learning_rate": 5e-06,
+      "loss": 0.4981,
+      "step": 1010
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 0.5102421831778537,
+      "learning_rate": 5e-06,
+      "loss": 0.4963,
+      "step": 1020
+    },
+    {
+      "epoch": 2.575,
+      "grad_norm": 0.571515094485817,
+      "learning_rate": 5e-06,
+      "loss": 0.4994,
+      "step": 1030
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.5274028357185288,
+      "learning_rate": 5e-06,
+      "loss": 0.4998,
+      "step": 1040
+    },
+    {
+      "epoch": 2.625,
+      "grad_norm": 0.5263461614707381,
+      "learning_rate": 5e-06,
+      "loss": 0.4935,
+      "step": 1050
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 0.5484636738493971,
+      "learning_rate": 5e-06,
+      "loss": 0.495,
+      "step": 1060
+    },
+    {
+      "epoch": 2.675,
+      "grad_norm": 0.48284125009839746,
+      "learning_rate": 5e-06,
+      "loss": 0.5026,
+      "step": 1070
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.5049035715654736,
+      "learning_rate": 5e-06,
+      "loss": 0.4953,
+      "step": 1080
+    },
+    {
+      "epoch": 2.725,
+      "grad_norm": 0.5451746081470605,
+      "learning_rate": 5e-06,
+      "loss": 0.4917,
+      "step": 1090
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.4946736397645321,
+      "learning_rate": 5e-06,
+      "loss": 0.4982,
+      "step": 1100
+    },
+    {
+      "epoch": 2.775,
+      "grad_norm": 0.5804259517812362,
+      "learning_rate": 5e-06,
+      "loss": 0.4939,
+      "step": 1110
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.5489030189752196,
+      "learning_rate": 5e-06,
+      "loss": 0.4957,
+      "step": 1120
+    },
+    {
+      "epoch": 2.825,
+      "grad_norm": 0.5457133379941178,
+      "learning_rate": 5e-06,
+      "loss": 0.4948,
+      "step": 1130
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 0.5151610258671091,
+      "learning_rate": 5e-06,
+      "loss": 0.4965,
+      "step": 1140
+    },
+    {
+      "epoch": 2.875,
+      "grad_norm": 0.5480931688710529,
+      "learning_rate": 5e-06,
+      "loss": 0.5025,
+      "step": 1150
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.5250233587635805,
+      "learning_rate": 5e-06,
+      "loss": 0.5,
+      "step": 1160
+    },
+    {
+      "epoch": 2.925,
+      "grad_norm": 0.5611546648048623,
+      "learning_rate": 5e-06,
+      "loss": 0.4959,
+      "step": 1170
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 0.5168606076772253,
+      "learning_rate": 5e-06,
+      "loss": 0.4976,
+      "step": 1180
+    },
+    {
+      "epoch": 2.975,
+      "grad_norm": 0.5089740614604118,
+      "learning_rate": 5e-06,
+      "loss": 0.4977,
+      "step": 1190
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.501278157123975,
+      "learning_rate": 5e-06,
+      "loss": 0.4927,
+      "step": 1200
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.5540264248847961,
+      "eval_runtime": 214.8763,
+      "eval_samples_per_second": 50.154,
+      "eval_steps_per_second": 0.396,
+      "step": 1200
+    },
+    {
+      "epoch": 3.0,
+      "step": 1200,
+      "total_flos": 2009625935216640.0,
+      "train_loss": 0.5581841540336608,
+      "train_runtime": 35639.5638,
+      "train_samples_per_second": 17.235,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2009625935216640.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed