End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +934 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3
 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_camel_ai_biology
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # OH_DCFT_V3_wo_camel_ai_biology
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6422

 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_camel_ai_biology
 # OH_DCFT_V3_wo_camel_ai_biology
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_camel_ai_biology dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6422

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9945750452079567,
+    "eval_loss": 0.6422178745269775,
+    "eval_runtime": 223.8998,
+    "eval_samples_per_second": 49.915,
+    "eval_steps_per_second": 0.393,
+    "total_flos": 2079977499525120.0,
+    "train_loss": 0.6134321775029439,
+    "train_runtime": 37323.5248,
+    "train_samples_per_second": 17.067,
+    "train_steps_per_second": 0.033
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9945750452079567,
+    "eval_loss": 0.6422178745269775,
+    "eval_runtime": 223.8998,
+    "eval_samples_per_second": 49.915,
+    "eval_steps_per_second": 0.393
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9945750452079567,
+    "total_flos": 2079977499525120.0,
+    "train_loss": 0.6134321775029439,
+    "train_runtime": 37323.5248,
+    "train_samples_per_second": 17.067,
+    "train_steps_per_second": 0.033
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,934 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9945750452079567,
+  "eval_steps": 500,
+  "global_step": 1242,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.024110910186859555,
+      "grad_norm": 2.685936547429143,
+      "learning_rate": 5e-06,
+      "loss": 0.9133,
+      "step": 10
+    },
+    {
+      "epoch": 0.04822182037371911,
+      "grad_norm": 2.8983680167236696,
+      "learning_rate": 5e-06,
+      "loss": 0.7794,
+      "step": 20
+    },
+    {
+      "epoch": 0.07233273056057866,
+      "grad_norm": 1.4715031186017837,
+      "learning_rate": 5e-06,
+      "loss": 0.759,
+      "step": 30
+    },
+    {
+      "epoch": 0.09644364074743822,
+      "grad_norm": 0.8377570160614484,
+      "learning_rate": 5e-06,
+      "loss": 0.7317,
+      "step": 40
+    },
+    {
+      "epoch": 0.12055455093429777,
+      "grad_norm": 1.0172390996140888,
+      "learning_rate": 5e-06,
+      "loss": 0.728,
+      "step": 50
+    },
+    {
+      "epoch": 0.14466546112115733,
+      "grad_norm": 2.076736903100202,
+      "learning_rate": 5e-06,
+      "loss": 0.7065,
+      "step": 60
+    },
+    {
+      "epoch": 0.16877637130801687,
+      "grad_norm": 0.9324814149418421,
+      "learning_rate": 5e-06,
+      "loss": 0.7025,
+      "step": 70
+    },
+    {
+      "epoch": 0.19288728149487644,
+      "grad_norm": 0.8961533346444337,
+      "learning_rate": 5e-06,
+      "loss": 0.6948,
+      "step": 80
+    },
+    {
+      "epoch": 0.21699819168173598,
+      "grad_norm": 0.9833679322557037,
+      "learning_rate": 5e-06,
+      "loss": 0.6997,
+      "step": 90
+    },
+    {
+      "epoch": 0.24110910186859555,
+      "grad_norm": 0.6320425718923794,
+      "learning_rate": 5e-06,
+      "loss": 0.6781,
+      "step": 100
+    },
+    {
+      "epoch": 0.2652200120554551,
+      "grad_norm": 0.8047932595929222,
+      "learning_rate": 5e-06,
+      "loss": 0.6857,
+      "step": 110
+    },
+    {
+      "epoch": 0.28933092224231466,
+      "grad_norm": 0.6873832051216665,
+      "learning_rate": 5e-06,
+      "loss": 0.6762,
+      "step": 120
+    },
+    {
+      "epoch": 0.3134418324291742,
+      "grad_norm": 0.8515251273118922,
+      "learning_rate": 5e-06,
+      "loss": 0.6741,
+      "step": 130
+    },
+    {
+      "epoch": 0.33755274261603374,
+      "grad_norm": 0.6815399625732373,
+      "learning_rate": 5e-06,
+      "loss": 0.6786,
+      "step": 140
+    },
+    {
+      "epoch": 0.3616636528028933,
+      "grad_norm": 0.6231823130658575,
+      "learning_rate": 5e-06,
+      "loss": 0.6763,
+      "step": 150
+    },
+    {
+      "epoch": 0.3857745629897529,
+      "grad_norm": 0.552827319574485,
+      "learning_rate": 5e-06,
+      "loss": 0.6711,
+      "step": 160
+    },
+    {
+      "epoch": 0.4098854731766124,
+      "grad_norm": 0.6826986498299203,
+      "learning_rate": 5e-06,
+      "loss": 0.6706,
+      "step": 170
+    },
+    {
+      "epoch": 0.43399638336347196,
+      "grad_norm": 0.5574310360062503,
+      "learning_rate": 5e-06,
+      "loss": 0.6659,
+      "step": 180
+    },
+    {
+      "epoch": 0.45810729355033153,
+      "grad_norm": 0.7613567669157012,
+      "learning_rate": 5e-06,
+      "loss": 0.6658,
+      "step": 190
+    },
+    {
+      "epoch": 0.4822182037371911,
+      "grad_norm": 0.5609659476480818,
+      "learning_rate": 5e-06,
+      "loss": 0.6598,
+      "step": 200
+    },
+    {
+      "epoch": 0.5063291139240507,
+      "grad_norm": 1.078834895881199,
+      "learning_rate": 5e-06,
+      "loss": 0.6687,
+      "step": 210
+    },
+    {
+      "epoch": 0.5304400241109102,
+      "grad_norm": 0.6016551358752319,
+      "learning_rate": 5e-06,
+      "loss": 0.6667,
+      "step": 220
+    },
+    {
+      "epoch": 0.5545509342977697,
+      "grad_norm": 0.5329067498961892,
+      "learning_rate": 5e-06,
+      "loss": 0.6568,
+      "step": 230
+    },
+    {
+      "epoch": 0.5786618444846293,
+      "grad_norm": 0.5844269800148942,
+      "learning_rate": 5e-06,
+      "loss": 0.6656,
+      "step": 240
+    },
+    {
+      "epoch": 0.6027727546714888,
+      "grad_norm": 0.713015217035973,
+      "learning_rate": 5e-06,
+      "loss": 0.6584,
+      "step": 250
+    },
+    {
+      "epoch": 0.6268836648583485,
+      "grad_norm": 0.7063878216983879,
+      "learning_rate": 5e-06,
+      "loss": 0.665,
+      "step": 260
+    },
+    {
+      "epoch": 0.650994575045208,
+      "grad_norm": 0.50774960805631,
+      "learning_rate": 5e-06,
+      "loss": 0.6615,
+      "step": 270
+    },
+    {
+      "epoch": 0.6751054852320675,
+      "grad_norm": 0.6111313528033431,
+      "learning_rate": 5e-06,
+      "loss": 0.6551,
+      "step": 280
+    },
+    {
+      "epoch": 0.6992163954189271,
+      "grad_norm": 0.6458858962308502,
+      "learning_rate": 5e-06,
+      "loss": 0.6535,
+      "step": 290
+    },
+    {
+      "epoch": 0.7233273056057866,
+      "grad_norm": 0.6797329430329018,
+      "learning_rate": 5e-06,
+      "loss": 0.6616,
+      "step": 300
+    },
+    {
+      "epoch": 0.7474382157926461,
+      "grad_norm": 1.0271382997748104,
+      "learning_rate": 5e-06,
+      "loss": 0.6593,
+      "step": 310
+    },
+    {
+      "epoch": 0.7715491259795058,
+      "grad_norm": 0.5821025343959978,
+      "learning_rate": 5e-06,
+      "loss": 0.6556,
+      "step": 320
+    },
+    {
+      "epoch": 0.7956600361663653,
+      "grad_norm": 0.575144218324774,
+      "learning_rate": 5e-06,
+      "loss": 0.6522,
+      "step": 330
+    },
+    {
+      "epoch": 0.8197709463532248,
+      "grad_norm": 0.4992177743591918,
+      "learning_rate": 5e-06,
+      "loss": 0.6472,
+      "step": 340
+    },
+    {
+      "epoch": 0.8438818565400844,
+      "grad_norm": 0.5518799725500897,
+      "learning_rate": 5e-06,
+      "loss": 0.6486,
+      "step": 350
+    },
+    {
+      "epoch": 0.8679927667269439,
+      "grad_norm": 0.6827706978670125,
+      "learning_rate": 5e-06,
+      "loss": 0.6527,
+      "step": 360
+    },
+    {
+      "epoch": 0.8921036769138035,
+      "grad_norm": 0.5370276906753118,
+      "learning_rate": 5e-06,
+      "loss": 0.6564,
+      "step": 370
+    },
+    {
+      "epoch": 0.9162145871006631,
+      "grad_norm": 0.5011748190469159,
+      "learning_rate": 5e-06,
+      "loss": 0.648,
+      "step": 380
+    },
+    {
+      "epoch": 0.9403254972875226,
+      "grad_norm": 0.7289445343800255,
+      "learning_rate": 5e-06,
+      "loss": 0.645,
+      "step": 390
+    },
+    {
+      "epoch": 0.9644364074743822,
+      "grad_norm": 0.5223137931656774,
+      "learning_rate": 5e-06,
+      "loss": 0.6481,
+      "step": 400
+    },
+    {
+      "epoch": 0.9885473176612417,
+      "grad_norm": 0.5702001612329072,
+      "learning_rate": 5e-06,
+      "loss": 0.6417,
+      "step": 410
+    },
+    {
+      "epoch": 0.9981916817359855,
+      "eval_loss": 0.6491908431053162,
+      "eval_runtime": 223.3579,
+      "eval_samples_per_second": 50.036,
+      "eval_steps_per_second": 0.394,
+      "step": 414
+    },
+    {
+      "epoch": 1.0126582278481013,
+      "grad_norm": 0.6141037997267318,
+      "learning_rate": 5e-06,
+      "loss": 0.6207,
+      "step": 420
+    },
+    {
+      "epoch": 1.0367691380349608,
+      "grad_norm": 0.5738222179228437,
+      "learning_rate": 5e-06,
+      "loss": 0.6073,
+      "step": 430
+    },
+    {
+      "epoch": 1.0608800482218204,
+      "grad_norm": 0.6152321417799828,
+      "learning_rate": 5e-06,
+      "loss": 0.5963,
+      "step": 440
+    },
+    {
+      "epoch": 1.0849909584086799,
+      "grad_norm": 0.6230797448075694,
+      "learning_rate": 5e-06,
+      "loss": 0.6035,
+      "step": 450
+    },
+    {
+      "epoch": 1.1091018685955394,
+      "grad_norm": 0.5547485435536735,
+      "learning_rate": 5e-06,
+      "loss": 0.6043,
+      "step": 460
+    },
+    {
+      "epoch": 1.1332127787823991,
+      "grad_norm": 0.6897788261093171,
+      "learning_rate": 5e-06,
+      "loss": 0.6059,
+      "step": 470
+    },
+    {
+      "epoch": 1.1573236889692586,
+      "grad_norm": 0.5319379437293987,
+      "learning_rate": 5e-06,
+      "loss": 0.5991,
+      "step": 480
+    },
+    {
+      "epoch": 1.1814345991561181,
+      "grad_norm": 0.5927433636509655,
+      "learning_rate": 5e-06,
+      "loss": 0.6033,
+      "step": 490
+    },
+    {
+      "epoch": 1.2055455093429777,
+      "grad_norm": 0.6178241976987927,
+      "learning_rate": 5e-06,
+      "loss": 0.6,
+      "step": 500
+    },
+    {
+      "epoch": 1.2296564195298372,
+      "grad_norm": 0.5009847922110348,
+      "learning_rate": 5e-06,
+      "loss": 0.5987,
+      "step": 510
+    },
+    {
+      "epoch": 1.253767329716697,
+      "grad_norm": 0.6865827636690425,
+      "learning_rate": 5e-06,
+      "loss": 0.6039,
+      "step": 520
+    },
+    {
+      "epoch": 1.2778782399035564,
+      "grad_norm": 0.6419339118636196,
+      "learning_rate": 5e-06,
+      "loss": 0.5977,
+      "step": 530
+    },
+    {
+      "epoch": 1.301989150090416,
+      "grad_norm": 0.5403820568820131,
+      "learning_rate": 5e-06,
+      "loss": 0.6053,
+      "step": 540
+    },
+    {
+      "epoch": 1.3261000602772754,
+      "grad_norm": 0.496944344094317,
+      "learning_rate": 5e-06,
+      "loss": 0.6043,
+      "step": 550
+    },
+    {
+      "epoch": 1.350210970464135,
+      "grad_norm": 0.6835364259470225,
+      "learning_rate": 5e-06,
+      "loss": 0.6015,
+      "step": 560
+    },
+    {
+      "epoch": 1.3743218806509945,
+      "grad_norm": 0.5433357613957998,
+      "learning_rate": 5e-06,
+      "loss": 0.5979,
+      "step": 570
+    },
+    {
+      "epoch": 1.3984327908378542,
+      "grad_norm": 0.48293592352088544,
+      "learning_rate": 5e-06,
+      "loss": 0.6039,
+      "step": 580
+    },
+    {
+      "epoch": 1.4225437010247137,
+      "grad_norm": 0.5167692584382013,
+      "learning_rate": 5e-06,
+      "loss": 0.6029,
+      "step": 590
+    },
+    {
+      "epoch": 1.4466546112115732,
+      "grad_norm": 0.5467014681458703,
+      "learning_rate": 5e-06,
+      "loss": 0.6056,
+      "step": 600
+    },
+    {
+      "epoch": 1.4707655213984328,
+      "grad_norm": 0.48669984975762765,
+      "learning_rate": 5e-06,
+      "loss": 0.6053,
+      "step": 610
+    },
+    {
+      "epoch": 1.4948764315852923,
+      "grad_norm": 0.5052139384494145,
+      "learning_rate": 5e-06,
+      "loss": 0.607,
+      "step": 620
+    },
+    {
+      "epoch": 1.518987341772152,
+      "grad_norm": 0.5189039466272587,
+      "learning_rate": 5e-06,
+      "loss": 0.6079,
+      "step": 630
+    },
+    {
+      "epoch": 1.5430982519590115,
+      "grad_norm": 0.5340411087467901,
+      "learning_rate": 5e-06,
+      "loss": 0.5966,
+      "step": 640
+    },
+    {
+      "epoch": 1.567209162145871,
+      "grad_norm": 0.6320951914134804,
+      "learning_rate": 5e-06,
+      "loss": 0.6119,
+      "step": 650
+    },
+    {
+      "epoch": 1.5913200723327305,
+      "grad_norm": 0.5402636477743581,
+      "learning_rate": 5e-06,
+      "loss": 0.6018,
+      "step": 660
+    },
+    {
+      "epoch": 1.61543098251959,
+      "grad_norm": 0.6023321834042192,
+      "learning_rate": 5e-06,
+      "loss": 0.6023,
+      "step": 670
+    },
+    {
+      "epoch": 1.6395418927064496,
+      "grad_norm": 0.49282224066247415,
+      "learning_rate": 5e-06,
+      "loss": 0.6002,
+      "step": 680
+    },
+    {
+      "epoch": 1.663652802893309,
+      "grad_norm": 0.6838051107799483,
+      "learning_rate": 5e-06,
+      "loss": 0.601,
+      "step": 690
+    },
+    {
+      "epoch": 1.6877637130801688,
+      "grad_norm": 0.4809683173497573,
+      "learning_rate": 5e-06,
+      "loss": 0.6012,
+      "step": 700
+    },
+    {
+      "epoch": 1.7118746232670283,
+      "grad_norm": 0.5130004764470846,
+      "learning_rate": 5e-06,
+      "loss": 0.6019,
+      "step": 710
+    },
+    {
+      "epoch": 1.7359855334538878,
+      "grad_norm": 0.5222089493788711,
+      "learning_rate": 5e-06,
+      "loss": 0.6029,
+      "step": 720
+    },
+    {
+      "epoch": 1.7600964436407476,
+      "grad_norm": 0.5537154673186192,
+      "learning_rate": 5e-06,
+      "loss": 0.6039,
+      "step": 730
+    },
+    {
+      "epoch": 1.784207353827607,
+      "grad_norm": 0.5081950888314039,
+      "learning_rate": 5e-06,
+      "loss": 0.5973,
+      "step": 740
+    },
+    {
+      "epoch": 1.8083182640144666,
+      "grad_norm": 0.5806567422134803,
+      "learning_rate": 5e-06,
+      "loss": 0.6072,
+      "step": 750
+    },
+    {
+      "epoch": 1.8324291742013261,
+      "grad_norm": 0.5192410257029635,
+      "learning_rate": 5e-06,
+      "loss": 0.6026,
+      "step": 760
+    },
+    {
+      "epoch": 1.8565400843881856,
+      "grad_norm": 0.5487344170749389,
+      "learning_rate": 5e-06,
+      "loss": 0.6009,
+      "step": 770
+    },
+    {
+      "epoch": 1.8806509945750451,
+      "grad_norm": 0.5324805374861366,
+      "learning_rate": 5e-06,
+      "loss": 0.5994,
+      "step": 780
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.6058321884008855,
+      "learning_rate": 5e-06,
+      "loss": 0.6025,
+      "step": 790
+    },
+    {
+      "epoch": 1.9288728149487642,
+      "grad_norm": 0.57365525151735,
+      "learning_rate": 5e-06,
+      "loss": 0.6026,
+      "step": 800
+    },
+    {
+      "epoch": 1.952983725135624,
+      "grad_norm": 0.5436955562661013,
+      "learning_rate": 5e-06,
+      "loss": 0.5953,
+      "step": 810
+    },
+    {
+      "epoch": 1.9770946353224834,
+      "grad_norm": 0.6042343773815075,
+      "learning_rate": 5e-06,
+      "loss": 0.6,
+      "step": 820
+    },
+    {
+      "epoch": 1.998794454490657,
+      "eval_loss": 0.6393378973007202,
+      "eval_runtime": 225.3367,
+      "eval_samples_per_second": 49.597,
+      "eval_steps_per_second": 0.391,
+      "step": 829
+    },
+    {
+      "epoch": 2.001205545509343,
+      "grad_norm": 0.9670296692720087,
+      "learning_rate": 5e-06,
+      "loss": 0.6016,
+      "step": 830
+    },
+    {
+      "epoch": 2.0253164556962027,
+      "grad_norm": 0.6837527713124405,
+      "learning_rate": 5e-06,
+      "loss": 0.5631,
+      "step": 840
+    },
+    {
+      "epoch": 2.049427365883062,
+      "grad_norm": 0.5935688974373606,
+      "learning_rate": 5e-06,
+      "loss": 0.5531,
+      "step": 850
+    },
+    {
+      "epoch": 2.0735382760699217,
+      "grad_norm": 0.5980530217682797,
+      "learning_rate": 5e-06,
+      "loss": 0.5554,
+      "step": 860
+    },
+    {
+      "epoch": 2.097649186256781,
+      "grad_norm": 0.5752374885434699,
+      "learning_rate": 5e-06,
+      "loss": 0.5557,
+      "step": 870
+    },
+    {
+      "epoch": 2.1217600964436407,
+      "grad_norm": 0.5042143345935887,
+      "learning_rate": 5e-06,
+      "loss": 0.5522,
+      "step": 880
+    },
+    {
+      "epoch": 2.1458710066305002,
+      "grad_norm": 0.5980920545311946,
+      "learning_rate": 5e-06,
+      "loss": 0.553,
+      "step": 890
+    },
+    {
+      "epoch": 2.1699819168173597,
+      "grad_norm": 0.5290062022586566,
+      "learning_rate": 5e-06,
+      "loss": 0.5541,
+      "step": 900
+    },
+    {
+      "epoch": 2.1940928270042193,
+      "grad_norm": 0.6029389321066391,
+      "learning_rate": 5e-06,
+      "loss": 0.5527,
+      "step": 910
+    },
+    {
+      "epoch": 2.2182037371910788,
+      "grad_norm": 0.5761620842575014,
+      "learning_rate": 5e-06,
+      "loss": 0.561,
+      "step": 920
+    },
+    {
+      "epoch": 2.2423146473779383,
+      "grad_norm": 0.5382086109948551,
+      "learning_rate": 5e-06,
+      "loss": 0.5528,
+      "step": 930
+    },
+    {
+      "epoch": 2.2664255575647982,
+      "grad_norm": 0.5536204411197307,
+      "learning_rate": 5e-06,
+      "loss": 0.5552,
+      "step": 940
+    },
+    {
+      "epoch": 2.2905364677516578,
+      "grad_norm": 0.7414422036930762,
+      "learning_rate": 5e-06,
+      "loss": 0.557,
+      "step": 950
+    },
+    {
+      "epoch": 2.3146473779385173,
+      "grad_norm": 0.6072913873182035,
+      "learning_rate": 5e-06,
+      "loss": 0.5573,
+      "step": 960
+    },
+    {
+      "epoch": 2.338758288125377,
+      "grad_norm": 0.5786725716853928,
+      "learning_rate": 5e-06,
+      "loss": 0.5577,
+      "step": 970
+    },
+    {
+      "epoch": 2.3628691983122363,
+      "grad_norm": 0.5958758621711483,
+      "learning_rate": 5e-06,
+      "loss": 0.5573,
+      "step": 980
+    },
+    {
+      "epoch": 2.386980108499096,
+      "grad_norm": 0.5427800525323759,
+      "learning_rate": 5e-06,
+      "loss": 0.5605,
+      "step": 990
+    },
+    {
+      "epoch": 2.4110910186859553,
+      "grad_norm": 0.5008520202035274,
+      "learning_rate": 5e-06,
+      "loss": 0.55,
+      "step": 1000
+    },
+    {
+      "epoch": 2.435201928872815,
+      "grad_norm": 0.5438627458062395,
+      "learning_rate": 5e-06,
+      "loss": 0.5591,
+      "step": 1010
+    },
+    {
+      "epoch": 2.4593128390596743,
+      "grad_norm": 0.523458598668171,
+      "learning_rate": 5e-06,
+      "loss": 0.5523,
+      "step": 1020
+    },
+    {
+      "epoch": 2.483423749246534,
+      "grad_norm": 0.562845339140823,
+      "learning_rate": 5e-06,
+      "loss": 0.5513,
+      "step": 1030
+    },
+    {
+      "epoch": 2.507534659433394,
+      "grad_norm": 0.71192454951128,
+      "learning_rate": 5e-06,
+      "loss": 0.5617,
+      "step": 1040
+    },
+    {
+      "epoch": 2.5316455696202533,
+      "grad_norm": 0.5488684911452221,
+      "learning_rate": 5e-06,
+      "loss": 0.5594,
+      "step": 1050
+    },
+    {
+      "epoch": 2.555756479807113,
+      "grad_norm": 0.6322721667592042,
+      "learning_rate": 5e-06,
+      "loss": 0.5603,
+      "step": 1060
+    },
+    {
+      "epoch": 2.5798673899939724,
+      "grad_norm": 0.5208011078844106,
+      "learning_rate": 5e-06,
+      "loss": 0.5564,
+      "step": 1070
+    },
+    {
+      "epoch": 2.603978300180832,
+      "grad_norm": 0.5150689754075237,
+      "learning_rate": 5e-06,
+      "loss": 0.5624,
+      "step": 1080
+    },
+    {
+      "epoch": 2.6280892103676914,
+      "grad_norm": 0.5338754237375813,
+      "learning_rate": 5e-06,
+      "loss": 0.5628,
+      "step": 1090
+    },
+    {
+      "epoch": 2.652200120554551,
+      "grad_norm": 0.5072044155960452,
+      "learning_rate": 5e-06,
+      "loss": 0.5606,
+      "step": 1100
+    },
+    {
+      "epoch": 2.6763110307414104,
+      "grad_norm": 0.7238515722776927,
+      "learning_rate": 5e-06,
+      "loss": 0.5557,
+      "step": 1110
+    },
+    {
+      "epoch": 2.70042194092827,
+      "grad_norm": 0.5147434745712806,
+      "learning_rate": 5e-06,
+      "loss": 0.553,
+      "step": 1120
+    },
+    {
+      "epoch": 2.7245328511151294,
+      "grad_norm": 0.5564967074947503,
+      "learning_rate": 5e-06,
+      "loss": 0.5635,
+      "step": 1130
+    },
+    {
+      "epoch": 2.748643761301989,
+      "grad_norm": 0.5501220049253929,
+      "learning_rate": 5e-06,
+      "loss": 0.5583,
+      "step": 1140
+    },
+    {
+      "epoch": 2.7727546714888485,
+      "grad_norm": 0.5103459117518057,
+      "learning_rate": 5e-06,
+      "loss": 0.5597,
+      "step": 1150
+    },
+    {
+      "epoch": 2.7968655816757084,
+      "grad_norm": 0.5479118611862815,
+      "learning_rate": 5e-06,
+      "loss": 0.5579,
+      "step": 1160
+    },
+    {
+      "epoch": 2.820976491862568,
+      "grad_norm": 0.5471001762934908,
+      "learning_rate": 5e-06,
+      "loss": 0.5591,
+      "step": 1170
+    },
+    {
+      "epoch": 2.8450874020494274,
+      "grad_norm": 0.6232136492982399,
+      "learning_rate": 5e-06,
+      "loss": 0.5606,
+      "step": 1180
+    },
+    {
+      "epoch": 2.869198312236287,
+      "grad_norm": 0.5669388319949817,
+      "learning_rate": 5e-06,
+      "loss": 0.5649,
+      "step": 1190
+    },
+    {
+      "epoch": 2.8933092224231465,
+      "grad_norm": 0.6969387028585086,
+      "learning_rate": 5e-06,
+      "loss": 0.5651,
+      "step": 1200
+    },
+    {
+      "epoch": 2.917420132610006,
+      "grad_norm": 0.6374387529410114,
+      "learning_rate": 5e-06,
+      "loss": 0.56,
+      "step": 1210
+    },
+    {
+      "epoch": 2.9415310427968655,
+      "grad_norm": 0.560816628841587,
+      "learning_rate": 5e-06,
+      "loss": 0.5594,
+      "step": 1220
+    },
+    {
+      "epoch": 2.965641952983725,
+      "grad_norm": 0.6033572013760955,
+      "learning_rate": 5e-06,
+      "loss": 0.5604,
+      "step": 1230
+    },
+    {
+      "epoch": 2.9897528631705845,
+      "grad_norm": 0.5557325437050415,
+      "learning_rate": 5e-06,
+      "loss": 0.5631,
+      "step": 1240
+    },
+    {
+      "epoch": 2.9945750452079567,
+      "eval_loss": 0.6422178745269775,
+      "eval_runtime": 225.1456,
+      "eval_samples_per_second": 49.639,
+      "eval_steps_per_second": 0.391,
+      "step": 1242
+    },
+    {
+      "epoch": 2.9945750452079567,
+      "step": 1242,
+      "total_flos": 2079977499525120.0,
+      "train_loss": 0.6134321775029439,
+      "train_runtime": 37323.5248,
+      "train_samples_per_second": 17.067,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1242,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2079977499525120.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed