End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +941 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: llama3-1_8b_oh_v3.1_wo_gpt4_llm
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # llama3-1_8b_oh_v3.1_wo_gpt4_llm
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6390

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: llama3-1_8b_oh_v3.1_wo_gpt4_llm
 # llama3-1_8b_oh_v3.1_wo_gpt4_llm
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/oh_v3.1_wo_gpt4_llm dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6390

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.6389971375465393,
+    "eval_runtime": 41.2263,
+    "eval_samples_per_second": 273.491,
+    "eval_steps_per_second": 1.092,
+    "total_flos": 2105521817518080.0,
+    "train_loss": 0.6169646524112189,
+    "train_runtime": 7810.9265,
+    "train_samples_per_second": 82.273,
+    "train_steps_per_second": 0.161
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.6389971375465393,
+    "eval_runtime": 41.2263,
+    "eval_samples_per_second": 273.491,
+    "eval_steps_per_second": 1.092
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2105521817518080.0,
+    "train_loss": 0.6169646524112189,
+    "train_runtime": 7810.9265,
+    "train_samples_per_second": 82.273,
+    "train_steps_per_second": 0.161
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,941 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1257,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02386634844868735,
+      "grad_norm": 2.8815478378979726,
+      "learning_rate": 5e-06,
+      "loss": 0.8847,
+      "step": 10
+    },
+    {
+      "epoch": 0.0477326968973747,
+      "grad_norm": 2.1111681660055823,
+      "learning_rate": 5e-06,
+      "loss": 0.7795,
+      "step": 20
+    },
+    {
+      "epoch": 0.07159904534606205,
+      "grad_norm": 0.7661989040642819,
+      "learning_rate": 5e-06,
+      "loss": 0.7503,
+      "step": 30
+    },
+    {
+      "epoch": 0.0954653937947494,
+      "grad_norm": 0.9394058763103804,
+      "learning_rate": 5e-06,
+      "loss": 0.7339,
+      "step": 40
+    },
+    {
+      "epoch": 0.11933174224343675,
+      "grad_norm": 0.8225402898620612,
+      "learning_rate": 5e-06,
+      "loss": 0.7135,
+      "step": 50
+    },
+    {
+      "epoch": 0.1431980906921241,
+      "grad_norm": 0.7799933213085141,
+      "learning_rate": 5e-06,
+      "loss": 0.7122,
+      "step": 60
+    },
+    {
+      "epoch": 0.16706443914081145,
+      "grad_norm": 0.5800160564456466,
+      "learning_rate": 5e-06,
+      "loss": 0.6973,
+      "step": 70
+    },
+    {
+      "epoch": 0.1909307875894988,
+      "grad_norm": 0.597063104491002,
+      "learning_rate": 5e-06,
+      "loss": 0.6903,
+      "step": 80
+    },
+    {
+      "epoch": 0.21479713603818615,
+      "grad_norm": 0.5997913835782436,
+      "learning_rate": 5e-06,
+      "loss": 0.6808,
+      "step": 90
+    },
+    {
+      "epoch": 0.2386634844868735,
+      "grad_norm": 0.9361461454403766,
+      "learning_rate": 5e-06,
+      "loss": 0.6882,
+      "step": 100
+    },
+    {
+      "epoch": 0.26252983293556087,
+      "grad_norm": 0.6783993996639943,
+      "learning_rate": 5e-06,
+      "loss": 0.6836,
+      "step": 110
+    },
+    {
+      "epoch": 0.2863961813842482,
+      "grad_norm": 0.5795802549448508,
+      "learning_rate": 5e-06,
+      "loss": 0.6806,
+      "step": 120
+    },
+    {
+      "epoch": 0.31026252983293556,
+      "grad_norm": 0.5386116555684645,
+      "learning_rate": 5e-06,
+      "loss": 0.6786,
+      "step": 130
+    },
+    {
+      "epoch": 0.3341288782816229,
+      "grad_norm": 1.1955667749232783,
+      "learning_rate": 5e-06,
+      "loss": 0.673,
+      "step": 140
+    },
+    {
+      "epoch": 0.35799522673031026,
+      "grad_norm": 0.963473662374355,
+      "learning_rate": 5e-06,
+      "loss": 0.6791,
+      "step": 150
+    },
+    {
+      "epoch": 0.3818615751789976,
+      "grad_norm": 0.5375818632492324,
+      "learning_rate": 5e-06,
+      "loss": 0.6715,
+      "step": 160
+    },
+    {
+      "epoch": 0.40572792362768495,
+      "grad_norm": 0.5122467752567826,
+      "learning_rate": 5e-06,
+      "loss": 0.6672,
+      "step": 170
+    },
+    {
+      "epoch": 0.4295942720763723,
+      "grad_norm": 0.5438018622021655,
+      "learning_rate": 5e-06,
+      "loss": 0.6696,
+      "step": 180
+    },
+    {
+      "epoch": 0.45346062052505964,
+      "grad_norm": 0.5443450875717797,
+      "learning_rate": 5e-06,
+      "loss": 0.6707,
+      "step": 190
+    },
+    {
+      "epoch": 0.477326968973747,
+      "grad_norm": 0.5591666330075007,
+      "learning_rate": 5e-06,
+      "loss": 0.6629,
+      "step": 200
+    },
+    {
+      "epoch": 0.5011933174224343,
+      "grad_norm": 0.7316386056094906,
+      "learning_rate": 5e-06,
+      "loss": 0.6601,
+      "step": 210
+    },
+    {
+      "epoch": 0.5250596658711217,
+      "grad_norm": 0.8169771743047101,
+      "learning_rate": 5e-06,
+      "loss": 0.666,
+      "step": 220
+    },
+    {
+      "epoch": 0.548926014319809,
+      "grad_norm": 0.5023436573258486,
+      "learning_rate": 5e-06,
+      "loss": 0.6655,
+      "step": 230
+    },
+    {
+      "epoch": 0.5727923627684964,
+      "grad_norm": 0.5715922888425466,
+      "learning_rate": 5e-06,
+      "loss": 0.6621,
+      "step": 240
+    },
+    {
+      "epoch": 0.5966587112171837,
+      "grad_norm": 0.5978492051245125,
+      "learning_rate": 5e-06,
+      "loss": 0.6606,
+      "step": 250
+    },
+    {
+      "epoch": 0.6205250596658711,
+      "grad_norm": 0.5562863722589444,
+      "learning_rate": 5e-06,
+      "loss": 0.6646,
+      "step": 260
+    },
+    {
+      "epoch": 0.6443914081145584,
+      "grad_norm": 0.5933691995834427,
+      "learning_rate": 5e-06,
+      "loss": 0.6583,
+      "step": 270
+    },
+    {
+      "epoch": 0.6682577565632458,
+      "grad_norm": 0.5981641076306046,
+      "learning_rate": 5e-06,
+      "loss": 0.6557,
+      "step": 280
+    },
+    {
+      "epoch": 0.6921241050119332,
+      "grad_norm": 0.8109530838139422,
+      "learning_rate": 5e-06,
+      "loss": 0.6582,
+      "step": 290
+    },
+    {
+      "epoch": 0.7159904534606205,
+      "grad_norm": 0.5965206875329182,
+      "learning_rate": 5e-06,
+      "loss": 0.6621,
+      "step": 300
+    },
+    {
+      "epoch": 0.7398568019093079,
+      "grad_norm": 0.486222749934066,
+      "learning_rate": 5e-06,
+      "loss": 0.6549,
+      "step": 310
+    },
+    {
+      "epoch": 0.7637231503579952,
+      "grad_norm": 0.5522832083975265,
+      "learning_rate": 5e-06,
+      "loss": 0.6496,
+      "step": 320
+    },
+    {
+      "epoch": 0.7875894988066826,
+      "grad_norm": 0.5396338578678825,
+      "learning_rate": 5e-06,
+      "loss": 0.6431,
+      "step": 330
+    },
+    {
+      "epoch": 0.8114558472553699,
+      "grad_norm": 0.509360104131435,
+      "learning_rate": 5e-06,
+      "loss": 0.6458,
+      "step": 340
+    },
+    {
+      "epoch": 0.8353221957040573,
+      "grad_norm": 0.5620324892726529,
+      "learning_rate": 5e-06,
+      "loss": 0.6564,
+      "step": 350
+    },
+    {
+      "epoch": 0.8591885441527446,
+      "grad_norm": 0.6119146933240237,
+      "learning_rate": 5e-06,
+      "loss": 0.6534,
+      "step": 360
+    },
+    {
+      "epoch": 0.883054892601432,
+      "grad_norm": 0.5839021797409776,
+      "learning_rate": 5e-06,
+      "loss": 0.6565,
+      "step": 370
+    },
+    {
+      "epoch": 0.9069212410501193,
+      "grad_norm": 0.43401159052073285,
+      "learning_rate": 5e-06,
+      "loss": 0.6535,
+      "step": 380
+    },
+    {
+      "epoch": 0.9307875894988067,
+      "grad_norm": 0.5668890229094246,
+      "learning_rate": 5e-06,
+      "loss": 0.6462,
+      "step": 390
+    },
+    {
+      "epoch": 0.954653937947494,
+      "grad_norm": 0.5782226624956547,
+      "learning_rate": 5e-06,
+      "loss": 0.6546,
+      "step": 400
+    },
+    {
+      "epoch": 0.9785202863961814,
+      "grad_norm": 0.6579659643852935,
+      "learning_rate": 5e-06,
+      "loss": 0.641,
+      "step": 410
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.6449207663536072,
+      "eval_runtime": 41.1778,
+      "eval_samples_per_second": 273.813,
+      "eval_steps_per_second": 1.093,
+      "step": 419
+    },
+    {
+      "epoch": 1.0023866348448687,
+      "grad_norm": 0.7827774952972171,
+      "learning_rate": 5e-06,
+      "loss": 0.6431,
+      "step": 420
+    },
+    {
+      "epoch": 1.026252983293556,
+      "grad_norm": 0.6329747383963555,
+      "learning_rate": 5e-06,
+      "loss": 0.6134,
+      "step": 430
+    },
+    {
+      "epoch": 1.0501193317422435,
+      "grad_norm": 0.5281801985221292,
+      "learning_rate": 5e-06,
+      "loss": 0.6083,
+      "step": 440
+    },
+    {
+      "epoch": 1.0739856801909309,
+      "grad_norm": 0.7577758408944637,
+      "learning_rate": 5e-06,
+      "loss": 0.6061,
+      "step": 450
+    },
+    {
+      "epoch": 1.097852028639618,
+      "grad_norm": 0.5351677292156073,
+      "learning_rate": 5e-06,
+      "loss": 0.6031,
+      "step": 460
+    },
+    {
+      "epoch": 1.1217183770883055,
+      "grad_norm": 0.5015989972497082,
+      "learning_rate": 5e-06,
+      "loss": 0.6059,
+      "step": 470
+    },
+    {
+      "epoch": 1.1455847255369929,
+      "grad_norm": 0.5967761959033508,
+      "learning_rate": 5e-06,
+      "loss": 0.6144,
+      "step": 480
+    },
+    {
+      "epoch": 1.1694510739856803,
+      "grad_norm": 0.6199769517789647,
+      "learning_rate": 5e-06,
+      "loss": 0.6089,
+      "step": 490
+    },
+    {
+      "epoch": 1.1933174224343674,
+      "grad_norm": 0.5989234321280023,
+      "learning_rate": 5e-06,
+      "loss": 0.6079,
+      "step": 500
+    },
+    {
+      "epoch": 1.2171837708830548,
+      "grad_norm": 0.48214122296698664,
+      "learning_rate": 5e-06,
+      "loss": 0.6106,
+      "step": 510
+    },
+    {
+      "epoch": 1.2410501193317423,
+      "grad_norm": 0.500906885639557,
+      "learning_rate": 5e-06,
+      "loss": 0.6114,
+      "step": 520
+    },
+    {
+      "epoch": 1.2649164677804297,
+      "grad_norm": 0.5055182485221988,
+      "learning_rate": 5e-06,
+      "loss": 0.6073,
+      "step": 530
+    },
+    {
+      "epoch": 1.288782816229117,
+      "grad_norm": 0.5890740590556416,
+      "learning_rate": 5e-06,
+      "loss": 0.6074,
+      "step": 540
+    },
+    {
+      "epoch": 1.3126491646778042,
+      "grad_norm": 0.48236044063151085,
+      "learning_rate": 5e-06,
+      "loss": 0.6128,
+      "step": 550
+    },
+    {
+      "epoch": 1.3365155131264916,
+      "grad_norm": 0.5202514925506149,
+      "learning_rate": 5e-06,
+      "loss": 0.6058,
+      "step": 560
+    },
+    {
+      "epoch": 1.360381861575179,
+      "grad_norm": 0.5228316664959745,
+      "learning_rate": 5e-06,
+      "loss": 0.6091,
+      "step": 570
+    },
+    {
+      "epoch": 1.3842482100238662,
+      "grad_norm": 0.51530770994292,
+      "learning_rate": 5e-06,
+      "loss": 0.6087,
+      "step": 580
+    },
+    {
+      "epoch": 1.4081145584725536,
+      "grad_norm": 0.6386559379894787,
+      "learning_rate": 5e-06,
+      "loss": 0.6116,
+      "step": 590
+    },
+    {
+      "epoch": 1.431980906921241,
+      "grad_norm": 0.4779744600855222,
+      "learning_rate": 5e-06,
+      "loss": 0.6033,
+      "step": 600
+    },
+    {
+      "epoch": 1.4558472553699284,
+      "grad_norm": 0.4819600928038827,
+      "learning_rate": 5e-06,
+      "loss": 0.6077,
+      "step": 610
+    },
+    {
+      "epoch": 1.4797136038186158,
+      "grad_norm": 0.45917275315096606,
+      "learning_rate": 5e-06,
+      "loss": 0.6094,
+      "step": 620
+    },
+    {
+      "epoch": 1.503579952267303,
+      "grad_norm": 0.5010113270578477,
+      "learning_rate": 5e-06,
+      "loss": 0.6055,
+      "step": 630
+    },
+    {
+      "epoch": 1.5274463007159904,
+      "grad_norm": 0.7579117243752399,
+      "learning_rate": 5e-06,
+      "loss": 0.6159,
+      "step": 640
+    },
+    {
+      "epoch": 1.5513126491646778,
+      "grad_norm": 0.6420792800924288,
+      "learning_rate": 5e-06,
+      "loss": 0.6058,
+      "step": 650
+    },
+    {
+      "epoch": 1.575178997613365,
+      "grad_norm": 0.55940882824889,
+      "learning_rate": 5e-06,
+      "loss": 0.5961,
+      "step": 660
+    },
+    {
+      "epoch": 1.5990453460620526,
+      "grad_norm": 0.4983792149426302,
+      "learning_rate": 5e-06,
+      "loss": 0.6083,
+      "step": 670
+    },
+    {
+      "epoch": 1.6229116945107398,
+      "grad_norm": 0.5263299595036224,
+      "learning_rate": 5e-06,
+      "loss": 0.6053,
+      "step": 680
+    },
+    {
+      "epoch": 1.6467780429594272,
+      "grad_norm": 0.5149484039474402,
+      "learning_rate": 5e-06,
+      "loss": 0.6079,
+      "step": 690
+    },
+    {
+      "epoch": 1.6706443914081146,
+      "grad_norm": 0.4685802940879146,
+      "learning_rate": 5e-06,
+      "loss": 0.6041,
+      "step": 700
+    },
+    {
+      "epoch": 1.6945107398568018,
+      "grad_norm": 0.4639317334767733,
+      "learning_rate": 5e-06,
+      "loss": 0.6012,
+      "step": 710
+    },
+    {
+      "epoch": 1.7183770883054894,
+      "grad_norm": 0.48774835965978913,
+      "learning_rate": 5e-06,
+      "loss": 0.6117,
+      "step": 720
+    },
+    {
+      "epoch": 1.7422434367541766,
+      "grad_norm": 0.4895883070209168,
+      "learning_rate": 5e-06,
+      "loss": 0.6066,
+      "step": 730
+    },
+    {
+      "epoch": 1.766109785202864,
+      "grad_norm": 0.45180400146140737,
+      "learning_rate": 5e-06,
+      "loss": 0.6031,
+      "step": 740
+    },
+    {
+      "epoch": 1.7899761336515514,
+      "grad_norm": 0.5197512123710193,
+      "learning_rate": 5e-06,
+      "loss": 0.6018,
+      "step": 750
+    },
+    {
+      "epoch": 1.8138424821002386,
+      "grad_norm": 0.45816712660411146,
+      "learning_rate": 5e-06,
+      "loss": 0.6024,
+      "step": 760
+    },
+    {
+      "epoch": 1.837708830548926,
+      "grad_norm": 0.45318755220959944,
+      "learning_rate": 5e-06,
+      "loss": 0.6101,
+      "step": 770
+    },
+    {
+      "epoch": 1.8615751789976134,
+      "grad_norm": 0.48227944198410183,
+      "learning_rate": 5e-06,
+      "loss": 0.6084,
+      "step": 780
+    },
+    {
+      "epoch": 1.8854415274463006,
+      "grad_norm": 0.5498211459609608,
+      "learning_rate": 5e-06,
+      "loss": 0.6014,
+      "step": 790
+    },
+    {
+      "epoch": 1.9093078758949882,
+      "grad_norm": 0.5272792947894827,
+      "learning_rate": 5e-06,
+      "loss": 0.6035,
+      "step": 800
+    },
+    {
+      "epoch": 1.9331742243436754,
+      "grad_norm": 0.4520775749202011,
+      "learning_rate": 5e-06,
+      "loss": 0.6023,
+      "step": 810
+    },
+    {
+      "epoch": 1.9570405727923628,
+      "grad_norm": 0.48624457536560756,
+      "learning_rate": 5e-06,
+      "loss": 0.6022,
+      "step": 820
+    },
+    {
+      "epoch": 1.9809069212410502,
+      "grad_norm": 0.5488566572359053,
+      "learning_rate": 5e-06,
+      "loss": 0.6068,
+      "step": 830
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.6355295777320862,
+      "eval_runtime": 40.6265,
+      "eval_samples_per_second": 277.529,
+      "eval_steps_per_second": 1.108,
+      "step": 838
+    },
+    {
+      "epoch": 2.0047732696897373,
+      "grad_norm": 0.6658281603460572,
+      "learning_rate": 5e-06,
+      "loss": 0.5955,
+      "step": 840
+    },
+    {
+      "epoch": 2.028639618138425,
+      "grad_norm": 0.7098614483793992,
+      "learning_rate": 5e-06,
+      "loss": 0.5701,
+      "step": 850
+    },
+    {
+      "epoch": 2.052505966587112,
+      "grad_norm": 0.6533616500153973,
+      "learning_rate": 5e-06,
+      "loss": 0.5634,
+      "step": 860
+    },
+    {
+      "epoch": 2.0763723150357993,
+      "grad_norm": 0.6126417325021997,
+      "learning_rate": 5e-06,
+      "loss": 0.5639,
+      "step": 870
+    },
+    {
+      "epoch": 2.100238663484487,
+      "grad_norm": 0.44612672152357774,
+      "learning_rate": 5e-06,
+      "loss": 0.5671,
+      "step": 880
+    },
+    {
+      "epoch": 2.124105011933174,
+      "grad_norm": 0.5379973514690706,
+      "learning_rate": 5e-06,
+      "loss": 0.562,
+      "step": 890
+    },
+    {
+      "epoch": 2.1479713603818618,
+      "grad_norm": 0.442404391317877,
+      "learning_rate": 5e-06,
+      "loss": 0.5728,
+      "step": 900
+    },
+    {
+      "epoch": 2.171837708830549,
+      "grad_norm": 0.5754078775127957,
+      "learning_rate": 5e-06,
+      "loss": 0.5613,
+      "step": 910
+    },
+    {
+      "epoch": 2.195704057279236,
+      "grad_norm": 0.6586173018673331,
+      "learning_rate": 5e-06,
+      "loss": 0.5654,
+      "step": 920
+    },
+    {
+      "epoch": 2.2195704057279237,
+      "grad_norm": 0.5439510862576353,
+      "learning_rate": 5e-06,
+      "loss": 0.5614,
+      "step": 930
+    },
+    {
+      "epoch": 2.243436754176611,
+      "grad_norm": 0.5740630674331443,
+      "learning_rate": 5e-06,
+      "loss": 0.565,
+      "step": 940
+    },
+    {
+      "epoch": 2.2673031026252985,
+      "grad_norm": 0.48777976586303107,
+      "learning_rate": 5e-06,
+      "loss": 0.561,
+      "step": 950
+    },
+    {
+      "epoch": 2.2911694510739857,
+      "grad_norm": 0.4461443532364416,
+      "learning_rate": 5e-06,
+      "loss": 0.5582,
+      "step": 960
+    },
+    {
+      "epoch": 2.315035799522673,
+      "grad_norm": 0.487505257972905,
+      "learning_rate": 5e-06,
+      "loss": 0.5692,
+      "step": 970
+    },
+    {
+      "epoch": 2.3389021479713605,
+      "grad_norm": 0.48305958679617367,
+      "learning_rate": 5e-06,
+      "loss": 0.5694,
+      "step": 980
+    },
+    {
+      "epoch": 2.3627684964200477,
+      "grad_norm": 0.503359166301003,
+      "learning_rate": 5e-06,
+      "loss": 0.5609,
+      "step": 990
+    },
+    {
+      "epoch": 2.386634844868735,
+      "grad_norm": 0.5256679194745039,
+      "learning_rate": 5e-06,
+      "loss": 0.5632,
+      "step": 1000
+    },
+    {
+      "epoch": 2.4105011933174225,
+      "grad_norm": 0.5732669619119689,
+      "learning_rate": 5e-06,
+      "loss": 0.5666,
+      "step": 1010
+    },
+    {
+      "epoch": 2.4343675417661097,
+      "grad_norm": 0.4506151186524362,
+      "learning_rate": 5e-06,
+      "loss": 0.5762,
+      "step": 1020
+    },
+    {
+      "epoch": 2.4582338902147973,
+      "grad_norm": 0.4746241123968773,
+      "learning_rate": 5e-06,
+      "loss": 0.5652,
+      "step": 1030
+    },
+    {
+      "epoch": 2.4821002386634845,
+      "grad_norm": 0.46349902513638275,
+      "learning_rate": 5e-06,
+      "loss": 0.5644,
+      "step": 1040
+    },
+    {
+      "epoch": 2.5059665871121717,
+      "grad_norm": 0.45976906873116374,
+      "learning_rate": 5e-06,
+      "loss": 0.5686,
+      "step": 1050
+    },
+    {
+      "epoch": 2.5298329355608593,
+      "grad_norm": 0.4662220401853357,
+      "learning_rate": 5e-06,
+      "loss": 0.5686,
+      "step": 1060
+    },
+    {
+      "epoch": 2.5536992840095465,
+      "grad_norm": 0.4951436398512421,
+      "learning_rate": 5e-06,
+      "loss": 0.5702,
+      "step": 1070
+    },
+    {
+      "epoch": 2.577565632458234,
+      "grad_norm": 0.4502698747379483,
+      "learning_rate": 5e-06,
+      "loss": 0.5719,
+      "step": 1080
+    },
+    {
+      "epoch": 2.6014319809069213,
+      "grad_norm": 0.5079705723721918,
+      "learning_rate": 5e-06,
+      "loss": 0.5671,
+      "step": 1090
+    },
+    {
+      "epoch": 2.6252983293556085,
+      "grad_norm": 0.5140346872907439,
+      "learning_rate": 5e-06,
+      "loss": 0.5671,
+      "step": 1100
+    },
+    {
+      "epoch": 2.649164677804296,
+      "grad_norm": 0.5358331120197253,
+      "learning_rate": 5e-06,
+      "loss": 0.5633,
+      "step": 1110
+    },
+    {
+      "epoch": 2.6730310262529833,
+      "grad_norm": 0.4907309631164768,
+      "learning_rate": 5e-06,
+      "loss": 0.5644,
+      "step": 1120
+    },
+    {
+      "epoch": 2.6968973747016705,
+      "grad_norm": 0.5049970936550133,
+      "learning_rate": 5e-06,
+      "loss": 0.5692,
+      "step": 1130
+    },
+    {
+      "epoch": 2.720763723150358,
+      "grad_norm": 0.5553567091902175,
+      "learning_rate": 5e-06,
+      "loss": 0.5658,
+      "step": 1140
+    },
+    {
+      "epoch": 2.7446300715990453,
+      "grad_norm": 0.49261752142359677,
+      "learning_rate": 5e-06,
+      "loss": 0.5742,
+      "step": 1150
+    },
+    {
+      "epoch": 2.7684964200477324,
+      "grad_norm": 0.5018759977401656,
+      "learning_rate": 5e-06,
+      "loss": 0.5645,
+      "step": 1160
+    },
+    {
+      "epoch": 2.79236276849642,
+      "grad_norm": 0.45826106825699625,
+      "learning_rate": 5e-06,
+      "loss": 0.5641,
+      "step": 1170
+    },
+    {
+      "epoch": 2.8162291169451072,
+      "grad_norm": 0.5072976091618316,
+      "learning_rate": 5e-06,
+      "loss": 0.5676,
+      "step": 1180
+    },
+    {
+      "epoch": 2.840095465393795,
+      "grad_norm": 0.651235919618626,
+      "learning_rate": 5e-06,
+      "loss": 0.5681,
+      "step": 1190
+    },
+    {
+      "epoch": 2.863961813842482,
+      "grad_norm": 0.5113781250941779,
+      "learning_rate": 5e-06,
+      "loss": 0.5669,
+      "step": 1200
+    },
+    {
+      "epoch": 2.8878281622911697,
+      "grad_norm": 0.4949601222660754,
+      "learning_rate": 5e-06,
+      "loss": 0.5581,
+      "step": 1210
+    },
+    {
+      "epoch": 2.911694510739857,
+      "grad_norm": 0.5748654481884351,
+      "learning_rate": 5e-06,
+      "loss": 0.5678,
+      "step": 1220
+    },
+    {
+      "epoch": 2.935560859188544,
+      "grad_norm": 0.5793617800123868,
+      "learning_rate": 5e-06,
+      "loss": 0.5649,
+      "step": 1230
+    },
+    {
+      "epoch": 2.9594272076372317,
+      "grad_norm": 0.4514242707013011,
+      "learning_rate": 5e-06,
+      "loss": 0.5652,
+      "step": 1240
+    },
+    {
+      "epoch": 2.983293556085919,
+      "grad_norm": 0.4744906861925991,
+      "learning_rate": 5e-06,
+      "loss": 0.5638,
+      "step": 1250
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.6389971375465393,
+      "eval_runtime": 41.4704,
+      "eval_samples_per_second": 271.881,
+      "eval_steps_per_second": 1.085,
+      "step": 1257
+    },
+    {
+      "epoch": 3.0,
+      "step": 1257,
+      "total_flos": 2105521817518080.0,
+      "train_loss": 0.6169646524112189,
+      "train_runtime": 7810.9265,
+      "train_samples_per_second": 82.273,
+      "train_steps_per_second": 0.161
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1257,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2105521817518080.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed