Training in progress, epoch 0

Browse files

Files changed (4) hide show

adapter_config.json +4 -2
adapter_model.safetensors +2 -2
trainer_state.json +639 -0
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -20,10 +20,12 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
     "v_proj",
     "q_proj",
-    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "v_proj",
+    "fc1",
     "q_proj",
+    "k_proj",
+    "fc2",
+    "dense"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a1296f13c9184dd546caac3586253bee4c8499493047b205f9850ff841c4adff
-size 31483040

 version https://git-lfs.github.com/spec/v1
+oid sha256:4b1b4c2bf412d25e81a1e930e6646e340fdb9da06243a818484153349e9339cb
+size 94422368

trainer_state.json ADDED Viewed

	@@ -0,0 +1,639 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 9.914529914529915,
+  "eval_steps": 500,
+  "global_step": 870,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.11396011396011396,
+      "grad_norm": 0.158817857503891,
+      "learning_rate": 0.0001999348095389677,
+      "loss": 0.9924,
+      "step": 10
+    },
+    {
+      "epoch": 0.22792022792022792,
+      "grad_norm": 0.21280939877033234,
+      "learning_rate": 0.000199739323151795,
+      "loss": 0.819,
+      "step": 20
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 0.22974510490894318,
+      "learning_rate": 0.00019941379571543596,
+      "loss": 0.767,
+      "step": 30
+    },
+    {
+      "epoch": 0.45584045584045585,
+      "grad_norm": 0.20720455050468445,
+      "learning_rate": 0.00019895865165556377,
+      "loss": 0.6948,
+      "step": 40
+    },
+    {
+      "epoch": 0.5698005698005698,
+      "grad_norm": 0.1902514398097992,
+      "learning_rate": 0.00019837448439320027,
+      "loss": 0.6509,
+      "step": 50
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 0.18335820734500885,
+      "learning_rate": 0.00019766205557100868,
+      "loss": 0.6344,
+      "step": 60
+    },
+    {
+      "epoch": 0.7977207977207977,
+      "grad_norm": 0.17900069057941437,
+      "learning_rate": 0.00019682229406025635,
+      "loss": 0.6447,
+      "step": 70
+    },
+    {
+      "epoch": 0.9116809116809117,
+      "grad_norm": 0.16915330290794373,
+      "learning_rate": 0.00019585629474974415,
+      "loss": 0.6335,
+      "step": 80
+    },
+    {
+      "epoch": 1.0256410256410255,
+      "grad_norm": 0.16036000847816467,
+      "learning_rate": 0.00019476531711828027,
+      "loss": 0.634,
+      "step": 90
+    },
+    {
+      "epoch": 1.1396011396011396,
+      "grad_norm": 0.16852639615535736,
+      "learning_rate": 0.0001935507835925601,
+      "loss": 0.6058,
+      "step": 100
+    },
+    {
+      "epoch": 1.2535612535612537,
+      "grad_norm": 0.15856905281543732,
+      "learning_rate": 0.00019221427769259333,
+      "loss": 0.5902,
+      "step": 110
+    },
+    {
+      "epoch": 1.3675213675213675,
+      "grad_norm": 0.16909192502498627,
+      "learning_rate": 0.00019075754196709572,
+      "loss": 0.6051,
+      "step": 120
+    },
+    {
+      "epoch": 1.4814814814814814,
+      "grad_norm": 0.1899166703224182,
+      "learning_rate": 0.00018918247572153823,
+      "loss": 0.6098,
+      "step": 130
+    },
+    {
+      "epoch": 1.5954415954415955,
+      "grad_norm": 0.17596793174743652,
+      "learning_rate": 0.00018749113254181498,
+      "loss": 0.597,
+      "step": 140
+    },
+    {
+      "epoch": 1.7094017094017095,
+      "grad_norm": 0.16560517251491547,
+      "learning_rate": 0.00018568571761675893,
+      "loss": 0.5899,
+      "step": 150
+    },
+    {
+      "epoch": 1.8233618233618234,
+      "grad_norm": 0.16513986885547638,
+      "learning_rate": 0.00018376858486299647,
+      "loss": 0.5989,
+      "step": 160
+    },
+    {
+      "epoch": 1.9373219373219372,
+      "grad_norm": 0.20360782742500305,
+      "learning_rate": 0.00018174223385588917,
+      "loss": 0.5982,
+      "step": 170
+    },
+    {
+      "epoch": 2.051282051282051,
+      "grad_norm": 0.16155321896076202,
+      "learning_rate": 0.00017960930657056438,
+      "loss": 0.593,
+      "step": 180
+    },
+    {
+      "epoch": 2.1652421652421654,
+      "grad_norm": 0.1811763048171997,
+      "learning_rate": 0.00017737258393728364,
+      "loss": 0.6077,
+      "step": 190
+    },
+    {
+      "epoch": 2.2792022792022792,
+      "grad_norm": 0.16952063143253326,
+      "learning_rate": 0.00017503498221564025,
+      "loss": 0.5749,
+      "step": 200
+    },
+    {
+      "epoch": 2.393162393162393,
+      "grad_norm": 0.17240603268146515,
+      "learning_rate": 0.0001725995491923131,
+      "loss": 0.5592,
+      "step": 210
+    },
+    {
+      "epoch": 2.5071225071225074,
+      "grad_norm": 0.1657334417104721,
+      "learning_rate": 0.00017006946020733425,
+      "loss": 0.5779,
+      "step": 220
+    },
+    {
+      "epoch": 2.6210826210826212,
+      "grad_norm": 0.16417497396469116,
+      "learning_rate": 0.0001674480140140514,
+      "loss": 0.5675,
+      "step": 230
+    },
+    {
+      "epoch": 2.735042735042735,
+      "grad_norm": 0.174308180809021,
+      "learning_rate": 0.00016473862847818277,
+      "loss": 0.5977,
+      "step": 240
+    },
+    {
+      "epoch": 2.849002849002849,
+      "grad_norm": 0.17116901278495789,
+      "learning_rate": 0.0001619448361215723,
+      "loss": 0.5582,
+      "step": 250
+    },
+    {
+      "epoch": 2.962962962962963,
+      "grad_norm": 0.16816489398479462,
+      "learning_rate": 0.0001590702795164551,
+      "loss": 0.5813,
+      "step": 260
+    },
+    {
+      "epoch": 3.076923076923077,
+      "grad_norm": 0.17530137300491333,
+      "learning_rate": 0.00015611870653623825,
+      "loss": 0.559,
+      "step": 270
+    },
+    {
+      "epoch": 3.190883190883191,
+      "grad_norm": 0.1744232326745987,
+      "learning_rate": 0.0001530939654689887,
+      "loss": 0.5668,
+      "step": 280
+    },
+    {
+      "epoch": 3.304843304843305,
+      "grad_norm": 0.1809006929397583,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.5754,
+      "step": 290
+    },
+    {
+      "epoch": 3.4188034188034186,
+      "grad_norm": 0.16484159231185913,
+      "learning_rate": 0.00014684084406997903,
+      "loss": 0.5731,
+      "step": 300
+    },
+    {
+      "epoch": 3.532763532763533,
+      "grad_norm": 0.19075918197631836,
+      "learning_rate": 0.00014362061661555675,
+      "loss": 0.5496,
+      "step": 310
+    },
+    {
+      "epoch": 3.646723646723647,
+      "grad_norm": 0.18451079726219177,
+      "learning_rate": 0.00014034351619898088,
+      "loss": 0.5463,
+      "step": 320
+    },
+    {
+      "epoch": 3.7606837606837606,
+      "grad_norm": 0.18566997349262238,
+      "learning_rate": 0.00013701381553399145,
+      "loss": 0.5768,
+      "step": 330
+    },
+    {
+      "epoch": 3.8746438746438745,
+      "grad_norm": 0.1669853925704956,
+      "learning_rate": 0.0001336358559150175,
+      "loss": 0.5606,
+      "step": 340
+    },
+    {
+      "epoch": 3.9886039886039883,
+      "grad_norm": 0.17847082018852234,
+      "learning_rate": 0.00013021404155695725,
+      "loss": 0.5756,
+      "step": 350
+    },
+    {
+      "epoch": 4.102564102564102,
+      "grad_norm": 0.16660483181476593,
+      "learning_rate": 0.00012675283385292212,
+      "loss": 0.5585,
+      "step": 360
+    },
+    {
+      "epoch": 4.216524216524217,
+      "grad_norm": 0.17163340747356415,
+      "learning_rate": 0.00012325674555743106,
+      "loss": 0.5434,
+      "step": 370
+    },
+    {
+      "epoch": 4.330484330484331,
+      "grad_norm": 0.16264410316944122,
+      "learning_rate": 0.00011973033490264001,
+      "loss": 0.5449,
+      "step": 380
+    },
+    {
+      "epoch": 4.444444444444445,
+      "grad_norm": 0.17614829540252686,
+      "learning_rate": 0.0001161781996552765,
+      "loss": 0.5574,
+      "step": 390
+    },
+    {
+      "epoch": 4.5584045584045585,
+      "grad_norm": 0.19437584280967712,
+      "learning_rate": 0.00011260497112202895,
+      "loss": 0.5448,
+      "step": 400
+    },
+    {
+      "epoch": 4.672364672364672,
+      "grad_norm": 0.19045701622962952,
+      "learning_rate": 0.00010901530811120655,
+      "loss": 0.5474,
+      "step": 410
+    },
+    {
+      "epoch": 4.786324786324786,
+      "grad_norm": 0.21330882608890533,
+      "learning_rate": 0.00010541389085854176,
+      "loss": 0.5552,
+      "step": 420
+    },
+    {
+      "epoch": 4.9002849002849,
+      "grad_norm": 0.17429402470588684,
+      "learning_rate": 0.00010180541492505604,
+      "loss": 0.5495,
+      "step": 430
+    },
+    {
+      "epoch": 5.014245014245014,
+      "grad_norm": 0.17785826325416565,
+      "learning_rate": 9.819458507494394e-05,
+      "loss": 0.5583,
+      "step": 440
+    },
+    {
+      "epoch": 5.128205128205128,
+      "grad_norm": 0.19076977670192719,
+      "learning_rate": 9.458610914145826e-05,
+      "loss": 0.5291,
+      "step": 450
+    },
+    {
+      "epoch": 5.2421652421652425,
+      "grad_norm": 0.19988471269607544,
+      "learning_rate": 9.098469188879349e-05,
+      "loss": 0.5311,
+      "step": 460
+    },
+    {
+      "epoch": 5.356125356125356,
+      "grad_norm": 0.19638335704803467,
+      "learning_rate": 8.739502887797107e-05,
+      "loss": 0.5684,
+      "step": 470
+    },
+    {
+      "epoch": 5.47008547008547,
+      "grad_norm": 0.2043437659740448,
+      "learning_rate": 8.382180034472353e-05,
+      "loss": 0.5371,
+      "step": 480
+    },
+    {
+      "epoch": 5.584045584045584,
+      "grad_norm": 0.2045976221561432,
+      "learning_rate": 8.026966509736001e-05,
+      "loss": 0.5307,
+      "step": 490
+    },
+    {
+      "epoch": 5.698005698005698,
+      "grad_norm": 0.21237310767173767,
+      "learning_rate": 7.674325444256899e-05,
+      "loss": 0.5483,
+      "step": 500
+    },
+    {
+      "epoch": 5.811965811965812,
+      "grad_norm": 0.22306476533412933,
+      "learning_rate": 7.324716614707793e-05,
+      "loss": 0.5572,
+      "step": 510
+    },
+    {
+      "epoch": 5.925925925925926,
+      "grad_norm": 0.20065273344516754,
+      "learning_rate": 6.978595844304271e-05,
+      "loss": 0.5363,
+      "step": 520
+    },
+    {
+      "epoch": 6.0398860398860394,
+      "grad_norm": 0.21213628351688385,
+      "learning_rate": 6.636414408498249e-05,
+      "loss": 0.521,
+      "step": 530
+    },
+    {
+      "epoch": 6.153846153846154,
+      "grad_norm": 0.1936779022216797,
+      "learning_rate": 6.298618446600856e-05,
+      "loss": 0.5283,
+      "step": 540
+    },
+    {
+      "epoch": 6.267806267806268,
+      "grad_norm": 0.19564631581306458,
+      "learning_rate": 5.965648380101916e-05,
+      "loss": 0.5301,
+      "step": 550
+    },
+    {
+      "epoch": 6.381766381766382,
+      "grad_norm": 0.20069913566112518,
+      "learning_rate": 5.6379383384443255e-05,
+      "loss": 0.5204,
+      "step": 560
+    },
+    {
+      "epoch": 6.495726495726496,
+      "grad_norm": 0.21325626969337463,
+      "learning_rate": 5.3159155930021e-05,
+      "loss": 0.5419,
+      "step": 570
+    },
+    {
+      "epoch": 6.60968660968661,
+      "grad_norm": 0.21303197741508484,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.543,
+      "step": 580
+    },
+    {
+      "epoch": 6.7236467236467234,
+      "grad_norm": 0.21136346459388733,
+      "learning_rate": 4.6906034531011346e-05,
+      "loss": 0.5217,
+      "step": 590
+    },
+    {
+      "epoch": 6.837606837606837,
+      "grad_norm": 0.21392931044101715,
+      "learning_rate": 4.388129346376178e-05,
+      "loss": 0.5288,
+      "step": 600
+    },
+    {
+      "epoch": 6.951566951566951,
+      "grad_norm": 0.22880437970161438,
+      "learning_rate": 4.092972048354491e-05,
+      "loss": 0.5273,
+      "step": 610
+    },
+    {
+      "epoch": 7.065527065527066,
+      "grad_norm": 0.21491903066635132,
+      "learning_rate": 3.80551638784277e-05,
+      "loss": 0.5332,
+      "step": 620
+    },
+    {
+      "epoch": 7.17948717948718,
+      "grad_norm": 0.26633119583129883,
+      "learning_rate": 3.5261371521817244e-05,
+      "loss": 0.5239,
+      "step": 630
+    },
+    {
+      "epoch": 7.293447293447294,
+      "grad_norm": 0.23685385286808014,
+      "learning_rate": 3.2551985985948616e-05,
+      "loss": 0.5309,
+      "step": 640
+    },
+    {
+      "epoch": 7.407407407407407,
+      "grad_norm": 0.22292840480804443,
+      "learning_rate": 2.993053979266577e-05,
+      "loss": 0.5372,
+      "step": 650
+    },
+    {
+      "epoch": 7.521367521367521,
+      "grad_norm": 0.2220107614994049,
+      "learning_rate": 2.7400450807686938e-05,
+      "loss": 0.5083,
+      "step": 660
+    },
+    {
+      "epoch": 7.635327635327635,
+      "grad_norm": 0.2191537618637085,
+      "learning_rate": 2.496501778435977e-05,
+      "loss": 0.5164,
+      "step": 670
+    },
+    {
+      "epoch": 7.749287749287749,
+      "grad_norm": 0.22593119740486145,
+      "learning_rate": 2.2627416062716366e-05,
+      "loss": 0.5152,
+      "step": 680
+    },
+    {
+      "epoch": 7.863247863247864,
+      "grad_norm": 0.23532789945602417,
+      "learning_rate": 2.0390693429435627e-05,
+      "loss": 0.5269,
+      "step": 690
+    },
+    {
+      "epoch": 7.977207977207978,
+      "grad_norm": 0.25111591815948486,
+      "learning_rate": 1.825776614411082e-05,
+      "loss": 0.5335,
+      "step": 700
+    },
+    {
+      "epoch": 8.091168091168091,
+      "grad_norm": 0.21956747770309448,
+      "learning_rate": 1.6231415137003537e-05,
+      "loss": 0.5144,
+      "step": 710
+    },
+    {
+      "epoch": 8.205128205128204,
+      "grad_norm": 0.23355403542518616,
+      "learning_rate": 1.4314282383241096e-05,
+      "loss": 0.5294,
+      "step": 720
+    },
+    {
+      "epoch": 8.31908831908832,
+      "grad_norm": 0.23712006211280823,
+      "learning_rate": 1.2508867458185037e-05,
+      "loss": 0.5229,
+      "step": 730
+    },
+    {
+      "epoch": 8.433048433048434,
+      "grad_norm": 0.22506175935268402,
+      "learning_rate": 1.0817524278461776e-05,
+      "loss": 0.5212,
+      "step": 740
+    },
+    {
+      "epoch": 8.547008547008547,
+      "grad_norm": 0.21853385865688324,
+      "learning_rate": 9.242458032904311e-06,
+      "loss": 0.5193,
+      "step": 750
+    },
+    {
+      "epoch": 8.660968660968662,
+      "grad_norm": 0.23257511854171753,
+      "learning_rate": 7.785722307406684e-06,
+      "loss": 0.5039,
+      "step": 760
+    },
+    {
+      "epoch": 8.774928774928775,
+      "grad_norm": 0.21563945710659027,
+      "learning_rate": 6.4492164074399065e-06,
+      "loss": 0.5232,
+      "step": 770
+    },
+    {
+      "epoch": 8.88888888888889,
+      "grad_norm": 0.22108329832553864,
+      "learning_rate": 5.2346828817197655e-06,
+      "loss": 0.5309,
+      "step": 780
+    },
+    {
+      "epoch": 9.002849002849002,
+      "grad_norm": 0.22330021858215332,
+      "learning_rate": 4.143705250255869e-06,
+      "loss": 0.5287,
+      "step": 790
+    },
+    {
+      "epoch": 9.116809116809117,
+      "grad_norm": 0.22394247353076935,
+      "learning_rate": 3.1777059397436692e-06,
+      "loss": 0.5007,
+      "step": 800
+    },
+    {
+      "epoch": 9.23076923076923,
+      "grad_norm": 0.2144930511713028,
+      "learning_rate": 2.3379444289913342e-06,
+      "loss": 0.5277,
+      "step": 810
+    },
+    {
+      "epoch": 9.344729344729345,
+      "grad_norm": 0.2214236557483673,
+      "learning_rate": 1.6255156067997323e-06,
+      "loss": 0.5173,
+      "step": 820
+    },
+    {
+      "epoch": 9.45868945868946,
+      "grad_norm": 0.2192196100950241,
+      "learning_rate": 1.0413483444362771e-06,
+      "loss": 0.5123,
+      "step": 830
+    },
+    {
+      "epoch": 9.572649572649572,
+      "grad_norm": 0.22837017476558685,
+      "learning_rate": 5.862042845640403e-07,
+      "loss": 0.5279,
+      "step": 840
+    },
+    {
+      "epoch": 9.686609686609687,
+      "grad_norm": 0.21172335743904114,
+      "learning_rate": 2.606768482050215e-07,
+      "loss": 0.5263,
+      "step": 850
+    },
+    {
+      "epoch": 9.8005698005698,
+      "grad_norm": 0.23530949652194977,
+      "learning_rate": 6.519046103230508e-08,
+      "loss": 0.5202,
+      "step": 860
+    },
+    {
+      "epoch": 9.914529914529915,
+      "grad_norm": 0.23058444261550903,
+      "learning_rate": 0.0,
+      "loss": 0.5243,
+      "step": 870
+    },
+    {
+      "epoch": 9.914529914529915,
+      "step": 870,
+      "total_flos": 5.67984355540992e+16,
+      "train_loss": 0.5655439464525245,
+      "train_runtime": 2716.1071,
+      "train_samples_per_second": 1.292,
+      "train_steps_per_second": 0.32
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 870,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "total_flos": 5.67984355540992e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1d917449d89c64c2ae6344d97c3bf6801872da1d9cc7f5c2752d692ed16bc08
 size 4984

 version https://git-lfs.github.com/spec/v1
+oid sha256:397e037153240ddceac73ed48aa51e61256700ed0043d6463824f5d88fa3beb8
 size 4984