Upload 12 files

Browse files

Files changed (10) hide show

README.md +1 -1
adapter_config.json +4 -4
adapter_model.safetensors +1 -1
optimizer.pt +1 -1
scheduler.pt +1 -1
special_tokens_map.json +4 -0
tokenizer.json +2 -2
tokenizer_config.json +21 -0
trainer_state.json +301 -301
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 library_name: peft
-base_model: google/gemma-7b
 ---
 # Model Card for Model ID

 ---
 library_name: peft
+base_model: google/gemma-7b-it
 ---
 # Model Card for Model ID

adapter_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "google/gemma-7b",
   "bias": "none",
   "fan_in_fan_out": false,
   "inference_mode": true,
@@ -21,11 +21,11 @@
   "target_modules": [
     "o_proj",
     "k_proj",
-    "gate_proj",
-    "q_proj",
     "v_proj",
     "up_proj",
-    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_rslora": false

 {
   "alpha_pattern": {},
   "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-7b-it",
   "bias": "none",
   "fan_in_fan_out": false,
   "inference_mode": true,
   "target_modules": [
     "o_proj",
     "k_proj",
+    "down_proj",
     "v_proj",
+    "gate_proj",
     "up_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_rslora": false

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:21f1402c48f90b44fe71d7d2f4a3984a548e64279efda68f4b24e55ada2c8233
 size 100059752

 version https://git-lfs.github.com/spec/v1
+oid sha256:30f4a246c1f4f9856ab88a883fd0e60bc5595caa8199955de3c9993f0ae5171d
 size 100059752

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:502a6742bb8f48b47e6450a524e1d895f6e64d98a1cb546233bd8cc6428c1988
 size 50545780

 version https://git-lfs.github.com/spec/v1
+oid sha256:0e9822545dcd04649a67476c0930a0020fde0d4172429320eedd4b24afa59839
 size 50545780

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3a60c7d771c1fd156acee762fba03c724cb41829a3f71df370ecd1d20b134982
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:312b1a4e327e3843fca725781dfeb890e3325a1f08b77b1a3b7f934fb5467564
 size 1064

special_tokens_map.json CHANGED Viewed

@@ -1,4 +1,8 @@
 {
   "bos_token": {
     "content": "<bos>",
     "lstrip": false,

 {
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
   "bos_token": {
     "content": "<bos>",
     "lstrip": false,

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb7e592b7314175501c7fc56b904d581de569169ac90e5aa2ec11a860c2cbbaa
-size 17477652

 version https://git-lfs.github.com/spec/v1
+oid sha256:ce19157ce6b457736a0015a2a4fb06c966c6ff252ec9d1950777eec6598abf6d
+size 17478028

tokenizer_config.json CHANGED Viewed

@@ -33,9 +33,30 @@
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
   "bos_token": "<bos>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<eos>",
   "legacy": null,

       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "106": {
+      "content": "<start_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<end_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
   "bos_token": "<bos>",
+  "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<eos>",
   "legacy": null,

trainer_state.json CHANGED Viewed

@@ -10,702 +10,702 @@
   "log_history": [
     {
       "epoch": 1.0,
-      "grad_norm": 2.355809211730957,
-      "learning_rate": 0.0001,
-      "loss": 1.3272,
       "step": 1
     },
     {
       "epoch": 2.0,
-      "grad_norm": 2.355809211730957,
-      "learning_rate": 0.0002,
-      "loss": 1.3272,
       "step": 2
     },
     {
       "epoch": 3.0,
-      "grad_norm": 1.4446067810058594,
-      "learning_rate": 0.00019795918367346938,
-      "loss": 1.1654,
       "step": 3
     },
     {
       "epoch": 4.0,
-      "grad_norm": 1.521039366722107,
-      "learning_rate": 0.0001959183673469388,
-      "loss": 0.9031,
       "step": 4
     },
     {
       "epoch": 5.0,
-      "grad_norm": 1.5087562799453735,
-      "learning_rate": 0.00019387755102040816,
-      "loss": 0.6727,
       "step": 5
     },
     {
       "epoch": 6.0,
-      "grad_norm": 1.60162353515625,
-      "learning_rate": 0.00019183673469387756,
-      "loss": 0.4641,
       "step": 6
     },
     {
       "epoch": 7.0,
-      "grad_norm": 1.0103554725646973,
-      "learning_rate": 0.00018979591836734697,
-      "loss": 0.2926,
       "step": 7
     },
     {
       "epoch": 8.0,
-      "grad_norm": 2.812863826751709,
-      "learning_rate": 0.00018775510204081634,
-      "loss": 0.2053,
       "step": 8
     },
     {
       "epoch": 9.0,
-      "grad_norm": 1.5256155729293823,
-      "learning_rate": 0.00018571428571428572,
-      "loss": 0.1473,
       "step": 9
     },
     {
       "epoch": 10.0,
-      "grad_norm": 1.875736951828003,
-      "learning_rate": 0.00018367346938775512,
-      "loss": 0.1351,
       "step": 10
     },
     {
       "epoch": 11.0,
-      "grad_norm": 0.4695473313331604,
-      "learning_rate": 0.0001816326530612245,
-      "loss": 0.1029,
       "step": 11
     },
     {
       "epoch": 12.0,
-      "grad_norm": 0.5592666864395142,
-      "learning_rate": 0.0001795918367346939,
-      "loss": 0.0982,
       "step": 12
     },
     {
       "epoch": 13.0,
-      "grad_norm": 0.3424108922481537,
-      "learning_rate": 0.00017755102040816327,
-      "loss": 0.086,
       "step": 13
     },
     {
       "epoch": 14.0,
-      "grad_norm": 0.5507439374923706,
-      "learning_rate": 0.00017551020408163265,
-      "loss": 0.082,
       "step": 14
     },
     {
       "epoch": 15.0,
-      "grad_norm": 0.40908390283584595,
-      "learning_rate": 0.00017346938775510205,
-      "loss": 0.0727,
       "step": 15
     },
     {
       "epoch": 16.0,
-      "grad_norm": 0.5264757871627808,
-      "learning_rate": 0.00017142857142857143,
-      "loss": 0.0651,
       "step": 16
     },
     {
       "epoch": 17.0,
-      "grad_norm": 0.5899124145507812,
-      "learning_rate": 0.00016938775510204083,
-      "loss": 0.0557,
       "step": 17
     },
     {
       "epoch": 18.0,
-      "grad_norm": 0.5028952956199646,
-      "learning_rate": 0.00016734693877551023,
-      "loss": 0.042,
       "step": 18
     },
     {
       "epoch": 19.0,
-      "grad_norm": 0.6451049447059631,
-      "learning_rate": 0.0001653061224489796,
-      "loss": 0.0315,
       "step": 19
     },
     {
       "epoch": 20.0,
-      "grad_norm": 0.5155397057533264,
-      "learning_rate": 0.00016326530612244898,
-      "loss": 0.017,
       "step": 20
     },
     {
       "epoch": 21.0,
-      "grad_norm": 0.29147273302078247,
-      "learning_rate": 0.00016122448979591838,
-      "loss": 0.009,
       "step": 21
     },
     {
       "epoch": 22.0,
-      "grad_norm": 0.3708033263683319,
-      "learning_rate": 0.00015918367346938776,
-      "loss": 0.0076,
       "step": 22
     },
     {
       "epoch": 23.0,
-      "grad_norm": 0.08893364667892456,
-      "learning_rate": 0.00015714285714285716,
-      "loss": 0.0057,
       "step": 23
     },
     {
       "epoch": 24.0,
-      "grad_norm": 0.1690322905778885,
-      "learning_rate": 0.00015510204081632654,
-      "loss": 0.0062,
       "step": 24
     },
     {
       "epoch": 25.0,
-      "grad_norm": 0.20433245599269867,
-      "learning_rate": 0.0001530612244897959,
-      "loss": 0.0058,
       "step": 25
     },
     {
       "epoch": 26.0,
-      "grad_norm": 0.16720539331436157,
-      "learning_rate": 0.0001510204081632653,
-      "loss": 0.0056,
       "step": 26
     },
     {
       "epoch": 27.0,
-      "grad_norm": 0.17153231799602509,
-      "learning_rate": 0.00014897959183673472,
-      "loss": 0.0056,
       "step": 27
     },
     {
       "epoch": 28.0,
-      "grad_norm": 0.1037655845284462,
-      "learning_rate": 0.0001469387755102041,
-      "loss": 0.0059,
       "step": 28
     },
     {
       "epoch": 29.0,
-      "grad_norm": 0.14174027740955353,
-      "learning_rate": 0.0001448979591836735,
-      "loss": 0.0055,
       "step": 29
     },
     {
       "epoch": 30.0,
-      "grad_norm": 0.011748074553906918,
-      "learning_rate": 0.00014285714285714287,
-      "loss": 0.0053,
       "step": 30
     },
     {
       "epoch": 31.0,
-      "grad_norm": 0.09256725758314133,
-      "learning_rate": 0.00014081632653061224,
-      "loss": 0.0054,
       "step": 31
     },
     {
       "epoch": 32.0,
-      "grad_norm": 0.09271395206451416,
-      "learning_rate": 0.00013877551020408165,
-      "loss": 0.0054,
       "step": 32
     },
     {
       "epoch": 33.0,
-      "grad_norm": 0.10514255613088608,
-      "learning_rate": 0.00013673469387755102,
-      "loss": 0.0049,
       "step": 33
     },
     {
       "epoch": 34.0,
-      "grad_norm": 0.14201119542121887,
-      "learning_rate": 0.0001346938775510204,
-      "loss": 0.0055,
       "step": 34
     },
     {
       "epoch": 35.0,
-      "grad_norm": 0.05419391393661499,
-      "learning_rate": 0.0001326530612244898,
-      "loss": 0.0049,
       "step": 35
     },
     {
       "epoch": 36.0,
-      "grad_norm": 0.17390480637550354,
-      "learning_rate": 0.00013061224489795917,
-      "loss": 0.0056,
       "step": 36
     },
     {
       "epoch": 37.0,
-      "grad_norm": 0.01634000428020954,
-      "learning_rate": 0.00012857142857142858,
-      "loss": 0.0053,
       "step": 37
     },
     {
       "epoch": 38.0,
-      "grad_norm": 0.14033198356628418,
-      "learning_rate": 0.00012653061224489798,
-      "loss": 0.0055,
       "step": 38
     },
     {
       "epoch": 39.0,
-      "grad_norm": 0.06495083123445511,
-      "learning_rate": 0.00012448979591836735,
-      "loss": 0.0053,
       "step": 39
     },
     {
       "epoch": 40.0,
-      "grad_norm": 0.014886317774653435,
-      "learning_rate": 0.00012244897959183676,
-      "loss": 0.0053,
       "step": 40
     },
     {
       "epoch": 41.0,
-      "grad_norm": 0.09001646935939789,
-      "learning_rate": 0.00012040816326530613,
-      "loss": 0.0054,
       "step": 41
     },
     {
       "epoch": 42.0,
-      "grad_norm": 0.05034321919083595,
-      "learning_rate": 0.00011836734693877552,
-      "loss": 0.0048,
       "step": 42
     },
     {
       "epoch": 43.0,
-      "grad_norm": 0.012698125094175339,
-      "learning_rate": 0.0001163265306122449,
-      "loss": 0.0053,
       "step": 43
     },
     {
       "epoch": 44.0,
-      "grad_norm": 0.06450632214546204,
-      "learning_rate": 0.00011428571428571428,
-      "loss": 0.0053,
       "step": 44
     },
     {
       "epoch": 45.0,
-      "grad_norm": 0.06438387930393219,
-      "learning_rate": 0.00011224489795918367,
-      "loss": 0.0053,
       "step": 45
     },
     {
       "epoch": 46.0,
-      "grad_norm": 0.010477419942617416,
-      "learning_rate": 0.00011020408163265306,
-      "loss": 0.0053,
       "step": 46
     },
     {
       "epoch": 47.0,
-      "grad_norm": 0.010764668695628643,
-      "learning_rate": 0.00010816326530612246,
-      "loss": 0.0053,
       "step": 47
     },
     {
       "epoch": 48.0,
-      "grad_norm": 0.010250415652990341,
-      "learning_rate": 0.00010612244897959185,
-      "loss": 0.0053,
       "step": 48
     },
     {
       "epoch": 49.0,
-      "grad_norm": 0.01072185579687357,
-      "learning_rate": 0.00010408163265306123,
-      "loss": 0.0053,
       "step": 49
     },
     {
       "epoch": 50.0,
-      "grad_norm": 0.04789392277598381,
-      "learning_rate": 0.00010204081632653062,
-      "loss": 0.0058,
       "step": 50
     },
     {
       "epoch": 51.0,
-      "grad_norm": 0.046109408140182495,
-      "learning_rate": 0.0001,
-      "loss": 0.0048,
       "step": 51
     },
     {
       "epoch": 52.0,
-      "grad_norm": 0.06478489935398102,
-      "learning_rate": 9.79591836734694e-05,
-      "loss": 0.0053,
       "step": 52
     },
     {
       "epoch": 53.0,
-      "grad_norm": 0.06432071328163147,
-      "learning_rate": 9.591836734693878e-05,
-      "loss": 0.0053,
       "step": 53
     },
     {
       "epoch": 54.0,
-      "grad_norm": 0.02735786698758602,
-      "learning_rate": 9.387755102040817e-05,
-      "loss": 0.0058,
       "step": 54
     },
     {
       "epoch": 55.0,
-      "grad_norm": 0.04583168402314186,
-      "learning_rate": 9.183673469387756e-05,
-      "loss": 0.0048,
       "step": 55
     },
     {
       "epoch": 56.0,
-      "grad_norm": 0.08271433413028717,
-      "learning_rate": 8.979591836734695e-05,
-      "loss": 0.0053,
       "step": 56
     },
     {
       "epoch": 57.0,
-      "grad_norm": 0.010216895490884781,
-      "learning_rate": 8.775510204081632e-05,
-      "loss": 0.0053,
       "step": 57
     },
     {
       "epoch": 58.0,
-      "grad_norm": 0.06349101662635803,
-      "learning_rate": 8.571428571428571e-05,
-      "loss": 0.0053,
       "step": 58
     },
     {
       "epoch": 59.0,
-      "grad_norm": 0.06379684805870056,
-      "learning_rate": 8.367346938775511e-05,
-      "loss": 0.0053,
       "step": 59
     },
     {
       "epoch": 60.0,
-      "grad_norm": 0.010050535202026367,
-      "learning_rate": 8.163265306122449e-05,
-      "loss": 0.0053,
       "step": 60
     },
     {
       "epoch": 61.0,
-      "grad_norm": 0.08275067806243896,
-      "learning_rate": 7.959183673469388e-05,
-      "loss": 0.0053,
       "step": 61
     },
     {
       "epoch": 62.0,
-      "grad_norm": 0.010069911368191242,
-      "learning_rate": 7.755102040816327e-05,
-      "loss": 0.0053,
       "step": 62
     },
     {
       "epoch": 63.0,
-      "grad_norm": 0.00969759002327919,
-      "learning_rate": 7.551020408163266e-05,
-      "loss": 0.0053,
       "step": 63
     },
     {
       "epoch": 64.0,
-      "grad_norm": 0.06372305005788803,
-      "learning_rate": 7.346938775510205e-05,
-      "loss": 0.0053,
       "step": 64
     },
     {
       "epoch": 65.0,
-      "grad_norm": 0.027780011296272278,
-      "learning_rate": 7.142857142857143e-05,
-      "loss": 0.0048,
       "step": 65
     },
     {
       "epoch": 66.0,
-      "grad_norm": 0.009718519635498524,
-      "learning_rate": 6.938775510204082e-05,
-      "loss": 0.0053,
       "step": 66
     },
     {
       "epoch": 67.0,
-      "grad_norm": 0.04677029326558113,
-      "learning_rate": 6.73469387755102e-05,
-      "loss": 0.0058,
       "step": 67
     },
     {
       "epoch": 68.0,
-      "grad_norm": 0.00952499732375145,
-      "learning_rate": 6.530612244897959e-05,
-      "loss": 0.0053,
       "step": 68
     },
     {
       "epoch": 69.0,
-      "grad_norm": 0.009604902006685734,
-      "learning_rate": 6.326530612244899e-05,
-      "loss": 0.0053,
       "step": 69
     },
     {
       "epoch": 70.0,
-      "grad_norm": 0.06420488655567169,
-      "learning_rate": 6.122448979591838e-05,
-      "loss": 0.0053,
       "step": 70
     },
     {
       "epoch": 71.0,
-      "grad_norm": 0.026840027421712875,
-      "learning_rate": 5.918367346938776e-05,
-      "loss": 0.0058,
       "step": 71
     },
     {
       "epoch": 72.0,
-      "grad_norm": 0.009567616507411003,
-      "learning_rate": 5.714285714285714e-05,
-      "loss": 0.0053,
       "step": 72
     },
     {
       "epoch": 73.0,
-      "grad_norm": 0.045753173530101776,
-      "learning_rate": 5.510204081632653e-05,
-      "loss": 0.0048,
       "step": 73
     },
     {
       "epoch": 74.0,
-      "grad_norm": 0.009476774372160435,
-      "learning_rate": 5.3061224489795926e-05,
-      "loss": 0.0053,
       "step": 74
     },
     {
       "epoch": 75.0,
-      "grad_norm": 0.04571187496185303,
-      "learning_rate": 5.102040816326531e-05,
-      "loss": 0.0048,
       "step": 75
     },
     {
       "epoch": 76.0,
-      "grad_norm": 0.009730060584843159,
-      "learning_rate": 4.89795918367347e-05,
-      "loss": 0.0053,
       "step": 76
     },
     {
       "epoch": 77.0,
-      "grad_norm": 0.028138577938079834,
-      "learning_rate": 4.6938775510204086e-05,
-      "loss": 0.0048,
       "step": 77
     },
     {
       "epoch": 78.0,
-      "grad_norm": 0.06447193771600723,
-      "learning_rate": 4.4897959183673474e-05,
-      "loss": 0.0053,
       "step": 78
     },
     {
       "epoch": 79.0,
-      "grad_norm": 0.06447941064834595,
-      "learning_rate": 4.2857142857142856e-05,
-      "loss": 0.0053,
       "step": 79
     },
     {
       "epoch": 80.0,
-      "grad_norm": 0.028326738625764847,
-      "learning_rate": 4.0816326530612245e-05,
-      "loss": 0.0048,
       "step": 80
     },
     {
       "epoch": 81.0,
-      "grad_norm": 0.08396949619054794,
-      "learning_rate": 3.8775510204081634e-05,
-      "loss": 0.0053,
       "step": 81
     },
     {
       "epoch": 82.0,
-      "grad_norm": 0.04610452800989151,
-      "learning_rate": 3.673469387755102e-05,
-      "loss": 0.0048,
       "step": 82
     },
     {
       "epoch": 83.0,
-      "grad_norm": 0.046097833663225174,
-      "learning_rate": 3.469387755102041e-05,
-      "loss": 0.0048,
       "step": 83
     },
     {
       "epoch": 84.0,
-      "grad_norm": 0.009920083917677402,
-      "learning_rate": 3.265306122448979e-05,
-      "loss": 0.0053,
       "step": 84
     },
     {
       "epoch": 85.0,
-      "grad_norm": 0.02849722094833851,
-      "learning_rate": 3.061224489795919e-05,
-      "loss": 0.0048,
       "step": 85
     },
     {
       "epoch": 86.0,
-      "grad_norm": 0.00956336036324501,
-      "learning_rate": 2.857142857142857e-05,
-      "loss": 0.0053,
       "step": 86
     },
     {
       "epoch": 87.0,
-      "grad_norm": 0.06530480086803436,
-      "learning_rate": 2.6530612244897963e-05,
-      "loss": 0.0053,
       "step": 87
     },
     {
       "epoch": 88.0,
-      "grad_norm": 0.028555840253829956,
-      "learning_rate": 2.448979591836735e-05,
-      "loss": 0.0048,
       "step": 88
     },
     {
       "epoch": 89.0,
-      "grad_norm": 0.009486482478678226,
-      "learning_rate": 2.2448979591836737e-05,
-      "loss": 0.0053,
       "step": 89
     },
     {
       "epoch": 90.0,
-      "grad_norm": 0.028627220541238785,
-      "learning_rate": 2.0408163265306123e-05,
-      "loss": 0.0048,
       "step": 90
     },
     {
       "epoch": 91.0,
-      "grad_norm": 0.009754066355526447,
-      "learning_rate": 1.836734693877551e-05,
-      "loss": 0.0053,
       "step": 91
     },
     {
       "epoch": 92.0,
-      "grad_norm": 0.009760790504515171,
-      "learning_rate": 1.6326530612244897e-05,
-      "loss": 0.0053,
       "step": 92
     },
     {
       "epoch": 93.0,
-      "grad_norm": 0.02760305069386959,
-      "learning_rate": 1.4285714285714285e-05,
-      "loss": 0.0058,
       "step": 93
     },
     {
       "epoch": 94.0,
-      "grad_norm": 0.009988558478653431,
-      "learning_rate": 1.2244897959183674e-05,
-      "loss": 0.0053,
       "step": 94
     },
     {
       "epoch": 95.0,
-      "grad_norm": 0.04651060700416565,
-      "learning_rate": 1.0204081632653061e-05,
-      "loss": 0.0048,
       "step": 95
     },
     {
       "epoch": 96.0,
-      "grad_norm": 0.010227691382169724,
-      "learning_rate": 8.163265306122448e-06,
-      "loss": 0.0053,
       "step": 96
     },
     {
       "epoch": 97.0,
-      "grad_norm": 0.010372389107942581,
-      "learning_rate": 6.122448979591837e-06,
-      "loss": 0.0053,
       "step": 97
     },
     {
       "epoch": 98.0,
-      "grad_norm": 0.009753258898854256,
-      "learning_rate": 4.081632653061224e-06,
-      "loss": 0.0053,
       "step": 98
     },
     {
       "epoch": 99.0,
-      "grad_norm": 0.010054420679807663,
-      "learning_rate": 2.040816326530612e-06,
-      "loss": 0.0053,
       "step": 99
     },
     {
       "epoch": 100.0,
-      "grad_norm": 0.01012208592146635,
-      "learning_rate": 0.0,
-      "loss": 0.0053,
       "step": 100
     }
   ],
@@ -714,7 +714,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 100,
   "save_steps": 500,
-  "total_flos": 625210570752000.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "log_history": [
     {
       "epoch": 1.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.0,
+      "loss": 7.3765,
       "step": 1
     },
     {
       "epoch": 2.0,
+      "grad_norm": 2.7495672702789307,
+      "learning_rate": 0.0001,
+      "loss": 7.3765,
       "step": 2
     },
     {
       "epoch": 3.0,
+      "grad_norm": 2.7495672702789307,
+      "learning_rate": 0.0002,
+      "loss": 7.3765,
       "step": 3
     },
     {
       "epoch": 4.0,
+      "grad_norm": 3.6275627613067627,
+      "learning_rate": 0.00019795918367346938,
+      "loss": 6.994,
       "step": 4
     },
     {
       "epoch": 5.0,
+      "grad_norm": 7.796189308166504,
+      "learning_rate": 0.0001959183673469388,
+      "loss": 6.341,
       "step": 5
     },
     {
       "epoch": 6.0,
+      "grad_norm": 11.919865608215332,
+      "learning_rate": 0.00019387755102040816,
+      "loss": 5.805,
       "step": 6
     },
     {
       "epoch": 7.0,
+      "grad_norm": Infinity,
+      "learning_rate": 0.00019387755102040816,
+      "loss": 5.2771,
       "step": 7
     },
     {
       "epoch": 8.0,
+      "grad_norm": 15.628558158874512,
+      "learning_rate": 0.00019183673469387756,
+      "loss": 5.2771,
       "step": 8
     },
     {
       "epoch": 9.0,
+      "grad_norm": 18.900388717651367,
+      "learning_rate": 0.00018979591836734697,
+      "loss": 4.7626,
       "step": 9
     },
     {
       "epoch": 10.0,
+      "grad_norm": 21.62285614013672,
+      "learning_rate": 0.00018775510204081634,
+      "loss": 4.2169,
       "step": 10
     },
     {
       "epoch": 11.0,
+      "grad_norm": 23.690582275390625,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 3.623,
       "step": 11
     },
     {
       "epoch": 12.0,
+      "grad_norm": 25.02626609802246,
+      "learning_rate": 0.00018367346938775512,
+      "loss": 2.9824,
       "step": 12
     },
     {
       "epoch": 13.0,
+      "grad_norm": 25.598007202148438,
+      "learning_rate": 0.0001816326530612245,
+      "loss": 2.3122,
       "step": 13
     },
     {
       "epoch": 14.0,
+      "grad_norm": 25.378807067871094,
+      "learning_rate": 0.0001795918367346939,
+      "loss": 1.6226,
       "step": 14
     },
     {
       "epoch": 15.0,
+      "grad_norm": 24.527645111083984,
+      "learning_rate": 0.00017755102040816327,
+      "loss": 0.9334,
       "step": 15
     },
     {
       "epoch": 16.0,
+      "grad_norm": 23.03998565673828,
+      "learning_rate": 0.00017551020408163265,
+      "loss": 0.2465,
       "step": 16
     },
     {
       "epoch": 17.0,
+      "grad_norm": 4.810272216796875,
+      "learning_rate": 0.00017346938775510205,
+      "loss": 0.2217,
       "step": 17
     },
     {
       "epoch": 18.0,
+      "grad_norm": 5.40369987487793,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 0.2093,
       "step": 18
     },
     {
       "epoch": 19.0,
+      "grad_norm": 5.298532962799072,
+      "learning_rate": 0.00016938775510204083,
+      "loss": 0.1797,
       "step": 19
     },
     {
       "epoch": 20.0,
+      "grad_norm": 4.629075050354004,
+      "learning_rate": 0.00016734693877551023,
+      "loss": 0.1339,
       "step": 20
     },
     {
       "epoch": 21.0,
+      "grad_norm": 1.2457849979400635,
+      "learning_rate": 0.0001653061224489796,
+      "loss": 0.092,
       "step": 21
     },
     {
       "epoch": 22.0,
+      "grad_norm": 0.8375206589698792,
+      "learning_rate": 0.00016326530612244898,
+      "loss": 0.0933,
       "step": 22
     },
     {
       "epoch": 23.0,
+      "grad_norm": 0.7440481185913086,
+      "learning_rate": 0.00016122448979591838,
+      "loss": 0.081,
       "step": 23
     },
     {
       "epoch": 24.0,
+      "grad_norm": 0.728550910949707,
+      "learning_rate": 0.00015918367346938776,
+      "loss": 0.0715,
       "step": 24
     },
     {
       "epoch": 25.0,
+      "grad_norm": 0.729324460029602,
+      "learning_rate": 0.00015714285714285716,
+      "loss": 0.0583,
       "step": 25
     },
     {
       "epoch": 26.0,
+      "grad_norm": 0.7445201873779297,
+      "learning_rate": 0.00015510204081632654,
+      "loss": 0.044,
       "step": 26
     },
     {
       "epoch": 27.0,
+      "grad_norm": 0.64507657289505,
+      "learning_rate": 0.0001530612244897959,
+      "loss": 0.0256,
       "step": 27
     },
     {
       "epoch": 28.0,
+      "grad_norm": 0.3869144916534424,
+      "learning_rate": 0.0001510204081632653,
+      "loss": 0.0138,
       "step": 28
     },
     {
       "epoch": 29.0,
+      "grad_norm": 0.17224831879138947,
+      "learning_rate": 0.00014897959183673472,
+      "loss": 0.0087,
       "step": 29
     },
     {
       "epoch": 30.0,
+      "grad_norm": 0.0585104376077652,
+      "learning_rate": 0.0001469387755102041,
+      "loss": 0.0072,
       "step": 30
     },
     {
       "epoch": 31.0,
+      "grad_norm": 0.18696996569633484,
+      "learning_rate": 0.0001448979591836735,
+      "loss": 0.0081,
       "step": 31
     },
     {
       "epoch": 32.0,
+      "grad_norm": 0.10075689852237701,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 0.0072,
       "step": 32
     },
     {
       "epoch": 33.0,
+      "grad_norm": 0.04343040660023689,
+      "learning_rate": 0.00014081632653061224,
+      "loss": 0.0069,
       "step": 33
     },
     {
       "epoch": 34.0,
+      "grad_norm": 0.13335004448890686,
+      "learning_rate": 0.00013877551020408165,
+      "loss": 0.0074,
       "step": 34
     },
     {
       "epoch": 35.0,
+      "grad_norm": 0.0894094929099083,
+      "learning_rate": 0.00013673469387755102,
+      "loss": 0.007,
       "step": 35
     },
     {
       "epoch": 36.0,
+      "grad_norm": 0.01999577507376671,
+      "learning_rate": 0.0001346938775510204,
+      "loss": 0.0067,
       "step": 36
     },
     {
       "epoch": 37.0,
+      "grad_norm": 0.1184980571269989,
+      "learning_rate": 0.0001326530612244898,
+      "loss": 0.0072,
       "step": 37
     },
     {
       "epoch": 38.0,
+      "grad_norm": 0.09607323259115219,
+      "learning_rate": 0.00013061224489795917,
+      "loss": 0.007,
       "step": 38
     },
     {
       "epoch": 39.0,
+      "grad_norm": 0.027331219986081123,
+      "learning_rate": 0.00012857142857142858,
+      "loss": 0.0067,
       "step": 39
     },
     {
       "epoch": 40.0,
+      "grad_norm": 0.08817232400178909,
+      "learning_rate": 0.00012653061224489798,
+      "loss": 0.0069,
       "step": 40
     },
     {
       "epoch": 41.0,
+      "grad_norm": 0.08792853355407715,
+      "learning_rate": 0.00012448979591836735,
+      "loss": 0.0069,
       "step": 41
     },
     {
       "epoch": 42.0,
+      "grad_norm": 0.04289069399237633,
+      "learning_rate": 0.00012244897959183676,
+      "loss": 0.0067,
       "step": 42
     },
     {
       "epoch": 43.0,
+      "grad_norm": 0.04996877163648605,
+      "learning_rate": 0.00012040816326530613,
+      "loss": 0.0067,
       "step": 43
     },
     {
       "epoch": 44.0,
+      "grad_norm": 0.07244863361120224,
+      "learning_rate": 0.00011836734693877552,
+      "loss": 0.0068,
       "step": 44
     },
     {
       "epoch": 45.0,
+      "grad_norm": 0.07215742021799088,
+      "learning_rate": 0.0001163265306122449,
+      "loss": 0.0068,
       "step": 45
     },
     {
       "epoch": 46.0,
+      "grad_norm": 0.01955232582986355,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 0.0067,
       "step": 46
     },
     {
       "epoch": 47.0,
+      "grad_norm": 0.06493868678808212,
+      "learning_rate": 0.00011224489795918367,
+      "loss": 0.0068,
       "step": 47
     },
     {
       "epoch": 48.0,
+      "grad_norm": 0.06490014493465424,
+      "learning_rate": 0.00011020408163265306,
+      "loss": 0.0068,
       "step": 48
     },
     {
       "epoch": 49.0,
+      "grad_norm": 0.019649550318717957,
+      "learning_rate": 0.00010816326530612246,
+      "loss": 0.0067,
       "step": 49
     },
     {
       "epoch": 50.0,
+      "grad_norm": 0.04920223355293274,
+      "learning_rate": 0.00010612244897959185,
+      "loss": 0.0067,
       "step": 50
     },
     {
       "epoch": 51.0,
+      "grad_norm": 0.07163064181804657,
+      "learning_rate": 0.00010408163265306123,
+      "loss": 0.0068,
       "step": 51
     },
     {
       "epoch": 52.0,
+      "grad_norm": 0.005953885614871979,
+      "learning_rate": 0.00010204081632653062,
+      "loss": 0.0066,
       "step": 52
     },
     {
       "epoch": 53.0,
+      "grad_norm": 0.01944654807448387,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
       "step": 53
     },
     {
       "epoch": 54.0,
+      "grad_norm": 0.0421106182038784,
+      "learning_rate": 9.79591836734694e-05,
+      "loss": 0.0067,
       "step": 54
     },
     {
       "epoch": 55.0,
+      "grad_norm": 0.019489118829369545,
+      "learning_rate": 9.591836734693878e-05,
+      "loss": 0.0066,
       "step": 55
     },
     {
       "epoch": 56.0,
+      "grad_norm": 0.004421094432473183,
+      "learning_rate": 9.387755102040817e-05,
+      "loss": 0.0066,
       "step": 56
     },
     {
       "epoch": 57.0,
+      "grad_norm": 0.026416227221488953,
+      "learning_rate": 9.183673469387756e-05,
+      "loss": 0.0067,
       "step": 57
     },
     {
       "epoch": 58.0,
+      "grad_norm": 0.003954235929995775,
+      "learning_rate": 8.979591836734695e-05,
+      "loss": 0.0066,
       "step": 58
     },
     {
       "epoch": 59.0,
+      "grad_norm": 0.003926219418644905,
+      "learning_rate": 8.775510204081632e-05,
+      "loss": 0.0066,
       "step": 59
     },
     {
       "epoch": 60.0,
+      "grad_norm": 0.0038123615086078644,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 0.0066,
       "step": 60
     },
     {
       "epoch": 61.0,
+      "grad_norm": 0.003582009579986334,
+      "learning_rate": 8.367346938775511e-05,
+      "loss": 0.0066,
       "step": 61
     },
     {
       "epoch": 62.0,
+      "grad_norm": 0.0035740730818361044,
+      "learning_rate": 8.163265306122449e-05,
+      "loss": 0.0066,
       "step": 62
     },
     {
       "epoch": 63.0,
+      "grad_norm": 0.01964273676276207,
+      "learning_rate": 7.959183673469388e-05,
+      "loss": 0.0066,
       "step": 63
     },
     {
       "epoch": 64.0,
+      "grad_norm": 0.01971287839114666,
+      "learning_rate": 7.755102040816327e-05,
+      "loss": 0.0066,
       "step": 64
     },
     {
       "epoch": 65.0,
+      "grad_norm": 0.0035709121730178595,
+      "learning_rate": 7.551020408163266e-05,
+      "loss": 0.0066,
       "step": 65
     },
     {
       "epoch": 66.0,
+      "grad_norm": 0.003548271721228957,
+      "learning_rate": 7.346938775510205e-05,
+      "loss": 0.0066,
       "step": 66
     },
     {
       "epoch": 67.0,
+      "grad_norm": 0.02695435844361782,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.0066,
       "step": 67
     },
     {
       "epoch": 68.0,
+      "grad_norm": 0.026985742151737213,
+      "learning_rate": 6.938775510204082e-05,
+      "loss": 0.0066,
       "step": 68
     },
     {
       "epoch": 69.0,
+      "grad_norm": 0.00358410133048892,
+      "learning_rate": 6.73469387755102e-05,
+      "loss": 0.0066,
       "step": 69
     },
     {
       "epoch": 70.0,
+      "grad_norm": 0.04342804476618767,
+      "learning_rate": 6.530612244897959e-05,
+      "loss": 0.0067,
       "step": 70
     },
     {
       "epoch": 71.0,
+      "grad_norm": 0.020023003220558167,
+      "learning_rate": 6.326530612244899e-05,
+      "loss": 0.0066,
       "step": 71
     },
     {
       "epoch": 72.0,
+      "grad_norm": 0.020061027258634567,
+      "learning_rate": 6.122448979591838e-05,
+      "loss": 0.0066,
       "step": 72
     },
     {
       "epoch": 73.0,
+      "grad_norm": 0.003791953669860959,
+      "learning_rate": 5.918367346938776e-05,
+      "loss": 0.0066,
       "step": 73
     },
     {
       "epoch": 74.0,
+      "grad_norm": 0.050881966948509216,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 0.0067,
       "step": 74
     },
     {
       "epoch": 75.0,
+      "grad_norm": 0.027295473963022232,
+      "learning_rate": 5.510204081632653e-05,
+      "loss": 0.0066,
       "step": 75
     },
     {
       "epoch": 76.0,
+      "grad_norm": 0.0037258469965308905,
+      "learning_rate": 5.3061224489795926e-05,
+      "loss": 0.0066,
       "step": 76
     },
     {
       "epoch": 77.0,
+      "grad_norm": 0.020169131457805634,
+      "learning_rate": 5.102040816326531e-05,
+      "loss": 0.0066,
       "step": 77
     },
     {
       "epoch": 78.0,
+      "grad_norm": 0.04392065480351448,
+      "learning_rate": 4.89795918367347e-05,
+      "loss": 0.0067,
       "step": 78
     },
     {
       "epoch": 79.0,
+      "grad_norm": 0.02023773454129696,
+      "learning_rate": 4.6938775510204086e-05,
+      "loss": 0.0066,
       "step": 79
     },
     {
       "epoch": 80.0,
+      "grad_norm": 0.003931655548512936,
+      "learning_rate": 4.4897959183673474e-05,
+      "loss": 0.0066,
       "step": 80
     },
     {
       "epoch": 81.0,
+      "grad_norm": 0.027433717623353004,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 0.0066,
       "step": 81
     },
     {
       "epoch": 82.0,
+      "grad_norm": 0.027440495789051056,
+      "learning_rate": 4.0816326530612245e-05,
+      "loss": 0.0066,
       "step": 82
     },
     {
       "epoch": 83.0,
+      "grad_norm": 0.003971911035478115,
+      "learning_rate": 3.8775510204081634e-05,
+      "loss": 0.0066,
       "step": 83
     },
     {
       "epoch": 84.0,
+      "grad_norm": 0.0040692477487027645,
+      "learning_rate": 3.673469387755102e-05,
+      "loss": 0.0066,
       "step": 84
     },
     {
       "epoch": 85.0,
+      "grad_norm": 0.02032075822353363,
+      "learning_rate": 3.469387755102041e-05,
+      "loss": 0.0066,
       "step": 85
     },
     {
       "epoch": 86.0,
+      "grad_norm": 0.004029002971947193,
+      "learning_rate": 3.265306122448979e-05,
+      "loss": 0.0066,
       "step": 86
     },
     {
       "epoch": 87.0,
+      "grad_norm": 0.02034132555127144,
+      "learning_rate": 3.061224489795919e-05,
+      "loss": 0.0066,
       "step": 87
     },
     {
       "epoch": 88.0,
+      "grad_norm": 0.003994234371930361,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 0.0066,
       "step": 88
     },
     {
       "epoch": 89.0,
+      "grad_norm": 0.004034143406897783,
+      "learning_rate": 2.6530612244897963e-05,
+      "loss": 0.0066,
       "step": 89
     },
     {
       "epoch": 90.0,
+      "grad_norm": 0.004001120571047068,
+      "learning_rate": 2.448979591836735e-05,
+      "loss": 0.0066,
       "step": 90
     },
     {
       "epoch": 91.0,
+      "grad_norm": 0.020308438688516617,
+      "learning_rate": 2.2448979591836737e-05,
+      "loss": 0.0066,
       "step": 91
     },
     {
       "epoch": 92.0,
+      "grad_norm": 0.004174523055553436,
+      "learning_rate": 2.0408163265306123e-05,
+      "loss": 0.0066,
       "step": 92
     },
     {
       "epoch": 93.0,
+      "grad_norm": 0.004282441921532154,
+      "learning_rate": 1.836734693877551e-05,
+      "loss": 0.0066,
       "step": 93
     },
     {
       "epoch": 94.0,
+      "grad_norm": 0.004000538494437933,
+      "learning_rate": 1.6326530612244897e-05,
+      "loss": 0.0066,
       "step": 94
     },
     {
       "epoch": 95.0,
+      "grad_norm": 0.003992615267634392,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 0.0066,
       "step": 95
     },
     {
       "epoch": 96.0,
+      "grad_norm": 0.0273627657443285,
+      "learning_rate": 1.2244897959183674e-05,
+      "loss": 0.0066,
       "step": 96
     },
     {
       "epoch": 97.0,
+      "grad_norm": 0.027324387803673744,
+      "learning_rate": 1.0204081632653061e-05,
+      "loss": 0.0066,
       "step": 97
     },
     {
       "epoch": 98.0,
+      "grad_norm": 0.027326995506882668,
+      "learning_rate": 8.163265306122448e-06,
+      "loss": 0.0066,
       "step": 98
     },
     {
       "epoch": 99.0,
+      "grad_norm": 0.020300107076764107,
+      "learning_rate": 6.122448979591837e-06,
+      "loss": 0.0066,
       "step": 99
     },
     {
       "epoch": 100.0,
+      "grad_norm": 0.003976646810770035,
+      "learning_rate": 4.081632653061224e-06,
+      "loss": 0.0066,
       "step": 100
     }
   ],
   "num_input_tokens_seen": 0,
   "num_train_epochs": 100,
   "save_steps": 500,
+  "total_flos": 499235306496000.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc2b910d926edadac64591186b55a95c20e3f121f861011f15403c335b9a1351
 size 4984

 version https://git-lfs.github.com/spec/v1
+oid sha256:c913740270ce755c394445d072d8363aa026f0debcf3f5a3a5f27a1eea73cb07
 size 4984