Upload 11 files

Browse files

Files changed (12) hide show

.gitattributes +1 -0
adapter_config.json +32 -0
adapter_model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +34 -0
tokenizer.json +3 -0
tokenizer.model +3 -0
tokenizer_config.json +71 -0
trainer_state.json +721 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "unsloth/gemma-2b-it",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": "unsloth",
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e587daa92aea01009aad6118424ce73bc512b06241f31b5cc570d8a84a52a021
+size 78480072

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89a20b45145a1ac95637aae5661f2b0b51c1c3fa84faa15c02875bcc6386b871
+size 39594180

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4526d24dfcd6f7f05a227220d3f48a0ec6a8dae66cd141f8b21fa0efdd56cc22
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18070fab0b9756f02d25c0a163c5d05591888b6e42fab37a5382660fea70b3b0
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05e97791a5e007260de1db7e1692e53150e08cea481e2bf25435553380c147ee
+size 17477929

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
+size 4241003

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "<start_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<end_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": "<bos>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "legacy": null,
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,721 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.01714383679067375,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.4420362710952759,
+      "learning_rate": 4e-05,
+      "loss": 3.3293,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": NaN,
+      "learning_rate": 4e-05,
+      "loss": 3.8207,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.3726439476013184,
+      "learning_rate": 8e-05,
+      "loss": 3.4594,
+      "step": 3
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.2832139730453491,
+      "learning_rate": 0.00012,
+      "loss": 3.297,
+      "step": 4
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.2156065702438354,
+      "learning_rate": 0.00016,
+      "loss": 3.0036,
+      "step": 5
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.196576476097107,
+      "learning_rate": 0.0002,
+      "loss": 2.9488,
+      "step": 6
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.0002,
+      "loss": 2.7249,
+      "step": 7
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.1029444932937622,
+      "learning_rate": 0.00019996568291008924,
+      "loss": 2.8809,
+      "step": 8
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.0251176357269287,
+      "learning_rate": 0.00019993136582017846,
+      "loss": 2.728,
+      "step": 9
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.7124559879302979,
+      "learning_rate": 0.0001998970487302677,
+      "loss": 2.6262,
+      "step": 10
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.3183832168579102,
+      "learning_rate": 0.00019986273164035692,
+      "loss": 2.765,
+      "step": 11
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019986273164035692,
+      "loss": 2.682,
+      "step": 12
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.2592570781707764,
+      "learning_rate": 0.00019982841455044615,
+      "loss": 2.7837,
+      "step": 13
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.9486250877380371,
+      "learning_rate": 0.00019979409746053535,
+      "loss": 2.8407,
+      "step": 14
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.8232390284538269,
+      "learning_rate": 0.00019975978037062458,
+      "loss": 2.6573,
+      "step": 15
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.1916477680206299,
+      "learning_rate": 0.0001997254632807138,
+      "loss": 2.8644,
+      "step": 16
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.6067786812782288,
+      "learning_rate": 0.00019969114619080303,
+      "loss": 2.582,
+      "step": 17
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.9011898040771484,
+      "learning_rate": 0.00019965682910089226,
+      "loss": 2.7302,
+      "step": 18
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.8688616156578064,
+      "learning_rate": 0.00019962251201098149,
+      "loss": 2.3353,
+      "step": 19
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.7956060767173767,
+      "learning_rate": 0.0001995881949210707,
+      "loss": 2.5263,
+      "step": 20
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001995881949210707,
+      "loss": 2.4818,
+      "step": 21
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.8461011052131653,
+      "learning_rate": 0.00019955387783115994,
+      "loss": 2.542,
+      "step": 22
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.6973548531532288,
+      "learning_rate": 0.00019951956074124917,
+      "loss": 2.4787,
+      "step": 23
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.6046115756034851,
+      "learning_rate": 0.0001994852436513384,
+      "loss": 2.4436,
+      "step": 24
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.4711577594280243,
+      "learning_rate": 0.0001994509265614276,
+      "loss": 2.3658,
+      "step": 25
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001994509265614276,
+      "loss": 2.4518,
+      "step": 26
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.5012961030006409,
+      "learning_rate": 0.00019941660947151682,
+      "loss": 2.3782,
+      "step": 27
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.6144160628318787,
+      "learning_rate": 0.00019938229238160605,
+      "loss": 2.4487,
+      "step": 28
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019938229238160605,
+      "loss": 2.3935,
+      "step": 29
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6648868322372437,
+      "learning_rate": 0.00019934797529169528,
+      "loss": 2.5339,
+      "step": 30
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5968282222747803,
+      "learning_rate": 0.00019931365820178448,
+      "loss": 2.4238,
+      "step": 31
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019931365820178448,
+      "loss": 2.4432,
+      "step": 32
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019931365820178448,
+      "loss": 2.3153,
+      "step": 33
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9555236101150513,
+      "learning_rate": 0.0001992793411118737,
+      "loss": 2.2495,
+      "step": 34
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.8204368352890015,
+      "learning_rate": 0.00019924502402196293,
+      "loss": 2.401,
+      "step": 35
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019924502402196293,
+      "loss": 2.31,
+      "step": 36
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.8206217885017395,
+      "learning_rate": 0.00019921070693205216,
+      "loss": 2.3511,
+      "step": 37
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019921070693205216,
+      "loss": 2.2848,
+      "step": 38
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5887894630432129,
+      "learning_rate": 0.0001991763898421414,
+      "loss": 2.191,
+      "step": 39
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5945093035697937,
+      "learning_rate": 0.00019914207275223062,
+      "loss": 2.345,
+      "step": 40
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019914207275223062,
+      "loss": 2.4744,
+      "step": 41
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.7511911988258362,
+      "learning_rate": 0.00019910775566231984,
+      "loss": 2.3624,
+      "step": 42
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019910775566231984,
+      "loss": 2.3179,
+      "step": 43
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.8779334425926208,
+      "learning_rate": 0.00019907343857240907,
+      "loss": 2.4634,
+      "step": 44
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9247270226478577,
+      "learning_rate": 0.0001990391214824983,
+      "loss": 2.2699,
+      "step": 45
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.785952627658844,
+      "learning_rate": 0.0001990048043925875,
+      "loss": 2.2791,
+      "step": 46
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.0079773664474487,
+      "learning_rate": 0.00019897048730267673,
+      "loss": 2.4368,
+      "step": 47
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.1102200746536255,
+      "learning_rate": 0.00019893617021276595,
+      "loss": 2.4482,
+      "step": 48
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.1956760883331299,
+      "learning_rate": 0.00019890185312285518,
+      "loss": 2.4036,
+      "step": 49
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.7190561294555664,
+      "learning_rate": 0.0001988675360329444,
+      "loss": 2.2854,
+      "step": 50
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001988675360329444,
+      "loss": 2.404,
+      "step": 51
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.0126006603240967,
+      "learning_rate": 0.00019883321894303364,
+      "loss": 2.1978,
+      "step": 52
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019883321894303364,
+      "loss": 2.3899,
+      "step": 53
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.213659405708313,
+      "learning_rate": 0.00019879890185312286,
+      "loss": 2.0958,
+      "step": 54
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.3466947078704834,
+      "learning_rate": 0.0001987645847632121,
+      "loss": 2.4106,
+      "step": 55
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.2530690431594849,
+      "learning_rate": 0.00019873026767330132,
+      "loss": 2.5628,
+      "step": 56
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.6688309907913208,
+      "learning_rate": 0.00019869595058339055,
+      "loss": 2.233,
+      "step": 57
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.2483224868774414,
+      "learning_rate": 0.00019866163349347977,
+      "loss": 2.2923,
+      "step": 58
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.017726182937622,
+      "learning_rate": 0.000198627316403569,
+      "loss": 2.354,
+      "step": 59
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.000198627316403569,
+      "loss": 2.5311,
+      "step": 60
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.810524582862854,
+      "learning_rate": 0.0001985929993136582,
+      "loss": 2.24,
+      "step": 61
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.439291000366211,
+      "learning_rate": 0.00019855868222374743,
+      "loss": 2.8512,
+      "step": 62
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019855868222374743,
+      "loss": 2.5943,
+      "step": 63
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.2657182216644287,
+      "learning_rate": 0.00019852436513383666,
+      "loss": 2.8789,
+      "step": 64
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.1179282665252686,
+      "learning_rate": 0.00019849004804392589,
+      "loss": 2.6996,
+      "step": 65
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.0068299770355225,
+      "learning_rate": 0.0001984557309540151,
+      "loss": 2.3677,
+      "step": 66
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001984557309540151,
+      "loss": 2.577,
+      "step": 67
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001984557309540151,
+      "loss": 2.7072,
+      "step": 68
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.678086280822754,
+      "learning_rate": 0.00019842141386410434,
+      "loss": 2.658,
+      "step": 69
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.056044101715088,
+      "learning_rate": 0.00019838709677419357,
+      "loss": 2.5174,
+      "step": 70
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.784575462341309,
+      "learning_rate": 0.0001983527796842828,
+      "loss": 2.6965,
+      "step": 71
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.531276226043701,
+      "learning_rate": 0.00019831846259437202,
+      "loss": 2.9831,
+      "step": 72
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.478590965270996,
+      "learning_rate": 0.00019828414550446125,
+      "loss": 3.0535,
+      "step": 73
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019828414550446125,
+      "loss": 2.7639,
+      "step": 74
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.104541778564453,
+      "learning_rate": 0.00019824982841455048,
+      "loss": 3.0957,
+      "step": 75
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.70428991317749,
+      "learning_rate": 0.00019821551132463968,
+      "loss": 3.0107,
+      "step": 76
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.2724623680114746,
+      "learning_rate": 0.0001981811942347289,
+      "loss": 2.9372,
+      "step": 77
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001981811942347289,
+      "loss": 3.4289,
+      "step": 78
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001981811942347289,
+      "loss": 3.4832,
+      "step": 79
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 20.135986328125,
+      "learning_rate": 0.00019814687714481813,
+      "loss": 3.2707,
+      "step": 80
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 22.686079025268555,
+      "learning_rate": 0.00019811256005490736,
+      "loss": 3.8144,
+      "step": 81
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019811256005490736,
+      "loss": 4.5631,
+      "step": 82
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 28.4752197265625,
+      "learning_rate": 0.00019807824296499656,
+      "loss": 4.8363,
+      "step": 83
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 23.301496505737305,
+      "learning_rate": 0.0001980439258750858,
+      "loss": 5.3773,
+      "step": 84
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 19.411945343017578,
+      "learning_rate": 0.00019800960878517502,
+      "loss": 5.1105,
+      "step": 85
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019800960878517502,
+      "loss": 5.8669,
+      "step": 86
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 35.982608795166016,
+      "learning_rate": 0.00019797529169526424,
+      "loss": 5.4417,
+      "step": 87
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019797529169526424,
+      "loss": 5.4101,
+      "step": 88
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 69.57833862304688,
+      "learning_rate": 0.00019794097460535347,
+      "loss": 5.2831,
+      "step": 89
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 76.1761703491211,
+      "learning_rate": 0.0001979066575154427,
+      "loss": 6.0605,
+      "step": 90
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": NaN,
+      "learning_rate": 0.0001979066575154427,
+      "loss": 6.9964,
+      "step": 91
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 104.58732604980469,
+      "learning_rate": 0.00019787234042553193,
+      "loss": 7.346,
+      "step": 92
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 45.56397247314453,
+      "learning_rate": 0.00019783802333562115,
+      "loss": 8.6069,
+      "step": 93
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019783802333562115,
+      "loss": 9.6728,
+      "step": 94
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019783802333562115,
+      "loss": 8.1841,
+      "step": 95
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": NaN,
+      "learning_rate": 0.00019783802333562115,
+      "loss": 9.4986,
+      "step": 96
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 25.033235549926758,
+      "learning_rate": 0.00019780370624571035,
+      "loss": 9.8723,
+      "step": 97
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 74.37015533447266,
+      "learning_rate": 0.00019776938915579958,
+      "loss": 9.7078,
+      "step": 98
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 72.31209564208984,
+      "learning_rate": 0.0001977350720658888,
+      "loss": 12.0863,
+      "step": 99
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 47.87614059448242,
+      "learning_rate": 0.00019770075497597804,
+      "loss": 14.7985,
+      "step": 100
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 5833,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "total_flos": 1.2504616383074304e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1c7fae7a260158c38df930a244e04cfe8b8cc1b1f85587f1974c004c5bb661c
+size 4856