atsuki-yamaguchi committed on
Commit a4db66d
1 Parent(s): 5f0b79e

Upload folder using huggingface_hub

README.md CHANGED
@@ -1,32 +1,21 @@
  ---
- license: mit
- language:
- - ar
  ---
- TigerBot-7B LAPT + CLP+ Arabic
- ===

- ## How to use
- ```python
- from peft import AutoPeftModelForCausalLM
- from transformers import AutoTokenizer

- model = AutoPeftModelForCausalLM.from_pretrained(
- "atsuki-yamaguchi/tigerbot-7b-base-clpp-ar"
- )
- ```

- ## Citation
- ```
- @article{yamaguchi2024empirical,
- title={An Empirical Study on Cross-lingual Vocabulary Adaptation for Efficient Generative {LLM} Inference},
- author={Atsuki Yamaguchi and Aline Villavicencio and Nikolaos Aletras},
- journal={ArXiv},
- year={2024},
- volume={abs/2402.10712},
- url={https://arxiv.org/abs/2402.10712}
- }
- ```

- ## Link
- For more details, please visit https://github.com/gucci-j/llm-cva

  ---
+ library_name: peft
  ---
+ ## Training procedure

+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: True
+ - load_in_4bit: False
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: fp4
+ - bnb_4bit_use_double_quant: False
+ - bnb_4bit_compute_dtype: float32
+ ### Framework versions

+ - PEFT 0.5.0
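The quantization settings listed in the updated README map directly onto `transformers`' `BitsAndBytesConfig`. As a minimal sketch (not the authors' training script; only the repo ID is taken from this repository, the loading pattern is an assumption based on the listed values and the old usage snippet), the same 8-bit setup could be reproduced and the adapter loaded like this:

```python
# Sketch only: reconstructs the bitsandbytes config listed in the README above
# and loads the PEFT adapter on top of the 8-bit base model. The repo ID comes
# from this repository; everything else is an illustrative assumption.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

repo_id = "atsuki-yamaguchi/tigerbot-7b-base-clpp-ar"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,                        # load_in_8bit: True
    llm_int8_threshold=6.0,                   # llm_int8_threshold: 6.0
    llm_int8_skip_modules=None,               # llm_int8_skip_modules: None
    llm_int8_enable_fp32_cpu_offload=False,   # llm_int8_enable_fp32_cpu_offload: False
    llm_int8_has_fp16_weight=False,           # llm_int8_has_fp16_weight: False
)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
base_model = AutoModelForCausalLM.from_pretrained(repo_id, quantization_config=bnb_config)
model = PeftModel.from_pretrained(base_model, repo_id)  # attach the LoRA adapter stored in this repo
```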
 
adapter_config.json CHANGED
@@ -1,29 +1 @@
- {
- "auto_mapping": null,
- "base_model_name_or_path": "atsuki-yamaguchi/tigerbot-7b-base-clpp-ar",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 32,
- "lora_dropout": 0.05,
- "modules_to_save": [
- "lm_head",
- "embed_tokens"
- ],
- "peft_type": "LORA",
- "r": 8,
- "revision": null,
- "target_modules": [
- "q_proj",
- "v_proj",
- "k_proj",
- "o_proj",
- "gate_proj",
- "down_proj",
- "up_proj"
- ],
- "task_type": "CAUSAL_LM"
- }

+ {"auto_mapping": null, "base_model_name_or_path": "atsuki-yamaguchi/tigerbot-7b-base-clpp-ar", "bias": "none", "fan_in_fan_out": false, "inference_mode": true, "init_lora_weights": true, "layers_pattern": null, "layers_to_transform": null, "lora_alpha": 32, "lora_dropout": 0.05, "modules_to_save": ["lm_head", "embed_tokens"], "peft_type": "LORA", "r": 8, "revision": null, "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"], "task_type": "CAUSAL_LM"}
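The collapsed `adapter_config.json` above describes a standard LoRA setup. A hedged sketch of an equivalent `peft.LoraConfig` (field values copied from the JSON; the construction itself is illustrative, not the authors' code):

```python
# Sketch only: an equivalent LoraConfig built from the adapter_config.json fields above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    modules_to_save=["lm_head", "embed_tokens"],  # embeddings and LM head are saved in full
)
```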
config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "/mnt/parscratch/users/acp23ay/private/models/tigerbot-7b-base-ar-clp-plus",
  "architectures": [
  "LlamaForCausalLM"
  ],
@@ -21,7 +21,7 @@
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
- "torch_dtype": "float32",
  "transformers_version": "4.35.0.dev0",
  "use_cache": true,
  "vocab_size": 64000

  {
+ "_name_or_path": "TigerResearch/tigerbot-7b-base",
  "architectures": [
  "LlamaForCausalLM"
  ],

  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
+ "torch_dtype": "float64",
  "transformers_version": "4.35.0.dev0",
  "use_cache": true,
  "vocab_size": 64000
model-00001-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cf172036d5beb5eba82b40f8326418445035ed0048711800269d46df03b33245
- size 4915860320

  version https://git-lfs.github.com/spec/v1
+ oid sha256:55f91776d14d120e90959697b31448ce6b94f6f7c86abbd7602e95e4cd2e2605
+ size 4974546664
model-00002-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c0caf7138553ae36aa80702d1218a23873098a5e73cd11e25a6caf7080071817
- size 4857206856

  version https://git-lfs.github.com/spec/v1
+ oid sha256:997fb2ef0df37cba255743a67d2e124712010dc421564d652de9bca9a13ac77b
+ size 4857206848
model-00003-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9f7006a6f327bc54c4ff7b414274088e35824b98bb7ed225129d766ff4c5af01
  size 4857206896

  version https://git-lfs.github.com/spec/v1
+ oid sha256:63f7f1ad4ce7ef01c8e1f770d4992dab3683b53f480ea7e16e2d2cd131ba9ae4
  size 4857206896
model-00004-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d5d3bd310eaa968845955c0521aa3f34822cb9a8625ed12d502fd322321dc52f
  size 4857206896

  version https://git-lfs.github.com/spec/v1
+ oid sha256:70d5d8b33baba96365bb16b3c8191e177de16c2ac7da20f49b48d859ecc3fb34
  size 4857206896
model-00005-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c17f81881a4fdeead802ed7936c8099f8c53bf191816727c50271b83818af749
  size 4857206896

  version https://git-lfs.github.com/spec/v1
+ oid sha256:50893eaddd16b7542245ac6d814e07bea7357fc37b3bdc70398b9c1f872d935e
  size 4857206896
model-00006-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d30e54a85473acc143d4180d57068b4587f0bab9108f7012253839f097227bf5
- size 3657584232

  version https://git-lfs.github.com/spec/v1
+ oid sha256:78d3753464469fb030b6693ae8d85b173c5439fb8e482e937798cde0b2aa21e7
+ size 4647473904
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
  {
  "metadata": {
- "total_size": 28002238464
  },
  "weight_map": {
  "lm_head.weight": "model-00006-of-00006.safetensors",
@@ -25,13 +25,13 @@
  "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.10.input_layernorm.weight": "model-00003-of-00006.safetensors",
  "model.layers.10.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors",
  "model.layers.11.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
@@ -68,24 +68,24 @@
  "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.16.input_layernorm.weight": "model-00004-of-00006.safetensors",
  "model.layers.16.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors",
  "model.layers.17.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
@@ -131,24 +131,24 @@
  "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00004-of-00006.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.22.input_layernorm.weight": "model-00005-of-00006.safetensors",
  "model.layers.22.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors",
  "model.layers.23.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
@@ -185,24 +185,24 @@
  "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.27.input_layernorm.weight": "model-00005-of-00006.safetensors",
- "model.layers.27.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.27.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.28.input_layernorm.weight": "model-00006-of-00006.safetensors",
  "model.layers.28.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
- "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.28.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
- "model.layers.28.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.28.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.28.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
- "model.layers.28.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.29.input_layernorm.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
@@ -212,11 +212,11 @@
  "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
@@ -241,13 +241,13 @@
  "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.4.input_layernorm.weight": "model-00002-of-00006.safetensors",
  "model.layers.4.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.5.input_layernorm.weight": "model-00002-of-00006.safetensors",
  "model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
@@ -284,11 +284,11 @@
  "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
  "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
 
  {
  "metadata": {
+ "total_size": 29050814464
  },
  "weight_map": {
  "lm_head.weight": "model-00006-of-00006.safetensors",
 
  "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.10.input_layernorm.weight": "model-00003-of-00006.safetensors",
  "model.layers.10.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors",
  "model.layers.11.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
 
  "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00004-of-00006.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
  "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.16.input_layernorm.weight": "model-00004-of-00006.safetensors",
  "model.layers.16.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors",
  "model.layers.17.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
 
  "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00005-of-00006.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
  "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
  "model.layers.22.input_layernorm.weight": "model-00005-of-00006.safetensors",
  "model.layers.22.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors",
  "model.layers.23.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
 
  "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00006-of-00006.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
  "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
  "model.layers.28.input_layernorm.weight": "model-00006-of-00006.safetensors",
  "model.layers.28.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
+ "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
+ "model.layers.28.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
+ "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.input_layernorm.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
 
  "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00006.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
 
  "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
  "model.layers.4.input_layernorm.weight": "model-00002-of-00006.safetensors",
  "model.layers.4.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.5.input_layernorm.weight": "model-00002-of-00006.safetensors",
  "model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
 
  "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00006.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
  "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
  "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
  "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
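`model.safetensors.index.json` is the map `transformers` consults to find each parameter's shard; the hunks above show several weights moving to different shards and `total_size` growing accordingly. A small illustrative sketch (assumes the shards have been downloaded locally; not part of this repo) of resolving one tensor through the index:

```python
# Sketch only: look up which shard holds a given weight and read it directly.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.10.mlp.gate_proj.weight"
shard_file = index["weight_map"][name]  # "model-00003-of-00006.safetensors" after this commit

with safe_open(shard_file, framework="pt") as shard:
    tensor = shard.get_tensor(name)
print(name, tuple(tensor.shape), tensor.dtype)
```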
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d3ca824474c990b0e0eed473e27e44726039e49022c1ce41a044b075a7b16e2
+ size 1091197148
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf529198e9c26a03f9e5d5faa128e2b28c42f44f340935dece0e18c806cef182
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b21283c3db24950204115ac348f58e53c6d0488c1b3816d199a477f9c0931a6
+ size 1064
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a34ccdaf89aac619bed9397b6efe5813df7d048bbaf2f1490e50b6ed605b556b
+ size 4664