0xtaipoian
committed
Commit • 05012b5
Parent(s): 9994dd4
Upload folder using huggingface_hub
- README.md +114 -0
- all_results.json +12 -0
- config.json +30 -0
- config_old.json +30 -0
- eval_results.json +7 -0
- generation_config.json +8 -0
- model-00001-of-00005.safetensors +3 -0
- model-00002-of-00005.safetensors +3 -0
- model-00003-of-00005.safetensors +3 -0
- model-00004-of-00005.safetensors +3 -0
- model-00005-of-00005.safetensors +3 -0
- model.safetensors.index.json +298 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +91 -0
- train_results.json +8 -0
- trainer_log.jsonl +47 -0
- trainer_state.json +366 -0
- training_args.bin +3 -0
- training_eval_loss.png +0 -0
- training_loss.png +0 -0
README.md
ADDED
@@ -0,0 +1,114 @@
---
license: apache-2.0
base_model: hon9kon9ize/CantoneseLLMChat-v0.5
tags:
- llama-factory
- full
- generated_from_trainer
metrics:
- accuracy
model-index:
- name: open-lilm-v2
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# open-lilm-v2

[Version 1](https://huggingface.co/0xtaipoian/open-lilm) of this model can be found here.

Warning: Due to the nature of the training data, this model is highly likely to return violent, racist and discriminatory content. DO NOT USE IN A PRODUCTION ENVIRONMENT.

Inspired by [another project](https://github.com/alphrc/lilm).
This is a fine-tuned model based on [CantoneseLLMChat-v0.5](https://huggingface.co/hon9kon9ize/CantoneseLLMChat-v0.5), which anyone can use without needing a Mac with 128 GB of RAM.

Following the same principle, we filtered 1,916,944 post-and-reply pairs from the LIHKG forum, drawn from the [LIHKG Dataset](https://huggingface.co/datasets/AlienKevin/LIHKG) and scraped from the site for the latest posts, using the criteria below (a sketch of the filter follows the list):
- The reply must be a direct reply to the original post, made by a user other than the author
- The total number of reactions (positive or negative) must be larger than 20
- The post-and-reply pair must be shorter than 2,048 words
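
As a rough illustration of these criteria, the filter can be written as a predicate over (post, reply) records. This is a minimal sketch with assumed field names (`user_id`, `quote_id`, `like_count`, `dislike_count`, `text`); the actual schema of the private dataset is not published:

```python
# Hypothetical sketch of the filtering criteria above.
# All field names are assumptions; the real dataset schema is not public.
def keep_pair(post: dict, reply: dict) -> bool:
    is_direct_reply = reply.get("quote_id") is None            # replies directly to the post
    not_self_reply = reply["user_id"] != post["user_id"]       # by a user other than the author
    reactions = reply["like_count"] + reply["dislike_count"]   # positive or negative reactions
    short_enough = len(post["text"]) + len(reply["text"]) < 2048
    return is_direct_reply and not_self_reply and reactions > 20 and short_enough

# Toy usage with made-up records:
post = {"user_id": 1, "text": "今日天氣點?"}
reply = {"user_id": 2, "quote_id": None, "like_count": 25, "dislike_count": 3, "text": "好熱"}
print(keep_pair(post, reply))  # True
```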

To avoid political complications, the dataset will not be made publicly available.

Compared to version 1:
- Training samples increased from 377,595 to 1,916,944, including the latest posts
- Removed all URLs
- Removed comments with only emojis

## Intended uses & limitations

Due to the nature of an online and anonymous forum, the training data and the model are full of rude, violent, racist and discriminatory language.
This model is only intended for research or entertainment purposes.

The comments on LIHKG also tend to be very short, so the model cannot generate anything longer than a single line.

## How to use it?
You can run it on [Colab](https://colab.research.google.com/drive/1veRH2GP3ZR3buYCG2_bFUKu0kS-hv1S2) or anywhere else based on this code:
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "0xtaipoian/open-lilm-v2"

# Load the model in 4-bit NF4 so it fits on a single consumer GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization_config=bnb_config,
)

def chat(messages, temperature=0.9, max_new_tokens=200):
    # Render the conversation with the model's chat template and generate.
    input_ids = tokenizer.apply_chat_template(
        conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda:0")
    output_ids = model.generate(
        input_ids, max_new_tokens=max_new_tokens, temperature=temperature, do_sample=True
    )

    # Print the rendered prompt for inspection.
    chatml = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    print(chatml)

    # Decode only the newly generated tokens.
    response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=False)
    return response

messages = [
    # {"role": "system", "content": ""},
    {"role": "user",
     "content":
"""
密陽44人輪姦案」受害女隔20年現身:時間停在2004,不記得
"""}]

result = chat(messages, max_new_tokens=200, temperature=1)

print(result)
```

### Training Procedures

The model was trained for 11 hours on 8 NVIDIA H100 80GB HBM3 GPUs with [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).

The following hyperparameters were used during training (the effective batch size is worked out in the sketch below):
- learning_rate: 1e-05
- train_batch_size: 22
- seed: 42
- gradient_accumulation_steps: 22
- total_train_batch_size: 3872
- num_epochs: 1.0
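
The total_train_batch_size above follows directly from the per-device batch size, the gradient accumulation steps, and the 8 GPUs; a quick sanity check:

```python
# Effective batch size = per-device batch * gradient accumulation steps * number of GPUs.
train_batch_size = 22
gradient_accumulation_steps = 22
num_gpus = 8  # 8x H100, as stated above

total_train_batch_size = train_batch_size * gradient_accumulation_steps * num_gpus
print(total_train_batch_size)  # 3872, matching the reported value
```

At 3,872 samples per optimizer step, the 445 steps in the trainer logs below cover roughly 1.72M samples, which suggests that about a tenth of the 1,916,944 pairs were held out for evaluation.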
all_results.json
ADDED
@@ -0,0 +1,12 @@
{
    "epoch": 0.9986738753442823,
    "eval_loss": 2.5821733474731445,
    "eval_runtime": 1288.7775,
    "eval_samples_per_second": 148.742,
    "eval_steps_per_second": 0.846,
    "total_flos": 7.860958022007259e+18,
    "train_loss": 2.698132219207421,
    "train_runtime": 39941.2631,
    "train_samples_per_second": 43.195,
    "train_steps_per_second": 0.011
}
config.json
ADDED
@@ -0,0 +1,30 @@
{
    "_name_or_path": "/home/pj24001684/ku40000295/jc/models/CantonesellmChat-v0.5-sft",
    "architectures": [
        "LlamaForCausalLM"
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 6,
    "eos_token_id": 7,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "mlp_bias": false,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 4,
    "pad_token_id": 0,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "rope_theta": 5000000.0,
    "tie_word_embeddings": false,
    "torch_dtype": "float32",
    "transformers_version": "4.42.3",
    "use_cache": false,
    "vocab_size": 64960
}
config_old.json
ADDED
@@ -0,0 +1,30 @@
{
    "_name_or_path": "hon9kon9ize/CantoneseLLM-v0.5",
    "architectures": [
        "LlamaForCausalLM"
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "mlp_bias": false,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 4,
    "pad_token_id": 0,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "rope_theta": 5000000.0,
    "tie_word_embeddings": false,
    "torch_dtype": "float32",
    "transformers_version": "4.43.1",
    "use_cache": false,
    "vocab_size": 64960
}
eval_results.json
ADDED
@@ -0,0 +1,7 @@
{
    "epoch": 0.9986738753442823,
    "eval_loss": 2.5821733474731445,
    "eval_runtime": 1288.7775,
    "eval_samples_per_second": 148.742,
    "eval_steps_per_second": 0.846
}
generation_config.json
ADDED
@@ -0,0 +1,8 @@
{
    "_from_model_config": true,
    "bos_token_id": 6,
    "bot_token_id": 6,
    "eos_token_id": 7,
    "pad_token_id": 0,
    "transformers_version": "4.43.0.dev0"
}
model-00001-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94197e548f61ab4a0216aab0db24191e90a92b4ea3aff17f15ef17a393767fda
size 4856125168
model-00002-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ab0bd1f944a8954b41a71a1a2315897c8b6414250e358b34d350ad12bbdc3072
size 4844657744
model-00003-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6306110fffda4dbee4e72bfdfd10fdb9942d8b882d9c3c570d647fb95fffeb2
size 4844657784
model-00004-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f08472e79f98ebe54409fdde957269d65c2a65b1cdd63e956778dd066f6f83d
size 4844657784
model-00005-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:470c154000f1aee36369bac18a6dc366e85298abd30440cf0731e940e07bf5f5
size 4885534456
model.safetensors.index.json
ADDED
@@ -0,0 +1,298 @@
{
  "metadata": {
    "total_size": 24275599360
  },
  "weight_map": {
    "lm_head.weight": "model-00005-of-00005.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.24.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.26.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.input_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.28.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.input_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.29.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.30.input_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.30.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.input_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.31.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.norm.weight": "model-00005-of-00005.safetensors"
  }
}
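
The index above maps every tensor name to the shard that stores it. As an illustrative sketch (transformers resolves this automatically when loading), the file can be read directly to locate a tensor; it also shows that one layer's tensors can straddle a shard boundary, as layer 12 does here:

```python
import json

# Locate the shard holding a given tensor using the index file above.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["weight_map"]["model.layers.12.mlp.gate_proj.weight"])
# model-00002-of-00005.safetensors

# Layer 12 is split across two shards:
layer12_shards = {v for k, v in index["weight_map"].items() if ".layers.12." in k}
print(sorted(layer12_shards))
# ['model-00002-of-00005.safetensors', 'model-00003-of-00005.safetensors']
```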
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|Human|>",
    "<|Assistant|>",
    "<|System|>"
  ],
  "bos_token": {"content": "<|startoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
  "eos_token": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
  "pad_token": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
  "unk_token": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false}
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:67909bb0045622af428982dee9b3f1033cf5c4bca5c9423a028d3748364ee14f
size 1044277
tokenizer_config.json
ADDED
@@ -0,0 +1,91 @@
{
  "add_bos_token": false,
  "add_eos_token": false,
  "add_prefix_space": null,
  "added_tokens_decoder": {
    "0": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "1": {"content": "<|startoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "2": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "3": {"content": "<|Human|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "4": {"content": "<|Assistant|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "5": {"content": "<|System|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "6": {"content": "<|im_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "7": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|Human|>",
    "<|Assistant|>",
    "<|System|>"
  ],
  "bos_token": "<|startoftext|>",
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|><|System|>\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|><|Human|>\n' + content + '<|im_end|>\n<|im_start|><|Assistant|>\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "legacy": true,
  "model_max_length": 4096,
  "pad_token": "<unk>",
  "padding_side": "right",
  "sp_model_kwargs": {},
  "split_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
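
The chat_template above wraps each turn in `<|im_start|>`/`<|im_end|>` markers with `<|Human|>`, `<|Assistant|>`, and `<|System|>` role tokens. A small sketch of how to inspect the rendered prompt (this is the same template the README's chat() helper applies):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("0xtaipoian/open-lilm-v2")

messages = [{"role": "user", "content": "你好"}]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
# Expected output, following the template above:
# <|im_start|><|Human|>
# 你好<|im_end|>
# <|im_start|><|Assistant|>
```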
train_results.json
ADDED
@@ -0,0 +1,8 @@
{
    "epoch": 0.9986738753442823,
    "total_flos": 7.860958022007259e+18,
    "train_loss": 2.698132219207421,
    "train_runtime": 39941.2631,
    "train_samples_per_second": 43.195,
    "train_steps_per_second": 0.011
}
trainer_log.jsonl
ADDED
@@ -0,0 +1,47 @@
{"current_steps": 10, "total_steps": 445, "loss": 3.8476, "learning_rate": 2.222222222222222e-06, "epoch": 0.02244210955829848, "percentage": 2.25, "elapsed_time": "0:13:44", "remaining_time": "9:57:47", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 20, "total_steps": 445, "loss": 3.2218, "learning_rate": 4.444444444444444e-06, "epoch": 0.04488421911659696, "percentage": 4.49, "elapsed_time": "0:27:04", "remaining_time": "9:35:13", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 30, "total_steps": 445, "loss": 2.9756, "learning_rate": 6.666666666666667e-06, "epoch": 0.06732632867489544, "percentage": 6.74, "elapsed_time": "0:40:32", "remaining_time": "9:20:53", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 40, "total_steps": 445, "loss": 2.88, "learning_rate": 8.888888888888888e-06, "epoch": 0.08976843823319391, "percentage": 8.99, "elapsed_time": "0:54:05", "remaining_time": "9:07:43", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 50, "total_steps": 445, "loss": 2.8198, "learning_rate": 9.996145181203616e-06, "epoch": 0.11221054779149241, "percentage": 11.24, "elapsed_time": "1:08:29", "remaining_time": "9:01:03", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 60, "total_steps": 445, "loss": 2.8122, "learning_rate": 9.965342284774633e-06, "epoch": 0.13465265734979087, "percentage": 13.48, "elapsed_time": "1:22:54", "remaining_time": "8:51:57", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 70, "total_steps": 445, "loss": 2.7806, "learning_rate": 9.903926402016153e-06, "epoch": 0.15709476690808935, "percentage": 15.73, "elapsed_time": "1:36:58", "remaining_time": "8:39:31", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 80, "total_steps": 445, "loss": 2.7422, "learning_rate": 9.812276182268236e-06, "epoch": 0.17953687646638783, "percentage": 17.98, "elapsed_time": "1:50:49", "remaining_time": "8:25:40", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 90, "total_steps": 445, "loss": 2.736, "learning_rate": 9.690956679612422e-06, "epoch": 0.2019789860246863, "percentage": 20.22, "elapsed_time": "2:04:30", "remaining_time": "8:11:08", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 100, "total_steps": 445, "loss": 2.7361, "learning_rate": 9.540715869125407e-06, "epoch": 0.22442109558298481, "percentage": 22.47, "elapsed_time": "2:18:35", "remaining_time": "7:58:07", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 110, "total_steps": 445, "loss": 2.7135, "learning_rate": 9.362480035363987e-06, "epoch": 0.2468632051412833, "percentage": 24.72, "elapsed_time": "2:32:40", "remaining_time": "7:44:59", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 120, "total_steps": 445, "loss": 2.7064, "learning_rate": 9.157348061512728e-06, "epoch": 0.26930531469958174, "percentage": 26.97, "elapsed_time": "2:46:38", "remaining_time": "7:31:19", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 130, "total_steps": 445, "loss": 2.7018, "learning_rate": 8.926584654403725e-06, "epoch": 0.29174742425788025, "percentage": 29.21, "elapsed_time": "3:00:35", "remaining_time": "7:17:36", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 140, "total_steps": 445, "loss": 2.6961, "learning_rate": 8.671612547178428e-06, "epoch": 0.3141895338161787, "percentage": 31.46, "elapsed_time": "3:14:28", "remaining_time": "7:03:40", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 150, "total_steps": 445, "loss": 2.6968, "learning_rate": 8.39400372766471e-06, "epoch": 0.3366316433744772, "percentage": 33.71, "elapsed_time": "3:28:14", "remaining_time": "6:49:33", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 160, "total_steps": 445, "loss": 2.6879, "learning_rate": 8.095469746549172e-06, "epoch": 0.35907375293277566, "percentage": 35.96, "elapsed_time": "3:42:09", "remaining_time": "6:35:43", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 170, "total_steps": 445, "loss": 2.6686, "learning_rate": 7.777851165098012e-06, "epoch": 0.38151586249107416, "percentage": 38.2, "elapsed_time": "3:56:01", "remaining_time": "6:21:48", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 180, "total_steps": 445, "loss": 2.6497, "learning_rate": 7.443106207484776e-06, "epoch": 0.4039579720493726, "percentage": 40.45, "elapsed_time": "4:09:47", "remaining_time": "6:07:45", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 190, "total_steps": 445, "loss": 2.6413, "learning_rate": 7.093298687687141e-06, "epoch": 0.4264000816076711, "percentage": 42.7, "elapsed_time": "4:23:56", "remaining_time": "5:54:14", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 200, "total_steps": 445, "loss": 2.6472, "learning_rate": 6.730585285387465e-06, "epoch": 0.44884219116596963, "percentage": 44.94, "elapsed_time": "4:37:58", "remaining_time": "5:40:30", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 200, "total_steps": 445, "eval_loss": 2.642993450164795, "epoch": 0.44884219116596963, "percentage": 44.94, "elapsed_time": "4:59:33", "remaining_time": "6:06:57", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 210, "total_steps": 445, "loss": 2.6369, "learning_rate": 6.3572022493253715e-06, "epoch": 0.4712843007242681, "percentage": 47.19, "elapsed_time": "5:13:34", "remaining_time": "5:50:54", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 220, "total_steps": 445, "loss": 2.63, "learning_rate": 5.975451610080643e-06, "epoch": 0.4937264102825666, "percentage": 49.44, "elapsed_time": "5:27:23", "remaining_time": "5:34:49", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 230, "total_steps": 445, "loss": 2.6209, "learning_rate": 5.587686987289189e-06, "epoch": 0.5161685198408651, "percentage": 51.69, "elapsed_time": "5:41:21", "remaining_time": "5:19:06", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 240, "total_steps": 445, "loss": 2.6318, "learning_rate": 5.1962990787953436e-06, "epoch": 0.5386106293991635, "percentage": 53.93, "elapsed_time": "5:55:03", "remaining_time": "5:03:16", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 250, "total_steps": 445, "loss": 2.621, "learning_rate": 4.803700921204659e-06, "epoch": 0.561052738957462, "percentage": 56.18, "elapsed_time": "6:08:55", "remaining_time": "4:47:45", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 260, "total_steps": 445, "loss": 2.6161, "learning_rate": 4.4123130127108125e-06, "epoch": 0.5834948485157605, "percentage": 58.43, "elapsed_time": "6:22:39", "remaining_time": "4:32:16", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 270, "total_steps": 445, "loss": 2.6164, "learning_rate": 4.02454838991936e-06, "epoch": 0.605936958074059, "percentage": 60.67, "elapsed_time": "6:36:39", "remaining_time": "4:17:05", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 280, "total_steps": 445, "loss": 2.6091, "learning_rate": 3.6427977506746293e-06, "epoch": 0.6283790676323574, "percentage": 62.92, "elapsed_time": "6:50:40", "remaining_time": "4:02:00", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 290, "total_steps": 445, "loss": 2.5967, "learning_rate": 3.269414714612534e-06, "epoch": 0.6508211771906559, "percentage": 65.17, "elapsed_time": "7:04:34", "remaining_time": "3:46:55", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 300, "total_steps": 445, "loss": 2.6031, "learning_rate": 2.906701312312861e-06, "epoch": 0.6732632867489544, "percentage": 67.42, "elapsed_time": "7:18:40", "remaining_time": "3:32:01", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 310, "total_steps": 445, "loss": 2.6014, "learning_rate": 2.5568937925152272e-06, "epoch": 0.6957053963072529, "percentage": 69.66, "elapsed_time": "7:32:32", "remaining_time": "3:17:04", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 320, "total_steps": 445, "loss": 2.5978, "learning_rate": 2.2221488349019903e-06, "epoch": 0.7181475058655513, "percentage": 71.91, "elapsed_time": "7:46:30", "remaining_time": "3:02:13", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 330, "total_steps": 445, "loss": 2.5911, "learning_rate": 1.9045302534508298e-06, "epoch": 0.7405896154238498, "percentage": 74.16, "elapsed_time": "8:00:23", "remaining_time": "2:47:24", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 340, "total_steps": 445, "loss": 2.5913, "learning_rate": 1.6059962723352912e-06, "epoch": 0.7630317249821483, "percentage": 76.4, "elapsed_time": "8:14:27", "remaining_time": "2:32:42", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 350, "total_steps": 445, "loss": 2.5819, "learning_rate": 1.3283874528215735e-06, "epoch": 0.7854738345404468, "percentage": 78.65, "elapsed_time": "8:28:44", "remaining_time": "2:18:05", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 360, "total_steps": 445, "loss": 2.5833, "learning_rate": 1.0734153455962765e-06, "epoch": 0.8079159440987452, "percentage": 80.9, "elapsed_time": "8:42:44", "remaining_time": "2:03:25", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 370, "total_steps": 445, "loss": 2.5851, "learning_rate": 8.426519384872733e-07, "epoch": 0.8303580536570437, "percentage": 83.15, "elapsed_time": "8:56:50", "remaining_time": "1:48:49", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 380, "total_steps": 445, "loss": 2.5853, "learning_rate": 6.375199646360142e-07, "epoch": 0.8528001632153422, "percentage": 85.39, "elapsed_time": "9:10:53", "remaining_time": "1:34:13", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 390, "total_steps": 445, "loss": 2.5832, "learning_rate": 4.5928413087459325e-07, "epoch": 0.8752422727736408, "percentage": 87.64, "elapsed_time": "9:24:43", "remaining_time": "1:19:38", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 400, "total_steps": 445, "loss": 2.5779, "learning_rate": 3.0904332038757977e-07, "epoch": 0.8976843823319393, "percentage": 89.89, "elapsed_time": "9:38:32", "remaining_time": "1:05:05", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 400, "total_steps": 445, "eval_loss": 2.582942485809326, "epoch": 0.8976843823319393, "percentage": 89.89, "elapsed_time": "9:59:46", "remaining_time": "1:07:28", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 410, "total_steps": 445, "loss": 2.5833, "learning_rate": 1.8772381773176417e-07, "epoch": 0.9201264918902377, "percentage": 92.13, "elapsed_time": "10:13:48", "remaining_time": "0:52:23", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 420, "total_steps": 445, "loss": 2.597, "learning_rate": 9.607359798384785e-08, "epoch": 0.9425686014485362, "percentage": 94.38, "elapsed_time": "10:28:05", "remaining_time": "0:37:23", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 430, "total_steps": 445, "loss": 2.5725, "learning_rate": 3.465771522536854e-08, "epoch": 0.9650107110068347, "percentage": 96.63, "elapsed_time": "10:42:08", "remaining_time": "0:22:24", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 440, "total_steps": 445, "loss": 2.575, "learning_rate": 3.854818796385495e-09, "epoch": 0.9874528205651332, "percentage": 98.88, "elapsed_time": "10:55:58", "remaining_time": "0:07:27", "throughput": "0.00", "total_tokens": 0}
{"current_steps": 445, "total_steps": 445, "epoch": 0.9986738753442823, "percentage": 100.0, "elapsed_time": "11:05:37", "remaining_time": "0:00:00", "throughput": "0.00", "total_tokens": 0}
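
Each log record carries current_steps and, for training entries, loss, which is what the bundled training_loss.png and training_eval_loss.png plots visualize. A minimal sketch to reproduce such a plot from the log, assuming matplotlib is installed:

```python
import json
import matplotlib.pyplot as plt

steps, losses = [], []
with open("trainer_log.jsonl") as f:
    for line in f:
        record = json.loads(line)
        if "loss" in record:  # skip eval-only and final summary entries
            steps.append(record["current_steps"])
            losses.append(record["loss"])

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss_reproduced.png")
```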
trainer_state.json
ADDED
@@ -0,0 +1,366 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9986738753442823,
  "eval_steps": 200,
  "global_step": 445,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.02244210955829848, "grad_norm": 19.15482521057129, "learning_rate": 2.222222222222222e-06, "loss": 3.8476, "step": 10},
    {"epoch": 0.04488421911659696, "grad_norm": 5.446422576904297, "learning_rate": 4.444444444444444e-06, "loss": 3.2218, "step": 20},
    {"epoch": 0.06732632867489544, "grad_norm": 1.8523049354553223, "learning_rate": 6.666666666666667e-06, "loss": 2.9756, "step": 30},
    {"epoch": 0.08976843823319391, "grad_norm": 2.139192581176758, "learning_rate": 8.888888888888888e-06, "loss": 2.88, "step": 40},
    {"epoch": 0.11221054779149241, "grad_norm": 1.853474497795105, "learning_rate": 9.996145181203616e-06, "loss": 2.8198, "step": 50},
    {"epoch": 0.13465265734979087, "grad_norm": 1.501637578010559, "learning_rate": 9.965342284774633e-06, "loss": 2.8122, "step": 60},
    {"epoch": 0.15709476690808935, "grad_norm": 2.0072269439697266, "learning_rate": 9.903926402016153e-06, "loss": 2.7806, "step": 70},
    {"epoch": 0.17953687646638783, "grad_norm": 1.7332258224487305, "learning_rate": 9.812276182268236e-06, "loss": 2.7422, "step": 80},
    {"epoch": 0.2019789860246863, "grad_norm": 1.3256088495254517, "learning_rate": 9.690956679612422e-06, "loss": 2.736, "step": 90},
    {"epoch": 0.22442109558298481, "grad_norm": 1.6238477230072021, "learning_rate": 9.540715869125407e-06, "loss": 2.7361, "step": 100},
    {"epoch": 0.2468632051412833, "grad_norm": 1.326378583908081, "learning_rate": 9.362480035363987e-06, "loss": 2.7135, "step": 110},
    {"epoch": 0.26930531469958174, "grad_norm": 1.3376497030258179, "learning_rate": 9.157348061512728e-06, "loss": 2.7064, "step": 120},
    {"epoch": 0.29174742425788025, "grad_norm": 1.2815560102462769, "learning_rate": 8.926584654403725e-06, "loss": 2.7018, "step": 130},
    {"epoch": 0.3141895338161787, "grad_norm": 1.5868873596191406, "learning_rate": 8.671612547178428e-06, "loss": 2.6961, "step": 140},
    {"epoch": 0.3366316433744772, "grad_norm": 1.366570234298706, "learning_rate": 8.39400372766471e-06, "loss": 2.6968, "step": 150},
    {"epoch": 0.35907375293277566, "grad_norm": 1.6603009700775146, "learning_rate": 8.095469746549172e-06, "loss": 2.6879, "step": 160},
    {"epoch": 0.38151586249107416, "grad_norm": 1.4688373804092407, "learning_rate": 7.777851165098012e-06, "loss": 2.6686, "step": 170},
    {"epoch": 0.4039579720493726, "grad_norm": 1.2386434078216553, "learning_rate": 7.443106207484776e-06, "loss": 2.6497, "step": 180},
    {"epoch": 0.4264000816076711, "grad_norm": 1.3002716302871704, "learning_rate": 7.093298687687141e-06, "loss": 2.6413, "step": 190},
    {"epoch": 0.44884219116596963, "grad_norm": 1.2603603601455688, "learning_rate": 6.730585285387465e-06, "loss": 2.6472, "step": 200},
    {"epoch": 0.44884219116596963,
"epoch": 0.44884219116596963,
|
153 |
+
"eval_loss": 2.642993450164795,
|
154 |
+
"eval_runtime": 1294.6794,
|
155 |
+
"eval_samples_per_second": 148.064,
|
156 |
+
"eval_steps_per_second": 0.842,
|
157 |
+
"step": 200
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"epoch": 0.4712843007242681,
|
161 |
+
"grad_norm": 1.2680917978286743,
|
162 |
+
"learning_rate": 6.3572022493253715e-06,
|
163 |
+
"loss": 2.6369,
|
164 |
+
"step": 210
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"epoch": 0.4937264102825666,
|
168 |
+
"grad_norm": 1.3160443305969238,
|
169 |
+
"learning_rate": 5.975451610080643e-06,
|
170 |
+
"loss": 2.63,
|
171 |
+
"step": 220
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"epoch": 0.5161685198408651,
|
175 |
+
"grad_norm": 1.2467771768569946,
|
176 |
+
"learning_rate": 5.587686987289189e-06,
|
177 |
+
"loss": 2.6209,
|
178 |
+
"step": 230
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"epoch": 0.5386106293991635,
|
182 |
+
"grad_norm": 1.208018183708191,
|
183 |
+
"learning_rate": 5.1962990787953436e-06,
|
184 |
+
"loss": 2.6318,
|
185 |
+
"step": 240
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"epoch": 0.561052738957462,
|
189 |
+
"grad_norm": 1.2416397333145142,
|
190 |
+
"learning_rate": 4.803700921204659e-06,
|
191 |
+
"loss": 2.621,
|
192 |
+
"step": 250
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"epoch": 0.5834948485157605,
|
196 |
+
"grad_norm": 1.1826361417770386,
|
197 |
+
"learning_rate": 4.4123130127108125e-06,
|
198 |
+
"loss": 2.6161,
|
199 |
+
"step": 260
|
200 |
+
},
|
201 |
+
{
|
202 |
+
"epoch": 0.605936958074059,
|
203 |
+
"grad_norm": 1.2550407648086548,
|
204 |
+
"learning_rate": 4.02454838991936e-06,
|
205 |
+
"loss": 2.6164,
|
206 |
+
"step": 270
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"epoch": 0.6283790676323574,
|
210 |
+
"grad_norm": 1.2681384086608887,
|
211 |
+
"learning_rate": 3.6427977506746293e-06,
|
212 |
+
"loss": 2.6091,
|
213 |
+
"step": 280
|
214 |
+
},
|
215 |
+
{
|
216 |
+
"epoch": 0.6508211771906559,
|
217 |
+
"grad_norm": 1.2637056112289429,
|
218 |
+
"learning_rate": 3.269414714612534e-06,
|
219 |
+
"loss": 2.5967,
|
220 |
+
"step": 290
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"epoch": 0.6732632867489544,
|
224 |
+
"grad_norm": 1.211774468421936,
|
225 |
+
"learning_rate": 2.906701312312861e-06,
|
226 |
+
"loss": 2.6031,
|
227 |
+
"step": 300
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"epoch": 0.6957053963072529,
|
231 |
+
"grad_norm": 1.1411036252975464,
|
232 |
+
"learning_rate": 2.5568937925152272e-06,
|
233 |
+
"loss": 2.6014,
|
234 |
+
"step": 310
|
235 |
+
},
|
236 |
+
{
|
237 |
+
"epoch": 0.7181475058655513,
|
238 |
+
"grad_norm": 1.1422080993652344,
|
239 |
+
"learning_rate": 2.2221488349019903e-06,
|
240 |
+
"loss": 2.5978,
|
241 |
+
"step": 320
|
242 |
+
},
|
243 |
+
{
|
244 |
+
"epoch": 0.7405896154238498,
|
245 |
+
"grad_norm": 1.172059416770935,
|
246 |
+
"learning_rate": 1.9045302534508298e-06,
|
247 |
+
"loss": 2.5911,
|
248 |
+
"step": 330
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"epoch": 0.7630317249821483,
|
252 |
+
"grad_norm": 1.1655080318450928,
|
253 |
+
"learning_rate": 1.6059962723352912e-06,
|
254 |
+
"loss": 2.5913,
|
255 |
+
"step": 340
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"epoch": 0.7854738345404468,
|
259 |
+
"grad_norm": 1.1286932229995728,
|
260 |
+
"learning_rate": 1.3283874528215735e-06,
|
261 |
+
"loss": 2.5819,
|
262 |
+
"step": 350
|
263 |
+
},
|
264 |
+
{
|
265 |
+
"epoch": 0.8079159440987452,
|
266 |
+
"grad_norm": 1.1322216987609863,
|
267 |
+
"learning_rate": 1.0734153455962765e-06,
|
268 |
+
"loss": 2.5833,
|
269 |
+
"step": 360
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"epoch": 0.8303580536570437,
|
273 |
+
"grad_norm": 1.1392606496810913,
|
274 |
+
"learning_rate": 8.426519384872733e-07,
|
275 |
+
"loss": 2.5851,
|
276 |
+
"step": 370
|
277 |
+
},
|
278 |
+
{
|
279 |
+
"epoch": 0.8528001632153422,
|
280 |
+
"grad_norm": 1.1811796426773071,
|
281 |
+
"learning_rate": 6.375199646360142e-07,
|
282 |
+
"loss": 2.5853,
|
283 |
+
"step": 380
|
284 |
+
},
|
285 |
+
{
|
286 |
+
"epoch": 0.8752422727736408,
|
287 |
+
"grad_norm": 1.1267277002334595,
|
288 |
+
"learning_rate": 4.5928413087459325e-07,
|
289 |
+
"loss": 2.5832,
|
290 |
+
"step": 390
|
291 |
+
},
|
292 |
+
{
|
293 |
+
"epoch": 0.8976843823319393,
|
294 |
+
"grad_norm": 1.116821527481079,
|
295 |
+
"learning_rate": 3.0904332038757977e-07,
|
296 |
+
"loss": 2.5779,
|
297 |
+
"step": 400
|
298 |
+
},
|
299 |
+
{
|
300 |
+
"epoch": 0.8976843823319393,
|
301 |
+
"eval_loss": 2.582942485809326,
|
302 |
+
"eval_runtime": 1274.0632,
|
303 |
+
"eval_samples_per_second": 150.46,
|
304 |
+
"eval_steps_per_second": 0.856,
|
305 |
+
"step": 400
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"epoch": 0.9201264918902377,
|
309 |
+
"grad_norm": 1.1507278680801392,
|
310 |
+
"learning_rate": 1.8772381773176417e-07,
|
311 |
+
"loss": 2.5833,
|
312 |
+
"step": 410
|
313 |
+
},
|
314 |
+
{
|
315 |
+
"epoch": 0.9425686014485362,
|
316 |
+
"grad_norm": 1.0935174226760864,
|
317 |
+
"learning_rate": 9.607359798384785e-08,
|
318 |
+
"loss": 2.597,
|
319 |
+
"step": 420
|
320 |
+
},
|
321 |
+
{
|
322 |
+
"epoch": 0.9650107110068347,
|
323 |
+
"grad_norm": 1.1115341186523438,
|
324 |
+
"learning_rate": 3.465771522536854e-08,
|
325 |
+
"loss": 2.5725,
|
326 |
+
"step": 430
|
327 |
+
},
|
328 |
+
{
|
329 |
+
"epoch": 0.9874528205651332,
|
330 |
+
"grad_norm": 1.131402611732483,
|
331 |
+
"learning_rate": 3.854818796385495e-09,
|
332 |
+
"loss": 2.575,
|
333 |
+
"step": 440
|
334 |
+
},
|
335 |
+
{
|
336 |
+
"epoch": 0.9986738753442823,
|
337 |
+
"step": 445,
|
338 |
+
"total_flos": 7.860958022007259e+18,
|
339 |
+
"train_loss": 2.698132219207421,
|
340 |
+
"train_runtime": 39941.2631,
|
341 |
+
"train_samples_per_second": 43.195,
|
342 |
+
"train_steps_per_second": 0.011
|
343 |
+
}
|
344 |
+
],
|
345 |
+
"logging_steps": 10,
|
346 |
+
"max_steps": 445,
|
347 |
+
"num_input_tokens_seen": 0,
|
348 |
+
"num_train_epochs": 1,
|
349 |
+
"save_steps": 500,
|
350 |
+
"stateful_callbacks": {
|
351 |
+
"TrainerControl": {
|
352 |
+
"args": {
|
353 |
+
"should_epoch_stop": false,
|
354 |
+
"should_evaluate": false,
|
355 |
+
"should_log": false,
|
356 |
+
"should_save": true,
|
357 |
+
"should_training_stop": true
|
358 |
+
},
|
359 |
+
"attributes": {}
|
360 |
+
}
|
361 |
+
},
|
362 |
+
"total_flos": 7.860958022007259e+18,
|
363 |
+
"train_batch_size": 22,
|
364 |
+
"trial_name": null,
|
365 |
+
"trial_params": null
|
366 |
+
}
|
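Since trainer_state.json embeds the full log_history, the headline numbers can be checked offline. For a causal-LM cross-entropy loss, exp(eval_loss) is the perplexity, so the final eval loss of 2.5829 corresponds to a perplexity of roughly 13.2. A minimal sketch, assuming the file sits in the working directory:

```python
import json
import math

with open("trainer_state.json") as f:
    state = json.load(f)

# Eval records are the log_history entries that carry "eval_loss".
for r in state["log_history"]:
    if "eval_loss" in r:
        # exp(cross-entropy loss) = perplexity for a causal LM
        print(f"step {r['step']}: eval_loss={r['eval_loss']:.4f}, "
              f"ppl={math.exp(r['eval_loss']):.2f}")
```

The learning_rate column is also internally consistent with a cosine schedule after a linear warm-up of about 45 steps to 1e-05: the warm-up values are multiples of 1e-05 * step/45, and the logged 9.996145e-06 at step 50 matches 1e-05 * 0.5 * (1 + cos(pi * 5/400)).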
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:938afb7225b757e389466dbf73b64ea7c28ce1d3896a99335672249afcb6f74e
size 5432
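The three lines above are a Git LFS pointer, not the payload: training_args.bin itself is the pickled transformers TrainingArguments for the run, fetched with `git lfs pull`. A minimal sketch for inspecting it, assuming a recent PyTorch (the `weights_only` keyword) and the real 5.4 kB file in place of the pointer:

```python
import torch

# training_args.bin is a pickled transformers.TrainingArguments, so full
# unpickling must be allowed; only do this for checkpoints you trust.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.lr_scheduler_type)
```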
training_eval_loss.png
ADDED
training_loss.png
ADDED