Upload folder using huggingface_hub
Browse files- README.md +59 -0
- adapter_config.json +27 -0
- adapter_model.safetensors +3 -0
- all_results.json +7 -0
- checkpoint-100/README.md +204 -0
- checkpoint-100/adapter_config.json +27 -0
- checkpoint-100/adapter_model.safetensors +3 -0
- checkpoint-100/optimizer.pt +3 -0
- checkpoint-100/rng_state.pth +3 -0
- checkpoint-100/scheduler.pt +3 -0
- checkpoint-100/special_tokens_map.json +30 -0
- checkpoint-100/tokenization_yi.py +255 -0
- checkpoint-100/tokenizer.model +3 -0
- checkpoint-100/tokenizer_config.json +46 -0
- checkpoint-100/trainer_state.json +81 -0
- checkpoint-100/training_args.bin +3 -0
- checkpoint-200/README.md +204 -0
- checkpoint-200/adapter_config.json +27 -0
- checkpoint-200/adapter_model.safetensors +3 -0
- checkpoint-200/optimizer.pt +3 -0
- checkpoint-200/rng_state.pth +3 -0
- checkpoint-200/scheduler.pt +3 -0
- checkpoint-200/special_tokens_map.json +30 -0
- checkpoint-200/tokenization_yi.py +255 -0
- checkpoint-200/tokenizer.model +3 -0
- checkpoint-200/tokenizer_config.json +46 -0
- checkpoint-200/trainer_state.json +141 -0
- checkpoint-200/training_args.bin +3 -0
- special_tokens_map.json +30 -0
- tokenization_yi.py +255 -0
- tokenizer.model +3 -0
- tokenizer_config.json +46 -0
- train_results.json +7 -0
- trainer_log.jsonl +21 -0
- trainer_state.json +150 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: other
|
3 |
+
library_name: peft
|
4 |
+
tags:
|
5 |
+
- llama-factory
|
6 |
+
- lora
|
7 |
+
- generated_from_trainer
|
8 |
+
base_model: chargoddard/Yi-34B-Llama
|
9 |
+
model-index:
|
10 |
+
- name: model-update
|
11 |
+
results: []
|
12 |
+
---
|
13 |
+
|
14 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
15 |
+
should probably proofread and complete it, then remove this comment. -->
|
16 |
+
|
17 |
+
# model-update
|
18 |
+
|
19 |
+
This model is a fine-tuned version of [chargoddard/Yi-34B-Llama](https://huggingface.co/chargoddard/Yi-34B-Llama) on the oncc_medqa_instruct dataset.
|
20 |
+
|
21 |
+
## Model description
|
22 |
+
|
23 |
+
More information needed
|
24 |
+
|
25 |
+
## Intended uses & limitations
|
26 |
+
|
27 |
+
More information needed
|
28 |
+
|
29 |
+
## Training and evaluation data
|
30 |
+
|
31 |
+
More information needed
|
32 |
+
|
33 |
+
## Training procedure
|
34 |
+
|
35 |
+
### Training hyperparameters
|
36 |
+
|
37 |
+
The following hyperparameters were used during training:
|
38 |
+
- learning_rate: 0.0005
|
39 |
+
- train_batch_size: 4
|
40 |
+
- eval_batch_size: 8
|
41 |
+
- seed: 42
|
42 |
+
- gradient_accumulation_steps: 4
|
43 |
+
- total_train_batch_size: 16
|
44 |
+
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
45 |
+
- lr_scheduler_type: cosine
|
46 |
+
- lr_scheduler_warmup_steps: 20
|
47 |
+
- num_epochs: 1.0
|
48 |
+
|
49 |
+
### Training results
|
50 |
+
|
51 |
+
|
52 |
+
|
53 |
+
### Framework versions
|
54 |
+
|
55 |
+
- PEFT 0.8.2
|
56 |
+
- Transformers 4.37.2
|
57 |
+
- PyTorch 2.0.1+cu118
|
58 |
+
- Datasets 2.17.0
|
59 |
+
- Tokenizers 0.15.2
|
adapter_config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "chargoddard/Yi-34B-Llama",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"loftq_config": {},
|
12 |
+
"lora_alpha": 16,
|
13 |
+
"lora_dropout": 0.2,
|
14 |
+
"megatron_config": null,
|
15 |
+
"megatron_core": "megatron.core",
|
16 |
+
"modules_to_save": null,
|
17 |
+
"peft_type": "LORA",
|
18 |
+
"r": 8,
|
19 |
+
"rank_pattern": {},
|
20 |
+
"revision": null,
|
21 |
+
"target_modules": [
|
22 |
+
"q_proj",
|
23 |
+
"v_proj"
|
24 |
+
],
|
25 |
+
"task_type": "CAUSAL_LM",
|
26 |
+
"use_rslora": false
|
27 |
+
}
|
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8357b7ef3f4fad6f2633a031dfcf6dd31d672ef7d79baad925e8e9e2500531a3
|
3 |
+
size 43285800
|
all_results.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 1.0,
|
3 |
+
"train_loss": 0.15793546711282777,
|
4 |
+
"train_runtime": 2949.2917,
|
5 |
+
"train_samples_per_second": 1.103,
|
6 |
+
"train_steps_per_second": 0.069
|
7 |
+
}
|
checkpoint-100/README.md
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: peft
|
3 |
+
base_model: chargoddard/Yi-34B-Llama
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
|
201 |
+
|
202 |
+
### Framework versions
|
203 |
+
|
204 |
+
- PEFT 0.8.2
|
checkpoint-100/adapter_config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "chargoddard/Yi-34B-Llama",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"loftq_config": {},
|
12 |
+
"lora_alpha": 16,
|
13 |
+
"lora_dropout": 0.2,
|
14 |
+
"megatron_config": null,
|
15 |
+
"megatron_core": "megatron.core",
|
16 |
+
"modules_to_save": null,
|
17 |
+
"peft_type": "LORA",
|
18 |
+
"r": 8,
|
19 |
+
"rank_pattern": {},
|
20 |
+
"revision": null,
|
21 |
+
"target_modules": [
|
22 |
+
"q_proj",
|
23 |
+
"v_proj"
|
24 |
+
],
|
25 |
+
"task_type": "CAUSAL_LM",
|
26 |
+
"use_rslora": false
|
27 |
+
}
|
checkpoint-100/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6408d2ff97929b628ddc34833988af8230e40e8e9e54b8c5cbe233ec62050657
|
3 |
+
size 43285800
|
checkpoint-100/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fef55b039289447a076815eb66f16d88a270f3f697dca3d4798105ae590fc7ba
|
3 |
+
size 86708485
|
checkpoint-100/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5aa4851eee620994c0ba92e523ba18a1dee79c89af4477f681e6d9dbb50c8f61
|
3 |
+
size 14575
|
checkpoint-100/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eea28048b2b2c95c10ff2d17df0799ff3a4e498d5d4bd7187083a67fbae901d6
|
3 |
+
size 627
|
checkpoint-100/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|startoftext|>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|endoftext|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<unk>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": true,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"unk_token": {
|
24 |
+
"content": "<unk>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": true,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
}
|
30 |
+
}
|
checkpoint-100/tokenization_yi.py
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from shutil import copyfile
|
3 |
+
from typing import Any, Dict, List, Optional, Tuple
|
4 |
+
|
5 |
+
import sentencepiece as spm
|
6 |
+
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
7 |
+
from transformers.utils import logging
|
8 |
+
|
9 |
+
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# Maps the tokenizer's "vocab_file" argument to the on-disk file name;
# `YiTokenizer.save_vocabulary` uses this name for the file it writes.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

# Both maps are empty here: this module registers no pretrained vocabulary
# or tokenizer file locations.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
# Empty: no per-checkpoint maximum input sizes are declared in this module.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
|
18 |
+
|
19 |
+
|
20 |
+
class YiTokenizer(PreTrainedTokenizer):
    """
    Construct a Yi tokenizer backed by a SentencePiece model.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file that holds the vocabulary.
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"<|endoftext|>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The padding token. By default it is the same string as `unk_token`.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments forwarded to `sentencepiece.SentencePieceProcessor`.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS token id.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS token id.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base class constructor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<unk>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped so that no whitespace is stripped on either
        # side of a special token; AddedToken instances are passed through as-is.
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False)
            if isinstance(pad_token, str)
            else pad_token
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # The SentencePiece model is loaded before the base-class constructor runs.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        """Return the instance state with the SentencePiece processor replaced by `None`."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore the instance state and reload the SentencePiece model from `self.vocab_file`."""
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns the number of pieces in the SentencePiece model."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns the vocabulary as a dict mapping token to id, including added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Splits `text` into a list of SentencePiece pieces (strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) to a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                # A space is inserted before a special token unless it is the
                # first token or directly follows another special token.
                if not prev_is_special and i != 0:
                    out_string += " "
                # Flush the pending ordinary pieces, then append the special
                # token verbatim.
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        # Decode whatever ordinary pieces remain after the last special token.
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(
        self, save_directory, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """
        Save the vocabulary (the SentencePiece model file) to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                If given, the file is saved as `<filename_prefix>-tokenizer.model`.

        Returns:
            `Tuple(str)`: Paths to the files saved. An empty tuple is returned (and an error is logged) when
            `save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            # Return an empty tuple rather than None so the result always matches
            # the declared return type and can be concatenated with other tuples.
            return ()
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original model file is gone: write out the serialized model
            # held by the loaded SentencePiece processor instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from one or two sequences by adding BOS/EOS token ids.

        Each sequence is wrapped independently: the BOS id is prepended when `add_bos_token` is set and the EOS id
        is appended when `add_eos_token` is set.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: The concatenated list of token IDs with special tokens added.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        # These are mask fragments (1 marks a special-token position), not token ids.
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        Each segment's length includes the BOS/EOS positions that `add_bos_token` / `add_eos_token` would add.

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
|
checkpoint-100/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
|
3 |
+
size 1033105
|
checkpoint-100/tokenizer_config.json
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": true,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<|startoftext|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": true,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "<|endoftext|>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": true,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"auto_map": {
|
31 |
+
"AutoTokenizer": [
|
32 |
+
"tokenization_yi.YiTokenizer",
|
33 |
+
null
|
34 |
+
]
|
35 |
+
},
|
36 |
+
"bos_token": "<|startoftext|>",
|
37 |
+
"clean_up_tokenization_spaces": false,
|
38 |
+
"eos_token": "<|endoftext|>",
|
39 |
+
"model_max_length": 4096,
|
40 |
+
"pad_token": "<unk>",
|
41 |
+
"padding_side": "right",
|
42 |
+
"sp_model_kwargs": {},
|
43 |
+
"split_special_tokens": false,
|
44 |
+
"tokenizer_class": "YiTokenizer",
|
45 |
+
"unk_token": "<unk>"
|
46 |
+
}
|
checkpoint-100/trainer_state.json
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.4914004914004914,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 100,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.05,
|
13 |
+
"learning_rate": 0.00025,
|
14 |
+
"loss": 1.1555,
|
15 |
+
"step": 10
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 0.1,
|
19 |
+
"learning_rate": 0.0005,
|
20 |
+
"loss": 0.1709,
|
21 |
+
"step": 20
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 0.15,
|
25 |
+
"learning_rate": 0.0004963251406715272,
|
26 |
+
"loss": 0.1324,
|
27 |
+
"step": 30
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 0.2,
|
31 |
+
"learning_rate": 0.0004854085994147815,
|
32 |
+
"loss": 0.1154,
|
33 |
+
"step": 40
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 0.25,
|
37 |
+
"learning_rate": 0.00046757131025753886,
|
38 |
+
"loss": 0.1094,
|
39 |
+
"step": 50
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 0.29,
|
43 |
+
"learning_rate": 0.00044333766942743246,
|
44 |
+
"loss": 0.1093,
|
45 |
+
"step": 60
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 0.34,
|
49 |
+
"learning_rate": 0.0004134201186930015,
|
50 |
+
"loss": 0.0908,
|
51 |
+
"step": 70
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.39,
|
55 |
+
"learning_rate": 0.00037869820037745775,
|
56 |
+
"loss": 0.1121,
|
57 |
+
"step": 80
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 0.44,
|
61 |
+
"learning_rate": 0.0003401926998041959,
|
62 |
+
"loss": 0.0932,
|
63 |
+
"step": 90
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 0.49,
|
67 |
+
"learning_rate": 0.00029903563535704927,
|
68 |
+
"loss": 0.0862,
|
69 |
+
"step": 100
|
70 |
+
}
|
71 |
+
],
|
72 |
+
"logging_steps": 10,
|
73 |
+
"max_steps": 203,
|
74 |
+
"num_input_tokens_seen": 0,
|
75 |
+
"num_train_epochs": 1,
|
76 |
+
"save_steps": 100,
|
77 |
+
"total_flos": 1.594758970469253e+17,
|
78 |
+
"train_batch_size": 4,
|
79 |
+
"trial_name": null,
|
80 |
+
"trial_params": null
|
81 |
+
}
|
checkpoint-100/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84b205a7d9c952643ccebb1d24fa6807e9ccf85117b0cbdd58aff8e0c08e1cf9
|
3 |
+
size 4411
|
checkpoint-200/README.md
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: peft
|
3 |
+
base_model: /workspace/model
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
|
201 |
+
|
202 |
+
### Framework versions
|
203 |
+
|
204 |
+
- PEFT 0.8.2
|
checkpoint-200/adapter_config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "/workspace/model",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"loftq_config": {},
|
12 |
+
"lora_alpha": 16,
|
13 |
+
"lora_dropout": 0.2,
|
14 |
+
"megatron_config": null,
|
15 |
+
"megatron_core": "megatron.core",
|
16 |
+
"modules_to_save": null,
|
17 |
+
"peft_type": "LORA",
|
18 |
+
"r": 8,
|
19 |
+
"rank_pattern": {},
|
20 |
+
"revision": null,
|
21 |
+
"target_modules": [
|
22 |
+
"q_proj",
|
23 |
+
"v_proj"
|
24 |
+
],
|
25 |
+
"task_type": "CAUSAL_LM",
|
26 |
+
"use_rslora": false
|
27 |
+
}
|
checkpoint-200/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3dfc623e59f866c04640e09ed166056acdf9b04afb5e788aa39bb1edcf5fcd91
|
3 |
+
size 43285800
|
checkpoint-200/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7adb1a0215611c65e570002bc5374a85d09b48c7283ab25d67e55f272bc3d7da
|
3 |
+
size 86708485
|
checkpoint-200/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1158a2d0a5160b25e5e56d91400d6234b8802dfb077d53ca5daadd78ec49e76e
|
3 |
+
size 14575
|
checkpoint-200/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db0acb3b8d508c274423278e116dc4ae36967c330f90ba70aa5db2466effebe8
|
3 |
+
size 627
|
checkpoint-200/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|startoftext|>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|endoftext|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<unk>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": true,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"unk_token": {
|
24 |
+
"content": "<unk>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": true,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
}
|
30 |
+
}
|
checkpoint-200/tokenization_yi.py
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from shutil import copyfile
|
3 |
+
from typing import Any, Dict, List, Optional, Tuple
|
4 |
+
|
5 |
+
import sentencepiece as spm
|
6 |
+
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
7 |
+
from transformers.utils import logging
|
8 |
+
|
9 |
+
# Module-level logger, obtained through the logging helper imported from transformers.utils.
logger = logging.get_logger(__name__)

# Name of the tokenizer's vocabulary argument -> file name used when the vocabulary is saved.
VOCAB_FILES_NAMES = dict(vocab_file="tokenizer.model")

# Both entries are intentionally empty mappings; nothing is registered in this module.
PRETRAINED_VOCAB_FILES_MAP = dict(vocab_file=dict(), tokenizer_file=dict())

# Empty mapping; no maximum input sizes are registered in this module.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict()
|
18 |
+
|
19 |
+
|
20 |
+
class YiTokenizer(PreTrainedTokenizer):
    """
    Construct a Yi tokenizer on top of a SentencePiece model.

    Tokenization, token/id lookup and decoding are all delegated to the
    `sentencepiece` processor that is loaded from `vocab_file`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"<|endoftext|>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The padding token. Note that the default is the same string as `unk_token`.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments forwarded to `spm.SentencePieceProcessor`. `None` is treated as `{}`.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id to each sequence.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS id to each sequence.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base-class constructor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<unk>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped in AddedToken with stripping disabled on both
        # sides; AddedToken instances supplied by the caller are used untouched.
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False)
            if isinstance(pad_token, str)
            else pad_token
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # The SentencePiece model is loaded BEFORE the base-class constructor runs.
        # NOTE(review): presumably the base constructor may already query the
        # vocabulary (e.g. through `get_vocab`) -- keep this ordering.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        """Return the instance state for pickling, with the SentencePiece processor replaced by `None`."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore pickled state and reload the SentencePiece processor from `self.vocab_file`."""
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size (the piece count of the SentencePiece model; added tokens are not counted)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict mapping token to id, including tokens from `added_tokens_encoder`."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        # Added tokens are merged last, so they override base entries with the same key.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string (a list of SentencePiece pieces as `str`)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (string) in a single string.

        Runs of ordinary tokens are decoded with the SentencePiece model. Special
        tokens are copied to the output verbatim, and a single space is inserted
        before a special token unless it is the first token or directly follows
        another special token.

        Args:
            tokens (`List[str]`):
                Tokens to join.

        Returns:
            `str`: The decoded text.
        """
        # `all_special_tokens` is read once up front instead of on every iteration;
        # it does not change while this method runs.
        special_tokens = self.all_special_tokens
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                # Flush the pending ordinary tokens, then emit the special token as-is.
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        # Flush whatever ordinary tokens remain after the last special token.
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(
        self, save_directory, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                If given, the output file is named `<filename_prefix>-tokenizer.model`.

        Returns:
            `Tuple(str)`: Paths to the files saved. If `save_directory` is not an existing
            directory, an error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            # The source file exists and is not the destination itself: plain copy.
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The source file is gone: re-create it from the in-memory model.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from one or two sequences by adding BOS/EOS ids.

        Each sequence is wrapped independently: the BOS id is prepended when
        `add_bos_token` is set and the EOS id is appended when `add_eos_token` is set.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: The concatenated ids including the configured special tokens.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        # Mask fragments mirroring the layout of `build_inputs_with_special_tokens`:
        # a single 1 where a BOS/EOS id would be inserted, nothing otherwise.
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        The BOS/EOS positions added around each sequence (when enabled) receive the same type id as the
        sequence they wrap.

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
|
checkpoint-200/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
|
3 |
+
size 1033105
|
checkpoint-200/tokenizer_config.json
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": true,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<|startoftext|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": true,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "<|endoftext|>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": true,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"auto_map": {
|
31 |
+
"AutoTokenizer": [
|
32 |
+
"tokenization_yi.YiTokenizer",
|
33 |
+
null
|
34 |
+
]
|
35 |
+
},
|
36 |
+
"bos_token": "<|startoftext|>",
|
37 |
+
"clean_up_tokenization_spaces": false,
|
38 |
+
"eos_token": "<|endoftext|>",
|
39 |
+
"model_max_length": 4096,
|
40 |
+
"pad_token": "<unk>",
|
41 |
+
"padding_side": "right",
|
42 |
+
"sp_model_kwargs": {},
|
43 |
+
"split_special_tokens": false,
|
44 |
+
"tokenizer_class": "YiTokenizer",
|
45 |
+
"unk_token": "<unk>"
|
46 |
+
}
|
checkpoint-200/trainer_state.json
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.9828009828009828,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 200,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.05,
|
13 |
+
"learning_rate": 0.00025,
|
14 |
+
"loss": 1.1555,
|
15 |
+
"step": 10
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 0.1,
|
19 |
+
"learning_rate": 0.0005,
|
20 |
+
"loss": 0.1709,
|
21 |
+
"step": 20
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 0.15,
|
25 |
+
"learning_rate": 0.0004963251406715272,
|
26 |
+
"loss": 0.1324,
|
27 |
+
"step": 30
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 0.2,
|
31 |
+
"learning_rate": 0.0004854085994147815,
|
32 |
+
"loss": 0.1154,
|
33 |
+
"step": 40
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 0.25,
|
37 |
+
"learning_rate": 0.00046757131025753886,
|
38 |
+
"loss": 0.1094,
|
39 |
+
"step": 50
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 0.29,
|
43 |
+
"learning_rate": 0.00044333766942743246,
|
44 |
+
"loss": 0.1093,
|
45 |
+
"step": 60
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 0.34,
|
49 |
+
"learning_rate": 0.0004134201186930015,
|
50 |
+
"loss": 0.0908,
|
51 |
+
"step": 70
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.39,
|
55 |
+
"learning_rate": 0.00037869820037745775,
|
56 |
+
"loss": 0.1121,
|
57 |
+
"step": 80
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 0.44,
|
61 |
+
"learning_rate": 0.0003401926998041959,
|
62 |
+
"loss": 0.0932,
|
63 |
+
"step": 90
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 0.49,
|
67 |
+
"learning_rate": 0.00029903563535704927,
|
68 |
+
"loss": 0.0862,
|
69 |
+
"step": 100
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 0.54,
|
73 |
+
"learning_rate": 0.0002564369784137472,
|
74 |
+
"loss": 0.0996,
|
75 |
+
"step": 110
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 0.59,
|
79 |
+
"learning_rate": 0.00021364908154907752,
|
80 |
+
"loss": 0.1059,
|
81 |
+
"step": 120
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 0.64,
|
85 |
+
"learning_rate": 0.00017192986077855136,
|
86 |
+
"loss": 0.1097,
|
87 |
+
"step": 130
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 0.69,
|
91 |
+
"learning_rate": 0.0001325058142431701,
|
92 |
+
"loss": 0.1096,
|
93 |
+
"step": 140
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.74,
|
97 |
+
"learning_rate": 9.653596454434699e-05,
|
98 |
+
"loss": 0.1032,
|
99 |
+
"step": 150
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 0.79,
|
103 |
+
"learning_rate": 6.507778478375834e-05,
|
104 |
+
"loss": 0.0968,
|
105 |
+
"step": 160
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 0.84,
|
109 |
+
"learning_rate": 3.905611004420359e-05,
|
110 |
+
"loss": 0.0951,
|
111 |
+
"step": 170
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 0.88,
|
115 |
+
"learning_rate": 1.9235948278956e-05,
|
116 |
+
"loss": 0.102,
|
117 |
+
"step": 180
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 0.93,
|
121 |
+
"learning_rate": 6.199989938854372e-06,
|
122 |
+
"loss": 0.0922,
|
123 |
+
"step": 190
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 0.98,
|
127 |
+
"learning_rate": 3.314775287923677e-07,
|
128 |
+
"loss": 0.0869,
|
129 |
+
"step": 200
|
130 |
+
}
|
131 |
+
],
|
132 |
+
"logging_steps": 10,
|
133 |
+
"max_steps": 203,
|
134 |
+
"num_input_tokens_seen": 0,
|
135 |
+
"num_train_epochs": 1,
|
136 |
+
"save_steps": 100,
|
137 |
+
"total_flos": 3.2162362793617e+17,
|
138 |
+
"train_batch_size": 4,
|
139 |
+
"trial_name": null,
|
140 |
+
"trial_params": null
|
141 |
+
}
|
checkpoint-200/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84b205a7d9c952643ccebb1d24fa6807e9ccf85117b0cbdd58aff8e0c08e1cf9
|
3 |
+
size 4411
|
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|startoftext|>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|endoftext|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<unk>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": true,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"unk_token": {
|
24 |
+
"content": "<unk>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": true,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
}
|
30 |
+
}
|
tokenization_yi.py
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from shutil import copyfile
|
3 |
+
from typing import Any, Dict, List, Optional, Tuple
|
4 |
+
|
5 |
+
import sentencepiece as spm
|
6 |
+
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
7 |
+
from transformers.utils import logging
|
8 |
+
|
9 |
+
# Module-level logger, obtained through the logging helper imported from transformers.utils.
logger = logging.get_logger(__name__)

# Name of the tokenizer's vocabulary argument -> file name used when the vocabulary is saved.
VOCAB_FILES_NAMES = dict(vocab_file="tokenizer.model")

# Both entries are intentionally empty mappings; nothing is registered in this module.
PRETRAINED_VOCAB_FILES_MAP = dict(vocab_file=dict(), tokenizer_file=dict())

# Empty mapping; no maximum input sizes are registered in this module.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict()
|
18 |
+
|
19 |
+
|
20 |
+
class YiTokenizer(PreTrainedTokenizer):
|
21 |
+
"""
|
22 |
+
Construct a Yi tokenizer. Based on byte-level Byte-Pair-Encoding.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
vocab_file (`str`):
|
26 |
+
Path to the vocabulary file.
|
27 |
+
"""
|
28 |
+
|
29 |
+
vocab_files_names = VOCAB_FILES_NAMES
|
30 |
+
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
31 |
+
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
32 |
+
model_input_names = ["input_ids", "attention_mask"]
|
33 |
+
|
34 |
+
def __init__(
|
35 |
+
self,
|
36 |
+
vocab_file,
|
37 |
+
unk_token="<unk>",
|
38 |
+
bos_token="<|startoftext|>",
|
39 |
+
eos_token="<|endoftext|>",
|
40 |
+
pad_token="<unk>",
|
41 |
+
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
42 |
+
add_bos_token=True,
|
43 |
+
add_eos_token=False,
|
44 |
+
clean_up_tokenization_spaces=False,
|
45 |
+
**kwargs,
|
46 |
+
):
|
47 |
+
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
48 |
+
bos_token = (
|
49 |
+
AddedToken(bos_token, lstrip=False, rstrip=False)
|
50 |
+
if isinstance(bos_token, str)
|
51 |
+
else bos_token
|
52 |
+
)
|
53 |
+
eos_token = (
|
54 |
+
AddedToken(eos_token, lstrip=False, rstrip=False)
|
55 |
+
if isinstance(eos_token, str)
|
56 |
+
else eos_token
|
57 |
+
)
|
58 |
+
unk_token = (
|
59 |
+
AddedToken(unk_token, lstrip=False, rstrip=False)
|
60 |
+
if isinstance(unk_token, str)
|
61 |
+
else unk_token
|
62 |
+
)
|
63 |
+
pad_token = (
|
64 |
+
AddedToken(pad_token, lstrip=False, rstrip=False)
|
65 |
+
if isinstance(pad_token, str)
|
66 |
+
else pad_token
|
67 |
+
)
|
68 |
+
self.vocab_file = vocab_file
|
69 |
+
self.add_bos_token = add_bos_token
|
70 |
+
self.add_eos_token = add_eos_token
|
71 |
+
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
72 |
+
self.sp_model.Load(vocab_file)
|
73 |
+
super().__init__(
|
74 |
+
bos_token=bos_token,
|
75 |
+
eos_token=eos_token,
|
76 |
+
unk_token=unk_token,
|
77 |
+
pad_token=pad_token,
|
78 |
+
add_bos_token=add_bos_token,
|
79 |
+
add_eos_token=add_eos_token,
|
80 |
+
sp_model_kwargs=self.sp_model_kwargs,
|
81 |
+
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
82 |
+
**kwargs,
|
83 |
+
)
|
84 |
+
|
85 |
+
def __getstate__(self):
|
86 |
+
state = self.__dict__.copy()
|
87 |
+
state["sp_model"] = None
|
88 |
+
return state
|
89 |
+
|
90 |
+
def __setstate__(self, d):
    """Adopt `d` as the instance dict, then rebuild `sp_model` by loading `self.vocab_file`."""
    self.__dict__ = d
    processor_kwargs = self.sp_model_kwargs
    model_path = self.vocab_file
    # The processor is attached to the instance first and loaded afterwards.
    self.sp_model = spm.SentencePieceProcessor(**processor_kwargs)
    self.sp_model.Load(model_path)
|
94 |
+
|
95 |
+
@property
def vocab_size(self):
    """Number of pieces reported by the SentencePiece model."""
    piece_count = self.sp_model.get_piece_size()
    return piece_count
|
99 |
+
|
100 |
+
def get_vocab(self):
    """Return a token -> id dict for ids `0..vocab_size-1`, updated with `added_tokens_encoder`."""
    token_to_id = {}
    for token_id in range(self.vocab_size):
        token_to_id[self.convert_ids_to_tokens(token_id)] = token_id
    # Entries from `added_tokens_encoder` are applied last, so they win on key collisions.
    token_to_id.update(self.added_tokens_encoder)
    return token_to_id
|
105 |
+
|
106 |
+
def _tokenize(self, text):
|
107 |
+
"""Returns a tokenized string."""
|
108 |
+
return self.sp_model.encode(text, out_type=str)
|
109 |
+
|
110 |
+
def _convert_token_to_id(self, token):
|
111 |
+
"""Converts a token (str) in an id using the vocab."""
|
112 |
+
return self.sp_model.piece_to_id(token)
|
113 |
+
|
114 |
+
def _convert_id_to_token(self, index):
|
115 |
+
"""Converts an index (integer) in a token (str) using the vocab."""
|
116 |
+
token = self.sp_model.IdToPiece(index)
|
117 |
+
return token
|
118 |
+
|
119 |
+
def convert_tokens_to_string(self, tokens):
    """
    Join a sequence of tokens (str) into a single string.

    Tokens listed in `self.all_special_tokens` are emitted verbatim; every run of other tokens
    between them is decoded with the SentencePiece model.
    """
    pieces = []
    pending = []
    last_was_special = False
    for position, token in enumerate(tokens):
        if token not in self.all_special_tokens:
            pending.append(token)
            last_was_special = False
            continue

        # Special tokens bypass SentencePiece decoding.
        if position != 0 and not last_was_special:
            # The separator is emitted ahead of the decoded pending run.
            pieces.append(" ")
        pieces.append(self.sp_model.decode(pending))
        pieces.append(token)
        pending = []
        last_was_special = True

    # Flush whatever ordinary tokens remain after the last special token.
    pieces.append(self.sp_model.decode(pending))
    return "".join(pieces)
|
137 |
+
|
138 |
+
def save_vocabulary(
    self, save_directory, filename_prefix: Optional[str] = None
) -> Tuple[str]:
    """
    Write the vocabulary file into `save_directory`.

    Args:
        save_directory (`str`):
            Target directory; it must already exist.
        filename_prefix (`str`, *optional*):
            When truthy, it is prepended to the file name followed by a `-`.

    Returns:
        `Tuple(str)`: One-element tuple with the output path. If `save_directory` is not a
        directory, an error is logged and `None` is returned instead.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(
        save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"]
    )

    if not os.path.isfile(self.vocab_file):
        # The source file is not on disk: dump the in-memory model instead.
        with open(out_vocab_file, "wb") as handle:
            handle.write(self.sp_model.serialized_model_proto())
    elif os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
        # The source exists at a different path: copy it. Same path means nothing to do.
        copyfile(self.vocab_file, out_vocab_file)

    return (out_vocab_file,)
|
170 |
+
|
171 |
+
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Surround each sequence with the BOS/EOS ids that are enabled on this tokenizer.

    The BOS id is prepended when `self.add_bos_token` is truthy and the EOS id is appended when
    `self.add_eos_token` is truthy. If `token_ids_1` is given it is wrapped the same way and
    concatenated after the first sequence.
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    first = prefix + token_ids_0 + suffix
    if token_ids_1 is None:
        return first
    return first + prefix + token_ids_1 + suffix
|
181 |
+
|
182 |
+
def get_special_tokens_mask(
    self,
    token_ids_0: List[int],
    token_ids_1: Optional[List[int]] = None,
    already_has_special_tokens: bool = False,
) -> List[int]:
    """
    Build a 0/1 mask marking where special tokens sit once BOS/EOS have been added.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            When true, the computation is delegated to the parent class implementation.

    Returns:
        `List[int]`: 1 at every BOS/EOS position, 0 for each id of the input sequence(s).
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0,
            token_ids_1=token_ids_1,
            already_has_special_tokens=True,
        )

    # One-element markers mirroring the optional BOS / EOS slots.
    bos_mark = [1] if self.add_bos_token else []
    eos_mark = [1] if self.add_eos_token else []

    mask = bos_mark + [0] * len(token_ids_0) + eos_mark
    if token_ids_1 is not None:
        mask = mask + bos_mark + [0] * len(token_ids_1) + eos_mark
    return mask
|
223 |
+
|
224 |
+
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Produce token type ids for one sequence or a pair of sequences.

    Every position of the first sequence, including its optional BOS/EOS slots, gets 0; every
    position of the second sequence, again including its BOS/EOS slots, gets 1:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    Args:
        token_ids_0 (`List[int]`):
            List of ids.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: Token type ids; only the leading run of 0s when `token_ids_1` is None.
    """
    bos = [self.bos_token_id] if self.add_bos_token else []
    eos = [self.eos_token_id] if self.add_eos_token else []

    type_ids = [0] * len(bos + token_ids_0 + eos)
    if token_ids_1 is None:
        return type_ids
    return type_ids + [1] * len(bos + token_ids_1 + eos)
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
|
3 |
+
size 1033105
|
tokenizer_config.json
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": true,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<|startoftext|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": true,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "<|endoftext|>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": true,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"auto_map": {
|
31 |
+
"AutoTokenizer": [
|
32 |
+
"tokenization_yi.YiTokenizer",
|
33 |
+
null
|
34 |
+
]
|
35 |
+
},
|
36 |
+
"bos_token": "<|startoftext|>",
|
37 |
+
"clean_up_tokenization_spaces": false,
|
38 |
+
"eos_token": "<|endoftext|>",
|
39 |
+
"model_max_length": 4096,
|
40 |
+
"pad_token": "<unk>",
|
41 |
+
"padding_side": "right",
|
42 |
+
"sp_model_kwargs": {},
|
43 |
+
"split_special_tokens": false,
|
44 |
+
"tokenizer_class": "YiTokenizer",
|
45 |
+
"unk_token": "<unk>"
|
46 |
+
}
|
train_results.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 1.0,
|
3 |
+
"train_loss": 0.15793546711282777,
|
4 |
+
"train_runtime": 2949.2917,
|
5 |
+
"train_samples_per_second": 1.103,
|
6 |
+
"train_steps_per_second": 0.069
|
7 |
+
}
|
trainer_log.jsonl
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"current_steps": 10, "total_steps": 203, "loss": 1.1555, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00025, "epoch": 0.05, "percentage": 4.93, "elapsed_time": "0:02:24", "remaining_time": "0:46:34"}
|
2 |
+
{"current_steps": 20, "total_steps": 203, "loss": 0.1709, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005, "epoch": 0.1, "percentage": 9.85, "elapsed_time": "0:04:48", "remaining_time": "0:43:57"}
|
3 |
+
{"current_steps": 30, "total_steps": 203, "loss": 0.1324, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004963251406715272, "epoch": 0.15, "percentage": 14.78, "elapsed_time": "0:07:10", "remaining_time": "0:41:20"}
|
4 |
+
{"current_steps": 40, "total_steps": 203, "loss": 0.1154, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004854085994147815, "epoch": 0.2, "percentage": 19.7, "elapsed_time": "0:09:36", "remaining_time": "0:39:08"}
|
5 |
+
{"current_steps": 50, "total_steps": 203, "loss": 0.1094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00046757131025753886, "epoch": 0.25, "percentage": 24.63, "elapsed_time": "0:12:01", "remaining_time": "0:36:46"}
|
6 |
+
{"current_steps": 60, "total_steps": 203, "loss": 0.1093, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00044333766942743246, "epoch": 0.29, "percentage": 29.56, "elapsed_time": "0:14:27", "remaining_time": "0:34:28"}
|
7 |
+
{"current_steps": 70, "total_steps": 203, "loss": 0.0908, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004134201186930015, "epoch": 0.34, "percentage": 34.48, "elapsed_time": "0:16:54", "remaining_time": "0:32:07"}
|
8 |
+
{"current_steps": 80, "total_steps": 203, "loss": 0.1121, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00037869820037745775, "epoch": 0.39, "percentage": 39.41, "elapsed_time": "0:19:20", "remaining_time": "0:29:44"}
|
9 |
+
{"current_steps": 90, "total_steps": 203, "loss": 0.0932, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003401926998041959, "epoch": 0.44, "percentage": 44.33, "elapsed_time": "0:21:43", "remaining_time": "0:27:16"}
|
10 |
+
{"current_steps": 100, "total_steps": 203, "loss": 0.0862, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00029903563535704927, "epoch": 0.49, "percentage": 49.26, "elapsed_time": "0:24:02", "remaining_time": "0:24:45"}
|
11 |
+
{"current_steps": 110, "total_steps": 203, "loss": 0.0996, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002564369784137472, "epoch": 0.54, "percentage": 54.19, "elapsed_time": "0:26:27", "remaining_time": "0:22:21"}
|
12 |
+
{"current_steps": 120, "total_steps": 203, "loss": 0.1059, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021364908154907752, "epoch": 0.59, "percentage": 59.11, "elapsed_time": "0:29:00", "remaining_time": "0:20:04"}
|
13 |
+
{"current_steps": 130, "total_steps": 203, "loss": 0.1097, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017192986077855136, "epoch": 0.64, "percentage": 64.04, "elapsed_time": "0:31:25", "remaining_time": "0:17:38"}
|
14 |
+
{"current_steps": 140, "total_steps": 203, "loss": 0.1096, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001325058142431701, "epoch": 0.69, "percentage": 68.97, "elapsed_time": "0:33:52", "remaining_time": "0:15:14"}
|
15 |
+
{"current_steps": 150, "total_steps": 203, "loss": 0.1032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.653596454434699e-05, "epoch": 0.74, "percentage": 73.89, "elapsed_time": "0:36:17", "remaining_time": "0:12:49"}
|
16 |
+
{"current_steps": 160, "total_steps": 203, "loss": 0.0968, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.507778478375834e-05, "epoch": 0.79, "percentage": 78.82, "elapsed_time": "0:38:44", "remaining_time": "0:10:24"}
|
17 |
+
{"current_steps": 170, "total_steps": 203, "loss": 0.0951, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.905611004420359e-05, "epoch": 0.84, "percentage": 83.74, "elapsed_time": "0:41:10", "remaining_time": "0:07:59"}
|
18 |
+
{"current_steps": 180, "total_steps": 203, "loss": 0.102, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9235948278956e-05, "epoch": 0.88, "percentage": 88.67, "elapsed_time": "0:43:37", "remaining_time": "0:05:34"}
|
19 |
+
{"current_steps": 190, "total_steps": 203, "loss": 0.0922, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.199989938854372e-06, "epoch": 0.93, "percentage": 93.6, "elapsed_time": "0:45:59", "remaining_time": "0:03:08"}
|
20 |
+
{"current_steps": 200, "total_steps": 203, "loss": 0.0869, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.314775287923677e-07, "epoch": 0.98, "percentage": 98.52, "elapsed_time": "0:48:23", "remaining_time": "0:00:43"}
|
21 |
+
{"current_steps": 203, "total_steps": 203, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:49:09", "remaining_time": "0:00:00"}
|
trainer_state.json
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.9975429975429976,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 203,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.05,
|
13 |
+
"learning_rate": 0.00025,
|
14 |
+
"loss": 1.1555,
|
15 |
+
"step": 10
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 0.1,
|
19 |
+
"learning_rate": 0.0005,
|
20 |
+
"loss": 0.1709,
|
21 |
+
"step": 20
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 0.15,
|
25 |
+
"learning_rate": 0.0004963251406715272,
|
26 |
+
"loss": 0.1324,
|
27 |
+
"step": 30
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 0.2,
|
31 |
+
"learning_rate": 0.0004854085994147815,
|
32 |
+
"loss": 0.1154,
|
33 |
+
"step": 40
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 0.25,
|
37 |
+
"learning_rate": 0.00046757131025753886,
|
38 |
+
"loss": 0.1094,
|
39 |
+
"step": 50
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 0.29,
|
43 |
+
"learning_rate": 0.00044333766942743246,
|
44 |
+
"loss": 0.1093,
|
45 |
+
"step": 60
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 0.34,
|
49 |
+
"learning_rate": 0.0004134201186930015,
|
50 |
+
"loss": 0.0908,
|
51 |
+
"step": 70
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.39,
|
55 |
+
"learning_rate": 0.00037869820037745775,
|
56 |
+
"loss": 0.1121,
|
57 |
+
"step": 80
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 0.44,
|
61 |
+
"learning_rate": 0.0003401926998041959,
|
62 |
+
"loss": 0.0932,
|
63 |
+
"step": 90
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 0.49,
|
67 |
+
"learning_rate": 0.00029903563535704927,
|
68 |
+
"loss": 0.0862,
|
69 |
+
"step": 100
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 0.54,
|
73 |
+
"learning_rate": 0.0002564369784137472,
|
74 |
+
"loss": 0.0996,
|
75 |
+
"step": 110
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 0.59,
|
79 |
+
"learning_rate": 0.00021364908154907752,
|
80 |
+
"loss": 0.1059,
|
81 |
+
"step": 120
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 0.64,
|
85 |
+
"learning_rate": 0.00017192986077855136,
|
86 |
+
"loss": 0.1097,
|
87 |
+
"step": 130
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 0.69,
|
91 |
+
"learning_rate": 0.0001325058142431701,
|
92 |
+
"loss": 0.1096,
|
93 |
+
"step": 140
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.74,
|
97 |
+
"learning_rate": 9.653596454434699e-05,
|
98 |
+
"loss": 0.1032,
|
99 |
+
"step": 150
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 0.79,
|
103 |
+
"learning_rate": 6.507778478375834e-05,
|
104 |
+
"loss": 0.0968,
|
105 |
+
"step": 160
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 0.84,
|
109 |
+
"learning_rate": 3.905611004420359e-05,
|
110 |
+
"loss": 0.0951,
|
111 |
+
"step": 170
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 0.88,
|
115 |
+
"learning_rate": 1.9235948278956e-05,
|
116 |
+
"loss": 0.102,
|
117 |
+
"step": 180
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 0.93,
|
121 |
+
"learning_rate": 6.199989938854372e-06,
|
122 |
+
"loss": 0.0922,
|
123 |
+
"step": 190
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 0.98,
|
127 |
+
"learning_rate": 3.314775287923677e-07,
|
128 |
+
"loss": 0.0869,
|
129 |
+
"step": 200
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 1.0,
|
133 |
+
"step": 203,
|
134 |
+
"total_flos": 3.266870789007606e+17,
|
135 |
+
"train_loss": 0.15793546711282777,
|
136 |
+
"train_runtime": 2949.2917,
|
137 |
+
"train_samples_per_second": 1.103,
|
138 |
+
"train_steps_per_second": 0.069
|
139 |
+
}
|
140 |
+
],
|
141 |
+
"logging_steps": 10,
|
142 |
+
"max_steps": 203,
|
143 |
+
"num_input_tokens_seen": 0,
|
144 |
+
"num_train_epochs": 1,
|
145 |
+
"save_steps": 100,
|
146 |
+
"total_flos": 3.266870789007606e+17,
|
147 |
+
"train_batch_size": 4,
|
148 |
+
"trial_name": null,
|
149 |
+
"trial_params": null
|
150 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84b205a7d9c952643ccebb1d24fa6807e9ccf85117b0cbdd58aff8e0c08e1cf9
|
3 |
+
size 4411
|