Sylvia2025 committed on
Commit
5f9c60b
1 Parent(s): 6457b0d

Upload folder using huggingface_hub

Browse files
Files changed (48) hide show
  1. README.md +59 -0
  2. adapter_config.json +28 -0
  3. adapter_model.safetensors +3 -0
  4. all_results.json +7 -0
  5. checkpoint-100/README.md +202 -0
  6. checkpoint-100/adapter_config.json +28 -0
  7. checkpoint-100/adapter_model.safetensors +3 -0
  8. checkpoint-100/optimizer.pt +3 -0
  9. checkpoint-100/rng_state.pth +3 -0
  10. checkpoint-100/scheduler.pt +3 -0
  11. checkpoint-100/special_tokens_map.json +24 -0
  12. checkpoint-100/tokenization_baichuan.py +251 -0
  13. checkpoint-100/tokenizer.model +3 -0
  14. checkpoint-100/tokenizer_config.json +47 -0
  15. checkpoint-100/trainer_state.json +141 -0
  16. checkpoint-100/training_args.bin +3 -0
  17. checkpoint-200/README.md +202 -0
  18. checkpoint-200/adapter_config.json +28 -0
  19. checkpoint-200/adapter_model.safetensors +3 -0
  20. checkpoint-200/optimizer.pt +3 -0
  21. checkpoint-200/rng_state.pth +3 -0
  22. checkpoint-200/scheduler.pt +3 -0
  23. checkpoint-200/special_tokens_map.json +24 -0
  24. checkpoint-200/tokenization_baichuan.py +251 -0
  25. checkpoint-200/tokenizer.model +3 -0
  26. checkpoint-200/tokenizer_config.json +47 -0
  27. checkpoint-200/trainer_state.json +261 -0
  28. checkpoint-200/training_args.bin +3 -0
  29. checkpoint-300/README.md +202 -0
  30. checkpoint-300/adapter_config.json +28 -0
  31. checkpoint-300/adapter_model.safetensors +3 -0
  32. checkpoint-300/optimizer.pt +3 -0
  33. checkpoint-300/rng_state.pth +3 -0
  34. checkpoint-300/scheduler.pt +3 -0
  35. checkpoint-300/special_tokens_map.json +24 -0
  36. checkpoint-300/tokenization_baichuan.py +251 -0
  37. checkpoint-300/tokenizer.model +3 -0
  38. checkpoint-300/tokenizer_config.json +47 -0
  39. checkpoint-300/trainer_state.json +381 -0
  40. checkpoint-300/training_args.bin +3 -0
  41. special_tokens_map.json +24 -0
  42. tokenization_baichuan.py +251 -0
  43. tokenizer.model +3 -0
  44. tokenizer_config.json +47 -0
  45. train_results.json +7 -0
  46. trainer_log.jsonl +76 -0
  47. trainer_state.json +480 -0
  48. training_args.bin +3 -0
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: baichuan-inc/Baichuan-7B
9
+ model-index:
10
+ - name: train_2024-04-24-13-17-50
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # train_2024-04-24-13-17-50
18
+
19
+ This model is a fine-tuned version of [baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B) on the alpaca_gpt4_zh and the alpaca_zh datasets.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0002
39
+ - train_batch_size: 2
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - gradient_accumulation_steps: 8
43
+ - total_train_batch_size: 16
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: cosine
46
+ - num_epochs: 3.0
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+
52
+
53
+ ### Framework versions
54
+
55
+ - PEFT 0.10.0
56
+ - Transformers 4.37.2
57
+ - Pytorch 2.1.2+cu121
58
+ - Datasets 2.19.0
59
+ - Tokenizers 0.15.2
adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "baichuan-inc/Baichuan-7B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "W_pack"
24
+ ],
25
+ "task_type": "CAUSAL_LM",
26
+ "use_dora": false,
27
+ "use_rslora": false
28
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc78daf24caf5389f7e184abb48d6d551b8b0237f4d38b40783a3879fe2e72af
3
+ size 16785760
all_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 1.628477378845215,
4
+ "train_runtime": 1045.5144,
5
+ "train_samples_per_second": 5.739,
6
+ "train_steps_per_second": 0.359
7
+ }
checkpoint-100/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: baichuan-inc/Baichuan-7B
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "baichuan-inc/Baichuan-7B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "W_pack"
24
+ ],
25
+ "task_type": "CAUSAL_LM",
26
+ "use_dora": false,
27
+ "use_rslora": false
28
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:964dbe013935f9a4a0622cfc65c00eb8a1b12a09be0c11bbff411331d1481bb4
3
+ size 16785760
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e46fb0086b5ce076a8ddcab4e3f5e18ae88af0044725aaf062561a4836205b48
3
+ size 33608634
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36
3
+ size 14244
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4edd7cee06c7bbd4d1f7764b46a812cfc788b88695132d1b9ae0efb29eb9229c
3
+ size 1064
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-100/tokenization_baichuan.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import os
22
+ from shutil import copyfile
23
+ from typing import Any, Dict, List, Optional, Tuple
24
+
25
+ import sentencepiece as spm
26
+
27
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
28
+ from transformers.utils import logging
29
+
30
+
31
# Module-level logger obtained through the transformers logging helpers.
logger = logging.get_logger(__name__)

# Maps the tokenizer's vocabulary argument name to the filename used on disk;
# `save_vocabulary` uses this entry to name the file it writes.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

# Both maps are left empty in this file: no hosted vocabulary or tokenizer
# files are registered here.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
# Left empty in this file: no per-checkpoint maximum input sizes are registered.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
40
+
41
+
42
class BaiChuanTokenizer(PreTrainedTokenizer):
    """
    Construct a BaiChuan tokenizer backed by a SentencePiece model.

    The SentencePiece model stored at `vocab_file` is loaded when the tokenizer is created and performs the
    text <-> piece <-> id conversions. Whether BOS/EOS ids are added around each sequence is controlled by
    `add_bos_token` and `add_eos_token`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*):
            The padding token. No padding token is set when left as `None`.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments forwarded to `sentencepiece.SentencePieceProcessor`.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether to prepend the BOS id in `build_inputs_with_special_tokens`.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether to append the EOS id in `build_inputs_with_special_tokens`.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base class.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped so that no whitespace stripping is applied around special tokens.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # The SentencePiece model is loaded *before* calling the base-class constructor, so that
        # `vocab_size` / `get_vocab` already work if the base class queries them during initialisation.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        """Return the instance state for pickling, with the SentencePiece processor replaced by `None`."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore the instance state and reload the SentencePiece model from `self.vocab_file`."""
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        # Tokens added on top of the SentencePiece vocabulary are merged in last.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # `all_special_tokens` is a property on the base class; read it once instead of on every
        # loop iteration. It is kept as returned (not converted to a set) so the membership test
        # behaves exactly as before.
        special_tokens = self.all_special_tokens
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in special_tokens:
                # A separating space is emitted only when a special token follows ordinary text.
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        # Flush whatever ordinary pieces remain after the last special token.
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                When given, `"<filename_prefix>-"` is prepended to the saved file name.

        Returns:
            `Tuple(str)`: Paths to the files saved. If `save_directory` is not an existing directory, an error
            is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original model file is gone: write out the model held in memory instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one sequence (or each sequence of a pair) with the BOS/EOS ids enabled on this tokenizer.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: `[bos] + token_ids_0 + [eos]`, followed by `[bos] + token_ids_1 + [eos]` when a second
            sequence is given; `bos` / `eos` are only present when `add_bos_token` / `add_eos_token` is set.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirrors the layout produced by `build_inputs_with_special_tokens`.
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates the token type IDs for one sequence or a pair of sequences. Positions belonging to the first
        sequence (including its BOS/EOS, when enabled) are 0 and positions belonging to the second are 1:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Number of special-token slots added around each sequence; computed arithmetically so no
        # temporary concatenated lists are built just to measure their length.
        num_special = (1 if self.add_bos_token else 0) + (1 if self.add_eos_token else 0)

        output = [0] * (num_special + len(token_ids_0))

        if token_ids_1 is not None:
            output += [1] * (num_special + len(token_ids_1))

        return output
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be54af290d93c113bcbf421115ae9eed9d6340408f564898f1e966dc738ef01
3
+ size 1136699
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "tokenization_baichuan.BaiChuanTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<s>",
37
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message + '\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '\\nAssistant: ' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "</s>",
42
+ "padding_side": "right",
43
+ "sp_model_kwargs": {},
44
+ "split_special_tokens": false,
45
+ "tokenizer_class": "BaiChuanTokenizer",
46
+ "unk_token": "<unk>"
47
+ }
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.8,
5
+ "eval_steps": 500,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.00019991228300988585,
14
+ "loss": 2.1829,
15
+ "step": 5
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 0.00019964928592495045,
20
+ "loss": 1.8983,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 0.0001992114701314478,
26
+ "loss": 1.7871,
27
+ "step": 15
28
+ },
29
+ {
30
+ "epoch": 0.16,
31
+ "learning_rate": 0.0001985996037070505,
32
+ "loss": 1.8482,
33
+ "step": 20
34
+ },
35
+ {
36
+ "epoch": 0.2,
37
+ "learning_rate": 0.00019781476007338058,
38
+ "loss": 1.731,
39
+ "step": 25
40
+ },
41
+ {
42
+ "epoch": 0.24,
43
+ "learning_rate": 0.0001968583161128631,
44
+ "loss": 1.6797,
45
+ "step": 30
46
+ },
47
+ {
48
+ "epoch": 0.28,
49
+ "learning_rate": 0.00019573194975320673,
50
+ "loss": 1.6694,
51
+ "step": 35
52
+ },
53
+ {
54
+ "epoch": 0.32,
55
+ "learning_rate": 0.00019443763702374812,
56
+ "loss": 1.7493,
57
+ "step": 40
58
+ },
59
+ {
60
+ "epoch": 0.36,
61
+ "learning_rate": 0.00019297764858882514,
62
+ "loss": 1.7823,
63
+ "step": 45
64
+ },
65
+ {
66
+ "epoch": 0.4,
67
+ "learning_rate": 0.0001913545457642601,
68
+ "loss": 1.6518,
69
+ "step": 50
70
+ },
71
+ {
72
+ "epoch": 0.44,
73
+ "learning_rate": 0.0001895711760239413,
74
+ "loss": 1.7334,
75
+ "step": 55
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "learning_rate": 0.00018763066800438636,
80
+ "loss": 1.7565,
81
+ "step": 60
82
+ },
83
+ {
84
+ "epoch": 0.52,
85
+ "learning_rate": 0.00018553642601605068,
86
+ "loss": 1.6493,
87
+ "step": 65
88
+ },
89
+ {
90
+ "epoch": 0.56,
91
+ "learning_rate": 0.00018329212407100994,
92
+ "loss": 1.7431,
93
+ "step": 70
94
+ },
95
+ {
96
+ "epoch": 0.6,
97
+ "learning_rate": 0.00018090169943749476,
98
+ "loss": 1.7561,
99
+ "step": 75
100
+ },
101
+ {
102
+ "epoch": 0.64,
103
+ "learning_rate": 0.000178369345732584,
104
+ "loss": 1.6185,
105
+ "step": 80
106
+ },
107
+ {
108
+ "epoch": 0.68,
109
+ "learning_rate": 0.00017569950556517566,
110
+ "loss": 1.6059,
111
+ "step": 85
112
+ },
113
+ {
114
+ "epoch": 0.72,
115
+ "learning_rate": 0.00017289686274214118,
116
+ "loss": 1.772,
117
+ "step": 90
118
+ },
119
+ {
120
+ "epoch": 0.76,
121
+ "learning_rate": 0.00016996633405133655,
122
+ "loss": 1.5847,
123
+ "step": 95
124
+ },
125
+ {
126
+ "epoch": 0.8,
127
+ "learning_rate": 0.00016691306063588583,
128
+ "loss": 1.615,
129
+ "step": 100
130
+ }
131
+ ],
132
+ "logging_steps": 5,
133
+ "max_steps": 375,
134
+ "num_input_tokens_seen": 0,
135
+ "num_train_epochs": 3,
136
+ "save_steps": 100,
137
+ "total_flos": 1.167000136777728e+16,
138
+ "train_batch_size": 2,
139
+ "trial_name": null,
140
+ "trial_params": null
141
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84990e28d454e290f6393201e221ba051e01af18e581a4f5994ac8396ad7c48b
3
+ size 4920
checkpoint-200/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: baichuan-inc/Baichuan-7B
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-200/adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "baichuan-inc/Baichuan-7B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "W_pack"
24
+ ],
25
+ "task_type": "CAUSAL_LM",
26
+ "use_dora": false,
27
+ "use_rslora": false
28
+ }
checkpoint-200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fd427937ffad205d951d665d21e6c29ec78ea7ca2c0874c99af7bce99bf6891
3
+ size 16785760
checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b191c5fd1b3719f54e10d217585d462494ad34105326a4ade27909c4ade32a32
3
+ size 33608634
checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
3
+ size 14244
checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:113d3d57baf5ea6fa1f49e5c9ccd10b9f9f78d00022910057b72b5bcea1f1d14
3
+ size 1064
checkpoint-200/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-200/tokenization_baichuan.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import os
22
+ from shutil import copyfile
23
+ from typing import Any, Dict, List, Optional, Tuple
24
+
25
+ import sentencepiece as spm
26
+
27
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
28
+ from transformers.utils import logging
29
+
30
+
31
# Module-level logger obtained through the transformers logging helper.
logger = logging.get_logger(__name__)

# Logical vocabulary-file key mapped to the on-disk file name of the
# SentencePiece model.
VOCAB_FILES_NAMES = dict(vocab_file="tokenizer.model")

# Both lookup tables below are intentionally left empty in this module.
PRETRAINED_VOCAB_FILES_MAP = {"vocab_file": {}, "tokenizer_file": {}}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
40
+
41
+
42
class BaiChuanTokenizer(PreTrainedTokenizer):
    """
    Construct a BaiChuan tokenizer backed by a SentencePiece model.

    Tokenization, id lookup and decoding are all delegated to a
    `sentencepiece.SentencePieceProcessor` loaded from `vocab_file`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file (a serialized SentencePiece model).
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*):
            The padding token. No padding token is set when `None`.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments forwarded to the `SentencePieceProcessor` constructor.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id to each sequence.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS id to each sequence.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the `PreTrainedTokenizer` constructor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped so that surrounding whitespace is never stripped
        # around the special tokens; AddedToken instances (and None) pass through.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # The SentencePiece model is loaded before the base-class constructor runs,
        # so it is already available to anything the base class calls on `self`.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        """
        Return the picklable state of the tokenizer.

        The live SentencePiece processor is replaced by `None`, and its serialized
        model is stored under `"sp_model_proto"` so that the tokenizer can be
        restored even where `self.vocab_file` is not readable.
        """
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        """
        Restore the tokenizer from a pickled state and rebuild the SentencePiece processor.

        States that carry `"sp_model_proto"` are restored from those bytes; states
        without it (produced by the previous implementation) fall back to reloading
        from `self.vocab_file`.
        """
        state = dict(d)
        # The serialized model is only needed for the rebuild; do not keep it on the instance.
        serialized_model = state.pop("sp_model_proto", None)
        self.__dict__ = state
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if serialized_model is not None:
            self.sp_model.LoadFromSerializedProto(serialized_model)
        else:
            self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size (the number of pieces in the SentencePiece model)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict mapping token to id, including added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string (a list of SentencePiece pieces)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (string) in a single string.

        Runs of ordinary tokens are decoded by the SentencePiece model; special
        tokens are copied into the output verbatim.
        """
        # Resolve the special tokens once: the collection does not change while looping.
        special_tokens = set(self.all_special_tokens)
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in special_tokens:
                # A single space separates preceding ordinary text from a special token,
                # except at the very start or directly after another special token.
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                When given, the output file is named `<filename_prefix>-tokenizer.model`.

        Returns:
            `Tuple(str)`: Paths to the files saved. If `save_directory` is not an existing
            directory, an error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original file is gone: write the in-memory model out instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one or two id sequences with the configured BOS/EOS ids.

        Args:
            token_ids_0 (`List[int]`):
                First sequence of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second sequence; it receives its own BOS/EOS ids.

        Returns:
            `List[int]`: `[bos] + token_ids_0 + [eos]`, followed by the same layout for
            `token_ids_1` when provided. BOS/EOS appear only if `add_bos_token` /
            `add_eos_token` are set.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mask fragments mirroring the layout produced by build_inputs_with_special_tokens.
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        Each segment covers the sequence itself plus whichever BOS/EOS ids are enabled.

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Number of special ids wrapped around each sequence (0, 1 or 2).
        special_count = (1 if self.add_bos_token else 0) + (1 if self.add_eos_token else 0)

        output = [0] * (len(token_ids_0) + special_count)

        if token_ids_1 is not None:
            output += [1] * (len(token_ids_1) + special_count)

        return output
checkpoint-200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be54af290d93c113bcbf421115ae9eed9d6340408f564898f1e966dc738ef01
3
+ size 1136699
checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "tokenization_baichuan.BaiChuanTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<s>",
37
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message + '\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '\\nAssistant: ' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "</s>",
42
+ "padding_side": "right",
43
+ "sp_model_kwargs": {},
44
+ "split_special_tokens": false,
45
+ "tokenizer_class": "BaiChuanTokenizer",
46
+ "unk_token": "<unk>"
47
+ }
checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.6,
5
+ "eval_steps": 500,
6
+ "global_step": 200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.00019991228300988585,
14
+ "loss": 2.1829,
15
+ "step": 5
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 0.00019964928592495045,
20
+ "loss": 1.8983,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 0.0001992114701314478,
26
+ "loss": 1.7871,
27
+ "step": 15
28
+ },
29
+ {
30
+ "epoch": 0.16,
31
+ "learning_rate": 0.0001985996037070505,
32
+ "loss": 1.8482,
33
+ "step": 20
34
+ },
35
+ {
36
+ "epoch": 0.2,
37
+ "learning_rate": 0.00019781476007338058,
38
+ "loss": 1.731,
39
+ "step": 25
40
+ },
41
+ {
42
+ "epoch": 0.24,
43
+ "learning_rate": 0.0001968583161128631,
44
+ "loss": 1.6797,
45
+ "step": 30
46
+ },
47
+ {
48
+ "epoch": 0.28,
49
+ "learning_rate": 0.00019573194975320673,
50
+ "loss": 1.6694,
51
+ "step": 35
52
+ },
53
+ {
54
+ "epoch": 0.32,
55
+ "learning_rate": 0.00019443763702374812,
56
+ "loss": 1.7493,
57
+ "step": 40
58
+ },
59
+ {
60
+ "epoch": 0.36,
61
+ "learning_rate": 0.00019297764858882514,
62
+ "loss": 1.7823,
63
+ "step": 45
64
+ },
65
+ {
66
+ "epoch": 0.4,
67
+ "learning_rate": 0.0001913545457642601,
68
+ "loss": 1.6518,
69
+ "step": 50
70
+ },
71
+ {
72
+ "epoch": 0.44,
73
+ "learning_rate": 0.0001895711760239413,
74
+ "loss": 1.7334,
75
+ "step": 55
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "learning_rate": 0.00018763066800438636,
80
+ "loss": 1.7565,
81
+ "step": 60
82
+ },
83
+ {
84
+ "epoch": 0.52,
85
+ "learning_rate": 0.00018553642601605068,
86
+ "loss": 1.6493,
87
+ "step": 65
88
+ },
89
+ {
90
+ "epoch": 0.56,
91
+ "learning_rate": 0.00018329212407100994,
92
+ "loss": 1.7431,
93
+ "step": 70
94
+ },
95
+ {
96
+ "epoch": 0.6,
97
+ "learning_rate": 0.00018090169943749476,
98
+ "loss": 1.7561,
99
+ "step": 75
100
+ },
101
+ {
102
+ "epoch": 0.64,
103
+ "learning_rate": 0.000178369345732584,
104
+ "loss": 1.6185,
105
+ "step": 80
106
+ },
107
+ {
108
+ "epoch": 0.68,
109
+ "learning_rate": 0.00017569950556517566,
110
+ "loss": 1.6059,
111
+ "step": 85
112
+ },
113
+ {
114
+ "epoch": 0.72,
115
+ "learning_rate": 0.00017289686274214118,
116
+ "loss": 1.772,
117
+ "step": 90
118
+ },
119
+ {
120
+ "epoch": 0.76,
121
+ "learning_rate": 0.00016996633405133655,
122
+ "loss": 1.5847,
123
+ "step": 95
124
+ },
125
+ {
126
+ "epoch": 0.8,
127
+ "learning_rate": 0.00016691306063588583,
128
+ "loss": 1.615,
129
+ "step": 100
130
+ },
131
+ {
132
+ "epoch": 0.84,
133
+ "learning_rate": 0.000163742398974869,
134
+ "loss": 1.5706,
135
+ "step": 105
136
+ },
137
+ {
138
+ "epoch": 0.88,
139
+ "learning_rate": 0.0001604599114862375,
140
+ "loss": 1.6997,
141
+ "step": 110
142
+ },
143
+ {
144
+ "epoch": 0.92,
145
+ "learning_rate": 0.0001570713567684432,
146
+ "loss": 1.5529,
147
+ "step": 115
148
+ },
149
+ {
150
+ "epoch": 0.96,
151
+ "learning_rate": 0.00015358267949789966,
152
+ "loss": 1.6631,
153
+ "step": 120
154
+ },
155
+ {
156
+ "epoch": 1.0,
157
+ "learning_rate": 0.00015000000000000001,
158
+ "loss": 1.7483,
159
+ "step": 125
160
+ },
161
+ {
162
+ "epoch": 1.04,
163
+ "learning_rate": 0.00014632960351198618,
164
+ "loss": 1.708,
165
+ "step": 130
166
+ },
167
+ {
168
+ "epoch": 1.08,
169
+ "learning_rate": 0.00014257792915650728,
170
+ "loss": 1.6979,
171
+ "step": 135
172
+ },
173
+ {
174
+ "epoch": 1.12,
175
+ "learning_rate": 0.0001387515586452103,
176
+ "loss": 1.53,
177
+ "step": 140
178
+ },
179
+ {
180
+ "epoch": 1.16,
181
+ "learning_rate": 0.00013485720473218154,
182
+ "loss": 1.6821,
183
+ "step": 145
184
+ },
185
+ {
186
+ "epoch": 1.2,
187
+ "learning_rate": 0.00013090169943749476,
188
+ "loss": 1.7208,
189
+ "step": 150
190
+ },
191
+ {
192
+ "epoch": 1.24,
193
+ "learning_rate": 0.00012689198206152657,
194
+ "loss": 1.6841,
195
+ "step": 155
196
+ },
197
+ {
198
+ "epoch": 1.28,
199
+ "learning_rate": 0.00012283508701106557,
200
+ "loss": 1.544,
201
+ "step": 160
202
+ },
203
+ {
204
+ "epoch": 1.32,
205
+ "learning_rate": 0.00011873813145857249,
206
+ "loss": 1.5851,
207
+ "step": 165
208
+ },
209
+ {
210
+ "epoch": 1.36,
211
+ "learning_rate": 0.00011460830285624118,
212
+ "loss": 1.56,
213
+ "step": 170
214
+ },
215
+ {
216
+ "epoch": 1.4,
217
+ "learning_rate": 0.00011045284632676536,
218
+ "loss": 1.5691,
219
+ "step": 175
220
+ },
221
+ {
222
+ "epoch": 1.44,
223
+ "learning_rate": 0.00010627905195293135,
224
+ "loss": 1.5201,
225
+ "step": 180
226
+ },
227
+ {
228
+ "epoch": 1.48,
229
+ "learning_rate": 0.0001020942419883357,
230
+ "loss": 1.5098,
231
+ "step": 185
232
+ },
233
+ {
234
+ "epoch": 1.52,
235
+ "learning_rate": 9.790575801166432e-05,
236
+ "loss": 1.5805,
237
+ "step": 190
238
+ },
239
+ {
240
+ "epoch": 1.56,
241
+ "learning_rate": 9.372094804706867e-05,
242
+ "loss": 1.6742,
243
+ "step": 195
244
+ },
245
+ {
246
+ "epoch": 1.6,
247
+ "learning_rate": 8.954715367323468e-05,
248
+ "loss": 1.5656,
249
+ "step": 200
250
+ }
251
+ ],
252
+ "logging_steps": 5,
253
+ "max_steps": 375,
254
+ "num_input_tokens_seen": 0,
255
+ "num_train_epochs": 3,
256
+ "save_steps": 100,
257
+ "total_flos": 2.33056963362816e+16,
258
+ "train_batch_size": 2,
259
+ "trial_name": null,
260
+ "trial_params": null
261
+ }
checkpoint-200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84990e28d454e290f6393201e221ba051e01af18e581a4f5994ac8396ad7c48b
3
+ size 4920
checkpoint-300/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: baichuan-inc/Baichuan-7B
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-300/adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "baichuan-inc/Baichuan-7B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "W_pack"
24
+ ],
25
+ "task_type": "CAUSAL_LM",
26
+ "use_dora": false,
27
+ "use_rslora": false
28
+ }
checkpoint-300/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2307fb868a4009a4fc5a667539f2d876a9f07e72b7c1b4c860db994c35619fdc
3
+ size 16785760
checkpoint-300/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2133b8491938d73143f5774b98ff887cce068cde7146f9669879447ceb31015f
3
+ size 33608634
checkpoint-300/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
3
+ size 14244
checkpoint-300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b317c7ccd56c68fd734532b70e29880e29310f64235ace1ff27743b77de2c0
3
+ size 1064
checkpoint-300/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-300/tokenization_baichuan.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import os
22
+ from shutil import copyfile
23
+ from typing import Any, Dict, List, Optional, Tuple
24
+
25
+ import sentencepiece as spm
26
+
27
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
28
+ from transformers.utils import logging
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
34
+
35
+ PRETRAINED_VOCAB_FILES_MAP = {
36
+ "vocab_file": {},
37
+ "tokenizer_file": {},
38
+ }
39
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
40
+
41
+
42
class BaiChuanTokenizer(PreTrainedTokenizer):
    """
    Construct a BaiChuan tokenizer backed by a SentencePiece model.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file.
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*):
            The padding token. Passed through unchanged when it is not a string (including `None`).
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments forwarded to the `SentencePieceProcessor` constructor. An empty dict is used when
            `None`.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id to every sequence.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS id to every sequence.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base-class constructor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped so that surrounding whitespace is never stripped around special tokens.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # NOTE(review): the SentencePiece model is loaded *before* the base constructor runs; presumably the base
        # class may query the vocabulary during initialisation, so keep this ordering.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        """
        Return a picklable copy of the instance state.

        The live `SentencePieceProcessor` is replaced by `None`, and its serialized model is stored under
        `"sp_model_proto"` so the tokenizer can be restored even when `vocab_file` is no longer on disk.
        """
        state = self.__dict__.copy()
        sp_model = state.get("sp_model")
        state["sp_model"] = None
        if sp_model is not None:
            state["sp_model_proto"] = sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        """
        Restore the instance state and rebuild the SentencePiece processor.

        The processor is rebuilt from the serialized model carried in the state when present; states produced
        without it fall back to re-reading `vocab_file`.
        """
        # Work on a copy so the caller's dict is not mutated, and keep the raw proto out of the instance dict.
        d = dict(d)
        serialized_proto = d.pop("sp_model_proto", None)
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if serialized_proto is not None:
            self.sp_model.LoadFromSerializedProto(serialized_proto)
        else:
            self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns the number of pieces in the SentencePiece model."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns the vocabulary as a token-to-id dict, with the added tokens merged in last."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Splits `text` into a list of SentencePiece pieces (strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the SentencePiece vocabulary."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the SentencePiece vocabulary."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (strings) into a single string.

        Runs of ordinary tokens are decoded by the SentencePiece model; special tokens are copied verbatim.
        """
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # Special tokens must not go through the SentencePiece decoder.
            if token in self.all_special_tokens:
                # A single space is inserted before a special token that follows ordinary text.
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        # Flush whatever ordinary tokens remain after the last special token.
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """
        Save the SentencePiece model file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary. It must already exist.
            filename_prefix (`str`, *optional*):
                When given, the file is named `<filename_prefix>-tokenizer.model`.

        Returns:
            `Tuple(str)`: Paths to the files saved, or `None` (after logging an error) when `save_directory` is not
            a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return None
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        source_exists = os.path.isfile(self.vocab_file)
        if source_exists:
            # Copy only when source and destination differ; copying a file onto itself would fail.
            if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
                copyfile(self.vocab_file, out_vocab_file)
        else:
            # The original file is gone: write the model held in memory instead.
            with open(out_vocab_file, "wb") as fi:
                fi.write(self.sp_model.serialized_model_proto())

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one or two id sequences with BOS/EOS ids according to `add_bos_token` / `add_eos_token`.

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence; wrapped the same way and appended after the first.

        Returns:
            `List[int]`: The concatenated ids including any special tokens.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build a mask marking where special tokens sit once `build_inputs_with_special_tokens` has been applied.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model. When `True`
                the computation is delegated to the base class.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []

        mask = bos_mask + ([0] * len(token_ids_0)) + eos_mask
        if token_ids_1 is not None:
            mask = mask + bos_mask + ([0] * len(token_ids_1)) + eos_mask
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type IDs for a single sequence or a sequence pair.

        The result has one entry per position of the output of `build_inputs_with_special_tokens`, so any BOS/EOS
        position is counted together with the sequence it wraps:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Number of special positions added around each sequence (0, 1 or 2).
        specials_per_sequence = (1 if self.add_bos_token else 0) + (1 if self.add_eos_token else 0)

        output = [0] * (specials_per_sequence + len(token_ids_0))

        if token_ids_1 is not None:
            output += [1] * (specials_per_sequence + len(token_ids_1))

        return output
checkpoint-300/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be54af290d93c113bcbf421115ae9eed9d6340408f564898f1e966dc738ef01
3
+ size 1136699
checkpoint-300/tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "tokenization_baichuan.BaiChuanTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<s>",
37
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message + '\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '\\nAssistant: ' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "</s>",
42
+ "padding_side": "right",
43
+ "sp_model_kwargs": {},
44
+ "split_special_tokens": false,
45
+ "tokenizer_class": "BaiChuanTokenizer",
46
+ "unk_token": "<unk>"
47
+ }
checkpoint-300/trainer_state.json ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.4,
5
+ "eval_steps": 500,
6
+ "global_step": 300,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.00019991228300988585,
14
+ "loss": 2.1829,
15
+ "step": 5
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 0.00019964928592495045,
20
+ "loss": 1.8983,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 0.0001992114701314478,
26
+ "loss": 1.7871,
27
+ "step": 15
28
+ },
29
+ {
30
+ "epoch": 0.16,
31
+ "learning_rate": 0.0001985996037070505,
32
+ "loss": 1.8482,
33
+ "step": 20
34
+ },
35
+ {
36
+ "epoch": 0.2,
37
+ "learning_rate": 0.00019781476007338058,
38
+ "loss": 1.731,
39
+ "step": 25
40
+ },
41
+ {
42
+ "epoch": 0.24,
43
+ "learning_rate": 0.0001968583161128631,
44
+ "loss": 1.6797,
45
+ "step": 30
46
+ },
47
+ {
48
+ "epoch": 0.28,
49
+ "learning_rate": 0.00019573194975320673,
50
+ "loss": 1.6694,
51
+ "step": 35
52
+ },
53
+ {
54
+ "epoch": 0.32,
55
+ "learning_rate": 0.00019443763702374812,
56
+ "loss": 1.7493,
57
+ "step": 40
58
+ },
59
+ {
60
+ "epoch": 0.36,
61
+ "learning_rate": 0.00019297764858882514,
62
+ "loss": 1.7823,
63
+ "step": 45
64
+ },
65
+ {
66
+ "epoch": 0.4,
67
+ "learning_rate": 0.0001913545457642601,
68
+ "loss": 1.6518,
69
+ "step": 50
70
+ },
71
+ {
72
+ "epoch": 0.44,
73
+ "learning_rate": 0.0001895711760239413,
74
+ "loss": 1.7334,
75
+ "step": 55
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "learning_rate": 0.00018763066800438636,
80
+ "loss": 1.7565,
81
+ "step": 60
82
+ },
83
+ {
84
+ "epoch": 0.52,
85
+ "learning_rate": 0.00018553642601605068,
86
+ "loss": 1.6493,
87
+ "step": 65
88
+ },
89
+ {
90
+ "epoch": 0.56,
91
+ "learning_rate": 0.00018329212407100994,
92
+ "loss": 1.7431,
93
+ "step": 70
94
+ },
95
+ {
96
+ "epoch": 0.6,
97
+ "learning_rate": 0.00018090169943749476,
98
+ "loss": 1.7561,
99
+ "step": 75
100
+ },
101
+ {
102
+ "epoch": 0.64,
103
+ "learning_rate": 0.000178369345732584,
104
+ "loss": 1.6185,
105
+ "step": 80
106
+ },
107
+ {
108
+ "epoch": 0.68,
109
+ "learning_rate": 0.00017569950556517566,
110
+ "loss": 1.6059,
111
+ "step": 85
112
+ },
113
+ {
114
+ "epoch": 0.72,
115
+ "learning_rate": 0.00017289686274214118,
116
+ "loss": 1.772,
117
+ "step": 90
118
+ },
119
+ {
120
+ "epoch": 0.76,
121
+ "learning_rate": 0.00016996633405133655,
122
+ "loss": 1.5847,
123
+ "step": 95
124
+ },
125
+ {
126
+ "epoch": 0.8,
127
+ "learning_rate": 0.00016691306063588583,
128
+ "loss": 1.615,
129
+ "step": 100
130
+ },
131
+ {
132
+ "epoch": 0.84,
133
+ "learning_rate": 0.000163742398974869,
134
+ "loss": 1.5706,
135
+ "step": 105
136
+ },
137
+ {
138
+ "epoch": 0.88,
139
+ "learning_rate": 0.0001604599114862375,
140
+ "loss": 1.6997,
141
+ "step": 110
142
+ },
143
+ {
144
+ "epoch": 0.92,
145
+ "learning_rate": 0.0001570713567684432,
146
+ "loss": 1.5529,
147
+ "step": 115
148
+ },
149
+ {
150
+ "epoch": 0.96,
151
+ "learning_rate": 0.00015358267949789966,
152
+ "loss": 1.6631,
153
+ "step": 120
154
+ },
155
+ {
156
+ "epoch": 1.0,
157
+ "learning_rate": 0.00015000000000000001,
158
+ "loss": 1.7483,
159
+ "step": 125
160
+ },
161
+ {
162
+ "epoch": 1.04,
163
+ "learning_rate": 0.00014632960351198618,
164
+ "loss": 1.708,
165
+ "step": 130
166
+ },
167
+ {
168
+ "epoch": 1.08,
169
+ "learning_rate": 0.00014257792915650728,
170
+ "loss": 1.6979,
171
+ "step": 135
172
+ },
173
+ {
174
+ "epoch": 1.12,
175
+ "learning_rate": 0.0001387515586452103,
176
+ "loss": 1.53,
177
+ "step": 140
178
+ },
179
+ {
180
+ "epoch": 1.16,
181
+ "learning_rate": 0.00013485720473218154,
182
+ "loss": 1.6821,
183
+ "step": 145
184
+ },
185
+ {
186
+ "epoch": 1.2,
187
+ "learning_rate": 0.00013090169943749476,
188
+ "loss": 1.7208,
189
+ "step": 150
190
+ },
191
+ {
192
+ "epoch": 1.24,
193
+ "learning_rate": 0.00012689198206152657,
194
+ "loss": 1.6841,
195
+ "step": 155
196
+ },
197
+ {
198
+ "epoch": 1.28,
199
+ "learning_rate": 0.00012283508701106557,
200
+ "loss": 1.544,
201
+ "step": 160
202
+ },
203
+ {
204
+ "epoch": 1.32,
205
+ "learning_rate": 0.00011873813145857249,
206
+ "loss": 1.5851,
207
+ "step": 165
208
+ },
209
+ {
210
+ "epoch": 1.36,
211
+ "learning_rate": 0.00011460830285624118,
212
+ "loss": 1.56,
213
+ "step": 170
214
+ },
215
+ {
216
+ "epoch": 1.4,
217
+ "learning_rate": 0.00011045284632676536,
218
+ "loss": 1.5691,
219
+ "step": 175
220
+ },
221
+ {
222
+ "epoch": 1.44,
223
+ "learning_rate": 0.00010627905195293135,
224
+ "loss": 1.5201,
225
+ "step": 180
226
+ },
227
+ {
228
+ "epoch": 1.48,
229
+ "learning_rate": 0.0001020942419883357,
230
+ "loss": 1.5098,
231
+ "step": 185
232
+ },
233
+ {
234
+ "epoch": 1.52,
235
+ "learning_rate": 9.790575801166432e-05,
236
+ "loss": 1.5805,
237
+ "step": 190
238
+ },
239
+ {
240
+ "epoch": 1.56,
241
+ "learning_rate": 9.372094804706867e-05,
242
+ "loss": 1.6742,
243
+ "step": 195
244
+ },
245
+ {
246
+ "epoch": 1.6,
247
+ "learning_rate": 8.954715367323468e-05,
248
+ "loss": 1.5656,
249
+ "step": 200
250
+ },
251
+ {
252
+ "epoch": 1.64,
253
+ "learning_rate": 8.539169714375885e-05,
254
+ "loss": 1.6301,
255
+ "step": 205
256
+ },
257
+ {
258
+ "epoch": 1.68,
259
+ "learning_rate": 8.126186854142752e-05,
260
+ "loss": 1.6027,
261
+ "step": 210
262
+ },
263
+ {
264
+ "epoch": 1.72,
265
+ "learning_rate": 7.716491298893442e-05,
266
+ "loss": 1.6494,
267
+ "step": 215
268
+ },
269
+ {
270
+ "epoch": 1.76,
271
+ "learning_rate": 7.310801793847344e-05,
272
+ "loss": 1.5962,
273
+ "step": 220
274
+ },
275
+ {
276
+ "epoch": 1.8,
277
+ "learning_rate": 6.909830056250527e-05,
278
+ "loss": 1.5375,
279
+ "step": 225
280
+ },
281
+ {
282
+ "epoch": 1.84,
283
+ "learning_rate": 6.51427952678185e-05,
284
+ "loss": 1.596,
285
+ "step": 230
286
+ },
287
+ {
288
+ "epoch": 1.88,
289
+ "learning_rate": 6.12484413547897e-05,
290
+ "loss": 1.6401,
291
+ "step": 235
292
+ },
293
+ {
294
+ "epoch": 1.92,
295
+ "learning_rate": 5.7422070843492734e-05,
296
+ "loss": 1.5735,
297
+ "step": 240
298
+ },
299
+ {
300
+ "epoch": 1.96,
301
+ "learning_rate": 5.3670396488013854e-05,
302
+ "loss": 1.6057,
303
+ "step": 245
304
+ },
305
+ {
306
+ "epoch": 2.0,
307
+ "learning_rate": 5.000000000000002e-05,
308
+ "loss": 1.5428,
309
+ "step": 250
310
+ },
311
+ {
312
+ "epoch": 2.04,
313
+ "learning_rate": 4.6417320502100316e-05,
314
+ "loss": 1.6843,
315
+ "step": 255
316
+ },
317
+ {
318
+ "epoch": 2.08,
319
+ "learning_rate": 4.2928643231556844e-05,
320
+ "loss": 1.6004,
321
+ "step": 260
322
+ },
323
+ {
324
+ "epoch": 2.12,
325
+ "learning_rate": 3.954008851376252e-05,
326
+ "loss": 1.5231,
327
+ "step": 265
328
+ },
329
+ {
330
+ "epoch": 2.16,
331
+ "learning_rate": 3.6257601025131026e-05,
332
+ "loss": 1.6147,
333
+ "step": 270
334
+ },
335
+ {
336
+ "epoch": 2.2,
337
+ "learning_rate": 3.308693936411421e-05,
338
+ "loss": 1.5095,
339
+ "step": 275
340
+ },
341
+ {
342
+ "epoch": 2.24,
343
+ "learning_rate": 3.0033665948663448e-05,
344
+ "loss": 1.6355,
345
+ "step": 280
346
+ },
347
+ {
348
+ "epoch": 2.28,
349
+ "learning_rate": 2.7103137257858868e-05,
350
+ "loss": 1.4353,
351
+ "step": 285
352
+ },
353
+ {
354
+ "epoch": 2.32,
355
+ "learning_rate": 2.4300494434824373e-05,
356
+ "loss": 1.5886,
357
+ "step": 290
358
+ },
359
+ {
360
+ "epoch": 2.36,
361
+ "learning_rate": 2.163065426741603e-05,
362
+ "loss": 1.5915,
363
+ "step": 295
364
+ },
365
+ {
366
+ "epoch": 2.4,
367
+ "learning_rate": 1.9098300562505266e-05,
368
+ "loss": 1.5584,
369
+ "step": 300
370
+ }
371
+ ],
372
+ "logging_steps": 5,
373
+ "max_steps": 375,
374
+ "num_input_tokens_seen": 0,
375
+ "num_train_epochs": 3,
376
+ "save_steps": 100,
377
+ "total_flos": 3.46235716435968e+16,
378
+ "train_batch_size": 2,
379
+ "trial_name": null,
380
+ "trial_params": null
381
+ }
checkpoint-300/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84990e28d454e290f6393201e221ba051e01af18e581a4f5994ac8396ad7c48b
3
+ size 4920
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenization_baichuan.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import os
22
+ from shutil import copyfile
23
+ from typing import Any, Dict, List, Optional, Tuple
24
+
25
+ import sentencepiece as spm
26
+
27
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
28
+ from transformers.utils import logging
29
+
30
+
31
# Module-level logger obtained through the transformers logging helper.
logger = logging.get_logger(__name__)

# Maps each vocabulary slot to the file name used when the vocabulary is saved.
VOCAB_FILES_NAMES = dict(vocab_file="tokenizer.model")

# Both lookup tables below are intentionally left empty in this file.
PRETRAINED_VOCAB_FILES_MAP = dict(vocab_file={}, tokenizer_file={})
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict()
40
+
41
+
42
class BaiChuanTokenizer(PreTrainedTokenizer):
    """
    Construct a BaiChuan tokenizer backed by a SentencePiece model.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file.
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*):
            The padding token. Passed through unchanged when it is not a string (including `None`).
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments forwarded to the `SentencePieceProcessor` constructor. An empty dict is used when
            `None`.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id to every sequence.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS id to every sequence.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base-class constructor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped so that surrounding whitespace is never stripped around special tokens.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # NOTE(review): the SentencePiece model is loaded *before* the base constructor runs; presumably the base
        # class may query the vocabulary during initialisation, so keep this ordering.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        """
        Return a picklable copy of the instance state.

        The live `SentencePieceProcessor` is replaced by `None`, and its serialized model is stored under
        `"sp_model_proto"` so the tokenizer can be restored even when `vocab_file` is no longer on disk.
        """
        state = self.__dict__.copy()
        sp_model = state.get("sp_model")
        state["sp_model"] = None
        if sp_model is not None:
            state["sp_model_proto"] = sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        """
        Restore the instance state and rebuild the SentencePiece processor.

        The processor is rebuilt from the serialized model carried in the state when present; states produced
        without it fall back to re-reading `vocab_file`.
        """
        # Work on a copy so the caller's dict is not mutated, and keep the raw proto out of the instance dict.
        d = dict(d)
        serialized_proto = d.pop("sp_model_proto", None)
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if serialized_proto is not None:
            self.sp_model.LoadFromSerializedProto(serialized_proto)
        else:
            self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns the number of pieces in the SentencePiece model."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns the vocabulary as a token-to-id dict, with the added tokens merged in last."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Splits `text` into a list of SentencePiece pieces (strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the SentencePiece vocabulary."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the SentencePiece vocabulary."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (strings) into a single string.

        Runs of ordinary tokens are decoded by the SentencePiece model; special tokens are copied verbatim.
        """
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # Special tokens must not go through the SentencePiece decoder.
            if token in self.all_special_tokens:
                # A single space is inserted before a special token that follows ordinary text.
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        # Flush whatever ordinary tokens remain after the last special token.
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """
        Save the SentencePiece model file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary. It must already exist.
            filename_prefix (`str`, *optional*):
                When given, the file is named `<filename_prefix>-tokenizer.model`.

        Returns:
            `Tuple(str)`: Paths to the files saved, or `None` (after logging an error) when `save_directory` is not
            a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return None
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        source_exists = os.path.isfile(self.vocab_file)
        if source_exists:
            # Copy only when source and destination differ; copying a file onto itself would fail.
            if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
                copyfile(self.vocab_file, out_vocab_file)
        else:
            # The original file is gone: write the model held in memory instead.
            with open(out_vocab_file, "wb") as fi:
                fi.write(self.sp_model.serialized_model_proto())

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one or two id sequences with BOS/EOS ids according to `add_bos_token` / `add_eos_token`.

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence; wrapped the same way and appended after the first.

        Returns:
            `List[int]`: The concatenated ids including any special tokens.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build a mask marking where special tokens sit once `build_inputs_with_special_tokens` has been applied.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model. When `True`
                the computation is delegated to the base class.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []

        mask = bos_mask + ([0] * len(token_ids_0)) + eos_mask
        if token_ids_1 is not None:
            mask = mask + bos_mask + ([0] * len(token_ids_1)) + eos_mask
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type IDs for a single sequence or a sequence pair.

        The result has one entry per position of the output of `build_inputs_with_special_tokens`, so any BOS/EOS
        position is counted together with the sequence it wraps:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Number of special positions added around each sequence (0, 1 or 2).
        specials_per_sequence = (1 if self.add_bos_token else 0) + (1 if self.add_eos_token else 0)

        output = [0] * (specials_per_sequence + len(token_ids_0))

        if token_ids_1 is not None:
            output += [1] * (specials_per_sequence + len(token_ids_1))

        return output
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be54af290d93c113bcbf421115ae9eed9d6340408f564898f1e966dc738ef01
3
+ size 1136699
tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "tokenization_baichuan.BaiChuanTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<s>",
37
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message + '\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '\\nAssistant: ' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "</s>",
42
+ "padding_side": "right",
43
+ "sp_model_kwargs": {},
44
+ "split_special_tokens": false,
45
+ "tokenizer_class": "BaiChuanTokenizer",
46
+ "unk_token": "<unk>"
47
+ }
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 1.628477378845215,
4
+ "train_runtime": 1045.5144,
5
+ "train_samples_per_second": 5.739,
6
+ "train_steps_per_second": 0.359
7
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 375, "loss": 2.1829, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00019991228300988585, "epoch": 0.04, "percentage": 1.33, "elapsed_time": "0:00:12", "remaining_time": "0:15:11"}
2
+ {"current_steps": 10, "total_steps": 375, "loss": 1.8983, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00019964928592495045, "epoch": 0.08, "percentage": 2.67, "elapsed_time": "0:00:25", "remaining_time": "0:15:39"}
3
+ {"current_steps": 15, "total_steps": 375, "loss": 1.7871, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001992114701314478, "epoch": 0.12, "percentage": 4.0, "elapsed_time": "0:00:38", "remaining_time": "0:15:27"}
4
+ {"current_steps": 20, "total_steps": 375, "loss": 1.8482, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001985996037070505, "epoch": 0.16, "percentage": 5.33, "elapsed_time": "0:00:52", "remaining_time": "0:15:39"}
5
+ {"current_steps": 25, "total_steps": 375, "loss": 1.731, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00019781476007338058, "epoch": 0.2, "percentage": 6.67, "elapsed_time": "0:01:07", "remaining_time": "0:15:48"}
6
+ {"current_steps": 30, "total_steps": 375, "loss": 1.6797, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001968583161128631, "epoch": 0.24, "percentage": 8.0, "elapsed_time": "0:01:22", "remaining_time": "0:15:52"}
7
+ {"current_steps": 35, "total_steps": 375, "loss": 1.6694, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00019573194975320673, "epoch": 0.28, "percentage": 9.33, "elapsed_time": "0:01:36", "remaining_time": "0:15:32"}
8
+ {"current_steps": 40, "total_steps": 375, "loss": 1.7493, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00019443763702374812, "epoch": 0.32, "percentage": 10.67, "elapsed_time": "0:01:50", "remaining_time": "0:15:24"}
9
+ {"current_steps": 45, "total_steps": 375, "loss": 1.7823, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00019297764858882514, "epoch": 0.36, "percentage": 12.0, "elapsed_time": "0:02:03", "remaining_time": "0:15:08"}
10
+ {"current_steps": 50, "total_steps": 375, "loss": 1.6518, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001913545457642601, "epoch": 0.4, "percentage": 13.33, "elapsed_time": "0:02:16", "remaining_time": "0:14:45"}
11
+ {"current_steps": 55, "total_steps": 375, "loss": 1.7334, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001895711760239413, "epoch": 0.44, "percentage": 14.67, "elapsed_time": "0:02:30", "remaining_time": "0:14:37"}
12
+ {"current_steps": 60, "total_steps": 375, "loss": 1.7565, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00018763066800438636, "epoch": 0.48, "percentage": 16.0, "elapsed_time": "0:02:43", "remaining_time": "0:14:18"}
13
+ {"current_steps": 65, "total_steps": 375, "loss": 1.6493, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00018553642601605068, "epoch": 0.52, "percentage": 17.33, "elapsed_time": "0:02:56", "remaining_time": "0:14:02"}
14
+ {"current_steps": 70, "total_steps": 375, "loss": 1.7431, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00018329212407100994, "epoch": 0.56, "percentage": 18.67, "elapsed_time": "0:03:10", "remaining_time": "0:13:47"}
15
+ {"current_steps": 75, "total_steps": 375, "loss": 1.7561, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00018090169943749476, "epoch": 0.6, "percentage": 20.0, "elapsed_time": "0:03:23", "remaining_time": "0:13:32"}
16
+ {"current_steps": 80, "total_steps": 375, "loss": 1.6185, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.000178369345732584, "epoch": 0.64, "percentage": 21.33, "elapsed_time": "0:03:39", "remaining_time": "0:13:30"}
17
+ {"current_steps": 85, "total_steps": 375, "loss": 1.6059, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00017569950556517566, "epoch": 0.68, "percentage": 22.67, "elapsed_time": "0:03:54", "remaining_time": "0:13:18"}
18
+ {"current_steps": 90, "total_steps": 375, "loss": 1.772, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00017289686274214118, "epoch": 0.72, "percentage": 24.0, "elapsed_time": "0:04:09", "remaining_time": "0:13:10"}
19
+ {"current_steps": 95, "total_steps": 375, "loss": 1.5847, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00016996633405133655, "epoch": 0.76, "percentage": 25.33, "elapsed_time": "0:04:23", "remaining_time": "0:12:57"}
20
+ {"current_steps": 100, "total_steps": 375, "loss": 1.615, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00016691306063588583, "epoch": 0.8, "percentage": 26.67, "elapsed_time": "0:04:37", "remaining_time": "0:12:42"}
21
+ {"current_steps": 105, "total_steps": 375, "loss": 1.5706, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.000163742398974869, "epoch": 0.84, "percentage": 28.0, "elapsed_time": "0:04:51", "remaining_time": "0:12:29"}
22
+ {"current_steps": 110, "total_steps": 375, "loss": 1.6997, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001604599114862375, "epoch": 0.88, "percentage": 29.33, "elapsed_time": "0:05:04", "remaining_time": "0:12:14"}
23
+ {"current_steps": 115, "total_steps": 375, "loss": 1.5529, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001570713567684432, "epoch": 0.92, "percentage": 30.67, "elapsed_time": "0:05:19", "remaining_time": "0:12:02"}
24
+ {"current_steps": 120, "total_steps": 375, "loss": 1.6631, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00015358267949789966, "epoch": 0.96, "percentage": 32.0, "elapsed_time": "0:05:33", "remaining_time": "0:11:48"}
25
+ {"current_steps": 125, "total_steps": 375, "loss": 1.7483, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00015000000000000001, "epoch": 1.0, "percentage": 33.33, "elapsed_time": "0:05:47", "remaining_time": "0:11:34"}
26
+ {"current_steps": 130, "total_steps": 375, "loss": 1.708, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00014632960351198618, "epoch": 1.04, "percentage": 34.67, "elapsed_time": "0:06:02", "remaining_time": "0:11:22"}
27
+ {"current_steps": 135, "total_steps": 375, "loss": 1.6979, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00014257792915650728, "epoch": 1.08, "percentage": 36.0, "elapsed_time": "0:06:16", "remaining_time": "0:11:09"}
28
+ {"current_steps": 140, "total_steps": 375, "loss": 1.53, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001387515586452103, "epoch": 1.12, "percentage": 37.33, "elapsed_time": "0:06:31", "remaining_time": "0:10:57"}
29
+ {"current_steps": 145, "total_steps": 375, "loss": 1.6821, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00013485720473218154, "epoch": 1.16, "percentage": 38.67, "elapsed_time": "0:06:44", "remaining_time": "0:10:41"}
30
+ {"current_steps": 150, "total_steps": 375, "loss": 1.7208, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00013090169943749476, "epoch": 1.2, "percentage": 40.0, "elapsed_time": "0:06:58", "remaining_time": "0:10:27"}
31
+ {"current_steps": 155, "total_steps": 375, "loss": 1.6841, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00012689198206152657, "epoch": 1.24, "percentage": 41.33, "elapsed_time": "0:07:13", "remaining_time": "0:10:14"}
32
+ {"current_steps": 160, "total_steps": 375, "loss": 1.544, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00012283508701106557, "epoch": 1.28, "percentage": 42.67, "elapsed_time": "0:07:27", "remaining_time": "0:10:00"}
33
+ {"current_steps": 165, "total_steps": 375, "loss": 1.5851, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00011873813145857249, "epoch": 1.32, "percentage": 44.0, "elapsed_time": "0:07:40", "remaining_time": "0:09:46"}
34
+ {"current_steps": 170, "total_steps": 375, "loss": 1.56, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00011460830285624118, "epoch": 1.36, "percentage": 45.33, "elapsed_time": "0:07:54", "remaining_time": "0:09:32"}
35
+ {"current_steps": 175, "total_steps": 375, "loss": 1.5691, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00011045284632676536, "epoch": 1.4, "percentage": 46.67, "elapsed_time": "0:08:09", "remaining_time": "0:09:19"}
36
+ {"current_steps": 180, "total_steps": 375, "loss": 1.5201, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.00010627905195293135, "epoch": 1.44, "percentage": 48.0, "elapsed_time": "0:08:22", "remaining_time": "0:09:04"}
37
+ {"current_steps": 185, "total_steps": 375, "loss": 1.5098, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0001020942419883357, "epoch": 1.48, "percentage": 49.33, "elapsed_time": "0:08:37", "remaining_time": "0:08:51"}
38
+ {"current_steps": 190, "total_steps": 375, "loss": 1.5805, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 9.790575801166432e-05, "epoch": 1.52, "percentage": 50.67, "elapsed_time": "0:08:52", "remaining_time": "0:08:38"}
39
+ {"current_steps": 195, "total_steps": 375, "loss": 1.6742, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 9.372094804706867e-05, "epoch": 1.56, "percentage": 52.0, "elapsed_time": "0:09:05", "remaining_time": "0:08:23"}
40
+ {"current_steps": 200, "total_steps": 375, "loss": 1.5656, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 8.954715367323468e-05, "epoch": 1.6, "percentage": 53.33, "elapsed_time": "0:09:18", "remaining_time": "0:08:08"}
41
+ {"current_steps": 205, "total_steps": 375, "loss": 1.6301, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 8.539169714375885e-05, "epoch": 1.64, "percentage": 54.67, "elapsed_time": "0:09:33", "remaining_time": "0:07:55"}
42
+ {"current_steps": 210, "total_steps": 375, "loss": 1.6027, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 8.126186854142752e-05, "epoch": 1.68, "percentage": 56.0, "elapsed_time": "0:09:47", "remaining_time": "0:07:41"}
43
+ {"current_steps": 215, "total_steps": 375, "loss": 1.6494, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 7.716491298893442e-05, "epoch": 1.72, "percentage": 57.33, "elapsed_time": "0:10:01", "remaining_time": "0:07:27"}
44
+ {"current_steps": 220, "total_steps": 375, "loss": 1.5962, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 7.310801793847344e-05, "epoch": 1.76, "percentage": 58.67, "elapsed_time": "0:10:16", "remaining_time": "0:07:14"}
45
+ {"current_steps": 225, "total_steps": 375, "loss": 1.5375, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.909830056250527e-05, "epoch": 1.8, "percentage": 60.0, "elapsed_time": "0:10:30", "remaining_time": "0:07:00"}
46
+ {"current_steps": 230, "total_steps": 375, "loss": 1.596, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.51427952678185e-05, "epoch": 1.84, "percentage": 61.33, "elapsed_time": "0:10:43", "remaining_time": "0:06:45"}
47
+ {"current_steps": 235, "total_steps": 375, "loss": 1.6401, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.12484413547897e-05, "epoch": 1.88, "percentage": 62.67, "elapsed_time": "0:10:57", "remaining_time": "0:06:31"}
48
+ {"current_steps": 240, "total_steps": 375, "loss": 1.5735, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.7422070843492734e-05, "epoch": 1.92, "percentage": 64.0, "elapsed_time": "0:11:11", "remaining_time": "0:06:17"}
49
+ {"current_steps": 245, "total_steps": 375, "loss": 1.6057, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.3670396488013854e-05, "epoch": 1.96, "percentage": 65.33, "elapsed_time": "0:11:23", "remaining_time": "0:06:02"}
50
+ {"current_steps": 250, "total_steps": 375, "loss": 1.5428, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.000000000000002e-05, "epoch": 2.0, "percentage": 66.67, "elapsed_time": "0:11:37", "remaining_time": "0:05:48"}
51
+ {"current_steps": 255, "total_steps": 375, "loss": 1.6843, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.6417320502100316e-05, "epoch": 2.04, "percentage": 68.0, "elapsed_time": "0:11:51", "remaining_time": "0:05:34"}
52
+ {"current_steps": 260, "total_steps": 375, "loss": 1.6004, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.2928643231556844e-05, "epoch": 2.08, "percentage": 69.33, "elapsed_time": "0:12:05", "remaining_time": "0:05:20"}
53
+ {"current_steps": 265, "total_steps": 375, "loss": 1.5231, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.954008851376252e-05, "epoch": 2.12, "percentage": 70.67, "elapsed_time": "0:12:18", "remaining_time": "0:05:06"}
54
+ {"current_steps": 270, "total_steps": 375, "loss": 1.6147, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.6257601025131026e-05, "epoch": 2.16, "percentage": 72.0, "elapsed_time": "0:12:32", "remaining_time": "0:04:52"}
55
+ {"current_steps": 275, "total_steps": 375, "loss": 1.5095, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.308693936411421e-05, "epoch": 2.2, "percentage": 73.33, "elapsed_time": "0:12:47", "remaining_time": "0:04:38"}
56
+ {"current_steps": 280, "total_steps": 375, "loss": 1.6355, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.0033665948663448e-05, "epoch": 2.24, "percentage": 74.67, "elapsed_time": "0:13:01", "remaining_time": "0:04:25"}
57
+ {"current_steps": 285, "total_steps": 375, "loss": 1.4353, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.7103137257858868e-05, "epoch": 2.28, "percentage": 76.0, "elapsed_time": "0:13:12", "remaining_time": "0:04:10"}
58
+ {"current_steps": 290, "total_steps": 375, "loss": 1.5886, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.4300494434824373e-05, "epoch": 2.32, "percentage": 77.33, "elapsed_time": "0:13:27", "remaining_time": "0:03:56"}
59
+ {"current_steps": 295, "total_steps": 375, "loss": 1.5915, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.163065426741603e-05, "epoch": 2.36, "percentage": 78.67, "elapsed_time": "0:13:42", "remaining_time": "0:03:42"}
60
+ {"current_steps": 300, "total_steps": 375, "loss": 1.5584, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.9098300562505266e-05, "epoch": 2.4, "percentage": 80.0, "elapsed_time": "0:13:56", "remaining_time": "0:03:29"}
61
+ {"current_steps": 305, "total_steps": 375, "loss": 1.4628, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.6707875928990058e-05, "epoch": 2.44, "percentage": 81.33, "elapsed_time": "0:14:11", "remaining_time": "0:03:15"}
62
+ {"current_steps": 310, "total_steps": 375, "loss": 1.5502, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.4463573983949341e-05, "epoch": 2.48, "percentage": 82.67, "elapsed_time": "0:14:27", "remaining_time": "0:03:01"}
63
+ {"current_steps": 315, "total_steps": 375, "loss": 1.4882, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.2369331995613665e-05, "epoch": 2.52, "percentage": 84.0, "elapsed_time": "0:14:41", "remaining_time": "0:02:47"}
64
+ {"current_steps": 320, "total_steps": 375, "loss": 1.6666, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.042882397605871e-05, "epoch": 2.56, "percentage": 85.33, "elapsed_time": "0:14:55", "remaining_time": "0:02:33"}
65
+ {"current_steps": 325, "total_steps": 375, "loss": 1.4874, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 8.645454235739903e-06, "epoch": 2.6, "percentage": 86.67, "elapsed_time": "0:15:08", "remaining_time": "0:02:19"}
66
+ {"current_steps": 330, "total_steps": 375, "loss": 1.6158, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 7.022351411174866e-06, "epoch": 2.64, "percentage": 88.0, "elapsed_time": "0:15:20", "remaining_time": "0:02:05"}
67
+ {"current_steps": 335, "total_steps": 375, "loss": 1.4376, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.562362976251901e-06, "epoch": 2.68, "percentage": 89.33, "elapsed_time": "0:15:33", "remaining_time": "0:01:51"}
68
+ {"current_steps": 340, "total_steps": 375, "loss": 1.6202, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.268050246793276e-06, "epoch": 2.72, "percentage": 90.67, "elapsed_time": "0:15:48", "remaining_time": "0:01:37"}
69
+ {"current_steps": 345, "total_steps": 375, "loss": 1.5493, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.1416838871368924e-06, "epoch": 2.76, "percentage": 92.0, "elapsed_time": "0:16:02", "remaining_time": "0:01:23"}
70
+ {"current_steps": 350, "total_steps": 375, "loss": 1.6157, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.1852399266194314e-06, "epoch": 2.8, "percentage": 93.33, "elapsed_time": "0:16:17", "remaining_time": "0:01:09"}
71
+ {"current_steps": 355, "total_steps": 375, "loss": 1.5631, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.400396292949513e-06, "epoch": 2.84, "percentage": 94.67, "elapsed_time": "0:16:32", "remaining_time": "0:00:55"}
72
+ {"current_steps": 360, "total_steps": 375, "loss": 1.5326, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 7.885298685522235e-07, "epoch": 2.88, "percentage": 96.0, "elapsed_time": "0:16:43", "remaining_time": "0:00:41"}
73
+ {"current_steps": 365, "total_steps": 375, "loss": 1.5978, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.50714075049563e-07, "epoch": 2.92, "percentage": 97.33, "elapsed_time": "0:16:57", "remaining_time": "0:00:27"}
74
+ {"current_steps": 370, "total_steps": 375, "loss": 1.4251, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 8.771699011416168e-08, "epoch": 2.96, "percentage": 98.67, "elapsed_time": "0:17:12", "remaining_time": "0:00:13"}
75
+ {"current_steps": 375, "total_steps": 375, "loss": 1.6276, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "0:17:25", "remaining_time": "0:00:00"}
76
+ {"current_steps": 375, "total_steps": 375, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "0:17:25", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 375,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.00019991228300988585,
14
+ "loss": 2.1829,
15
+ "step": 5
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 0.00019964928592495045,
20
+ "loss": 1.8983,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 0.0001992114701314478,
26
+ "loss": 1.7871,
27
+ "step": 15
28
+ },
29
+ {
30
+ "epoch": 0.16,
31
+ "learning_rate": 0.0001985996037070505,
32
+ "loss": 1.8482,
33
+ "step": 20
34
+ },
35
+ {
36
+ "epoch": 0.2,
37
+ "learning_rate": 0.00019781476007338058,
38
+ "loss": 1.731,
39
+ "step": 25
40
+ },
41
+ {
42
+ "epoch": 0.24,
43
+ "learning_rate": 0.0001968583161128631,
44
+ "loss": 1.6797,
45
+ "step": 30
46
+ },
47
+ {
48
+ "epoch": 0.28,
49
+ "learning_rate": 0.00019573194975320673,
50
+ "loss": 1.6694,
51
+ "step": 35
52
+ },
53
+ {
54
+ "epoch": 0.32,
55
+ "learning_rate": 0.00019443763702374812,
56
+ "loss": 1.7493,
57
+ "step": 40
58
+ },
59
+ {
60
+ "epoch": 0.36,
61
+ "learning_rate": 0.00019297764858882514,
62
+ "loss": 1.7823,
63
+ "step": 45
64
+ },
65
+ {
66
+ "epoch": 0.4,
67
+ "learning_rate": 0.0001913545457642601,
68
+ "loss": 1.6518,
69
+ "step": 50
70
+ },
71
+ {
72
+ "epoch": 0.44,
73
+ "learning_rate": 0.0001895711760239413,
74
+ "loss": 1.7334,
75
+ "step": 55
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "learning_rate": 0.00018763066800438636,
80
+ "loss": 1.7565,
81
+ "step": 60
82
+ },
83
+ {
84
+ "epoch": 0.52,
85
+ "learning_rate": 0.00018553642601605068,
86
+ "loss": 1.6493,
87
+ "step": 65
88
+ },
89
+ {
90
+ "epoch": 0.56,
91
+ "learning_rate": 0.00018329212407100994,
92
+ "loss": 1.7431,
93
+ "step": 70
94
+ },
95
+ {
96
+ "epoch": 0.6,
97
+ "learning_rate": 0.00018090169943749476,
98
+ "loss": 1.7561,
99
+ "step": 75
100
+ },
101
+ {
102
+ "epoch": 0.64,
103
+ "learning_rate": 0.000178369345732584,
104
+ "loss": 1.6185,
105
+ "step": 80
106
+ },
107
+ {
108
+ "epoch": 0.68,
109
+ "learning_rate": 0.00017569950556517566,
110
+ "loss": 1.6059,
111
+ "step": 85
112
+ },
113
+ {
114
+ "epoch": 0.72,
115
+ "learning_rate": 0.00017289686274214118,
116
+ "loss": 1.772,
117
+ "step": 90
118
+ },
119
+ {
120
+ "epoch": 0.76,
121
+ "learning_rate": 0.00016996633405133655,
122
+ "loss": 1.5847,
123
+ "step": 95
124
+ },
125
+ {
126
+ "epoch": 0.8,
127
+ "learning_rate": 0.00016691306063588583,
128
+ "loss": 1.615,
129
+ "step": 100
130
+ },
131
+ {
132
+ "epoch": 0.84,
133
+ "learning_rate": 0.000163742398974869,
134
+ "loss": 1.5706,
135
+ "step": 105
136
+ },
137
+ {
138
+ "epoch": 0.88,
139
+ "learning_rate": 0.0001604599114862375,
140
+ "loss": 1.6997,
141
+ "step": 110
142
+ },
143
+ {
144
+ "epoch": 0.92,
145
+ "learning_rate": 0.0001570713567684432,
146
+ "loss": 1.5529,
147
+ "step": 115
148
+ },
149
+ {
150
+ "epoch": 0.96,
151
+ "learning_rate": 0.00015358267949789966,
152
+ "loss": 1.6631,
153
+ "step": 120
154
+ },
155
+ {
156
+ "epoch": 1.0,
157
+ "learning_rate": 0.00015000000000000001,
158
+ "loss": 1.7483,
159
+ "step": 125
160
+ },
161
+ {
162
+ "epoch": 1.04,
163
+ "learning_rate": 0.00014632960351198618,
164
+ "loss": 1.708,
165
+ "step": 130
166
+ },
167
+ {
168
+ "epoch": 1.08,
169
+ "learning_rate": 0.00014257792915650728,
170
+ "loss": 1.6979,
171
+ "step": 135
172
+ },
173
+ {
174
+ "epoch": 1.12,
175
+ "learning_rate": 0.0001387515586452103,
176
+ "loss": 1.53,
177
+ "step": 140
178
+ },
179
+ {
180
+ "epoch": 1.16,
181
+ "learning_rate": 0.00013485720473218154,
182
+ "loss": 1.6821,
183
+ "step": 145
184
+ },
185
+ {
186
+ "epoch": 1.2,
187
+ "learning_rate": 0.00013090169943749476,
188
+ "loss": 1.7208,
189
+ "step": 150
190
+ },
191
+ {
192
+ "epoch": 1.24,
193
+ "learning_rate": 0.00012689198206152657,
194
+ "loss": 1.6841,
195
+ "step": 155
196
+ },
197
+ {
198
+ "epoch": 1.28,
199
+ "learning_rate": 0.00012283508701106557,
200
+ "loss": 1.544,
201
+ "step": 160
202
+ },
203
+ {
204
+ "epoch": 1.32,
205
+ "learning_rate": 0.00011873813145857249,
206
+ "loss": 1.5851,
207
+ "step": 165
208
+ },
209
+ {
210
+ "epoch": 1.36,
211
+ "learning_rate": 0.00011460830285624118,
212
+ "loss": 1.56,
213
+ "step": 170
214
+ },
215
+ {
216
+ "epoch": 1.4,
217
+ "learning_rate": 0.00011045284632676536,
218
+ "loss": 1.5691,
219
+ "step": 175
220
+ },
221
+ {
222
+ "epoch": 1.44,
223
+ "learning_rate": 0.00010627905195293135,
224
+ "loss": 1.5201,
225
+ "step": 180
226
+ },
227
+ {
228
+ "epoch": 1.48,
229
+ "learning_rate": 0.0001020942419883357,
230
+ "loss": 1.5098,
231
+ "step": 185
232
+ },
233
+ {
234
+ "epoch": 1.52,
235
+ "learning_rate": 9.790575801166432e-05,
236
+ "loss": 1.5805,
237
+ "step": 190
238
+ },
239
+ {
240
+ "epoch": 1.56,
241
+ "learning_rate": 9.372094804706867e-05,
242
+ "loss": 1.6742,
243
+ "step": 195
244
+ },
245
+ {
246
+ "epoch": 1.6,
247
+ "learning_rate": 8.954715367323468e-05,
248
+ "loss": 1.5656,
249
+ "step": 200
250
+ },
251
+ {
252
+ "epoch": 1.64,
253
+ "learning_rate": 8.539169714375885e-05,
254
+ "loss": 1.6301,
255
+ "step": 205
256
+ },
257
+ {
258
+ "epoch": 1.68,
259
+ "learning_rate": 8.126186854142752e-05,
260
+ "loss": 1.6027,
261
+ "step": 210
262
+ },
263
+ {
264
+ "epoch": 1.72,
265
+ "learning_rate": 7.716491298893442e-05,
266
+ "loss": 1.6494,
267
+ "step": 215
268
+ },
269
+ {
270
+ "epoch": 1.76,
271
+ "learning_rate": 7.310801793847344e-05,
272
+ "loss": 1.5962,
273
+ "step": 220
274
+ },
275
+ {
276
+ "epoch": 1.8,
277
+ "learning_rate": 6.909830056250527e-05,
278
+ "loss": 1.5375,
279
+ "step": 225
280
+ },
281
+ {
282
+ "epoch": 1.84,
283
+ "learning_rate": 6.51427952678185e-05,
284
+ "loss": 1.596,
285
+ "step": 230
286
+ },
287
+ {
288
+ "epoch": 1.88,
289
+ "learning_rate": 6.12484413547897e-05,
290
+ "loss": 1.6401,
291
+ "step": 235
292
+ },
293
+ {
294
+ "epoch": 1.92,
295
+ "learning_rate": 5.7422070843492734e-05,
296
+ "loss": 1.5735,
297
+ "step": 240
298
+ },
299
+ {
300
+ "epoch": 1.96,
301
+ "learning_rate": 5.3670396488013854e-05,
302
+ "loss": 1.6057,
303
+ "step": 245
304
+ },
305
+ {
306
+ "epoch": 2.0,
307
+ "learning_rate": 5.000000000000002e-05,
308
+ "loss": 1.5428,
309
+ "step": 250
310
+ },
311
+ {
312
+ "epoch": 2.04,
313
+ "learning_rate": 4.6417320502100316e-05,
314
+ "loss": 1.6843,
315
+ "step": 255
316
+ },
317
+ {
318
+ "epoch": 2.08,
319
+ "learning_rate": 4.2928643231556844e-05,
320
+ "loss": 1.6004,
321
+ "step": 260
322
+ },
323
+ {
324
+ "epoch": 2.12,
325
+ "learning_rate": 3.954008851376252e-05,
326
+ "loss": 1.5231,
327
+ "step": 265
328
+ },
329
+ {
330
+ "epoch": 2.16,
331
+ "learning_rate": 3.6257601025131026e-05,
332
+ "loss": 1.6147,
333
+ "step": 270
334
+ },
335
+ {
336
+ "epoch": 2.2,
337
+ "learning_rate": 3.308693936411421e-05,
338
+ "loss": 1.5095,
339
+ "step": 275
340
+ },
341
+ {
342
+ "epoch": 2.24,
343
+ "learning_rate": 3.0033665948663448e-05,
344
+ "loss": 1.6355,
345
+ "step": 280
346
+ },
347
+ {
348
+ "epoch": 2.28,
349
+ "learning_rate": 2.7103137257858868e-05,
350
+ "loss": 1.4353,
351
+ "step": 285
352
+ },
353
+ {
354
+ "epoch": 2.32,
355
+ "learning_rate": 2.4300494434824373e-05,
356
+ "loss": 1.5886,
357
+ "step": 290
358
+ },
359
+ {
360
+ "epoch": 2.36,
361
+ "learning_rate": 2.163065426741603e-05,
362
+ "loss": 1.5915,
363
+ "step": 295
364
+ },
365
+ {
366
+ "epoch": 2.4,
367
+ "learning_rate": 1.9098300562505266e-05,
368
+ "loss": 1.5584,
369
+ "step": 300
370
+ },
371
+ {
372
+ "epoch": 2.44,
373
+ "learning_rate": 1.6707875928990058e-05,
374
+ "loss": 1.4628,
375
+ "step": 305
376
+ },
377
+ {
378
+ "epoch": 2.48,
379
+ "learning_rate": 1.4463573983949341e-05,
380
+ "loss": 1.5502,
381
+ "step": 310
382
+ },
383
+ {
384
+ "epoch": 2.52,
385
+ "learning_rate": 1.2369331995613665e-05,
386
+ "loss": 1.4882,
387
+ "step": 315
388
+ },
389
+ {
390
+ "epoch": 2.56,
391
+ "learning_rate": 1.042882397605871e-05,
392
+ "loss": 1.6666,
393
+ "step": 320
394
+ },
395
+ {
396
+ "epoch": 2.6,
397
+ "learning_rate": 8.645454235739903e-06,
398
+ "loss": 1.4874,
399
+ "step": 325
400
+ },
401
+ {
402
+ "epoch": 2.64,
403
+ "learning_rate": 7.022351411174866e-06,
404
+ "loss": 1.6158,
405
+ "step": 330
406
+ },
407
+ {
408
+ "epoch": 2.68,
409
+ "learning_rate": 5.562362976251901e-06,
410
+ "loss": 1.4376,
411
+ "step": 335
412
+ },
413
+ {
414
+ "epoch": 2.72,
415
+ "learning_rate": 4.268050246793276e-06,
416
+ "loss": 1.6202,
417
+ "step": 340
418
+ },
419
+ {
420
+ "epoch": 2.76,
421
+ "learning_rate": 3.1416838871368924e-06,
422
+ "loss": 1.5493,
423
+ "step": 345
424
+ },
425
+ {
426
+ "epoch": 2.8,
427
+ "learning_rate": 2.1852399266194314e-06,
428
+ "loss": 1.6157,
429
+ "step": 350
430
+ },
431
+ {
432
+ "epoch": 2.84,
433
+ "learning_rate": 1.400396292949513e-06,
434
+ "loss": 1.5631,
435
+ "step": 355
436
+ },
437
+ {
438
+ "epoch": 2.88,
439
+ "learning_rate": 7.885298685522235e-07,
440
+ "loss": 1.5326,
441
+ "step": 360
442
+ },
443
+ {
444
+ "epoch": 2.92,
445
+ "learning_rate": 3.50714075049563e-07,
446
+ "loss": 1.5978,
447
+ "step": 365
448
+ },
449
+ {
450
+ "epoch": 2.96,
451
+ "learning_rate": 8.771699011416168e-08,
452
+ "loss": 1.4251,
453
+ "step": 370
454
+ },
455
+ {
456
+ "epoch": 3.0,
457
+ "learning_rate": 0.0,
458
+ "loss": 1.6276,
459
+ "step": 375
460
+ },
461
+ {
462
+ "epoch": 3.0,
463
+ "step": 375,
464
+ "total_flos": 4.32034079145984e+16,
465
+ "train_loss": 1.628477378845215,
466
+ "train_runtime": 1045.5144,
467
+ "train_samples_per_second": 5.739,
468
+ "train_steps_per_second": 0.359
469
+ }
470
+ ],
471
+ "logging_steps": 5,
472
+ "max_steps": 375,
473
+ "num_input_tokens_seen": 0,
474
+ "num_train_epochs": 3,
475
+ "save_steps": 100,
476
+ "total_flos": 4.32034079145984e+16,
477
+ "train_batch_size": 2,
478
+ "trial_name": null,
479
+ "trial_params": null
480
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84990e28d454e290f6393201e221ba051e01af18e581a4f5994ac8396ad7c48b
3
+ size 4920