XiaoY1 committed on
Commit
79de4e4
1 Parent(s): a911955

Upload tokenization_mupt.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_mupt.py +371 -0
tokenization_mupt.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for MuPT (byte-level BPE, adapted from the GPT-2 tokenizer)."""
16
+
17
+
18
+ import json
19
+ import os
20
+ from functools import lru_cache
21
+ from typing import List, Optional, Tuple
22
+
23
+ import regex as re
24
+
25
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
26
+ from transformers.utils import logging
27
+
28
+
29
# Module-level logger, obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)

# Base file names of the two resources the tokenizer is built from: the token -> id vocabulary (JSON) and the
# ordered list of BPE merge rules (plain text). `save_vocabulary` writes files with these same names.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Checkpoint name -> download URL, for each resource listed in VOCAB_FILES_NAMES.
# NOTE(review): every entry here is keyed by a GPT-2 checkpoint name and points at a GPT-2 repository, whereas
# PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES below is keyed by MuPT checkpoint names. Presumably a leftover from the
# GPT-2 tokenizer this file was adapted from -- confirm whether MuPT URLs should be listed instead.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
    },
    "merges_file": {
        "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
    },
}

# Checkpoint name -> size of the positional embeddings; exposed on the tokenizer class as
# `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "mupt-110M": 8192,
    "mupt-345M": 8192,
    "mupt-770M": 8192,
    "mupt-1.3B": 8192,
}
59
+
60
+
61
@lru_cache()
def bytes_to_unicode():
    """
    Build the byte -> unicode-character lookup table used by the byte-level BPE.

    Every one of the 256 possible byte values is assigned a distinct single-character string. Bytes that are
    already printable, non-whitespace characters ("!".."~", "¡".."¬", "®".."ÿ") map to themselves; every other byte
    (whitespace, control characters, ...) is assigned the next unused code point starting at 256, so that the BPE
    code never has to handle whitespace/control characters directly.

    Working on such a table instead of on raw unicode text keeps the base alphabet at 256 symbols, which avoids
    both unknown tokens and spending a large share of the vocabulary on individual unicode characters.

    The result is cached, so every call returns the same `dict` object.

    Returns:
        `dict`: mapping from `int` byte value (0-255) to a one-character `str`.
    """
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(self_mapped)
    code_points = list(self_mapped)

    # Remaining bytes are appended in ascending order and shifted past the 8-bit range.
    shifted = 0
    for byte in range(2**8):
        if byte in self_mapped:
            continue
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
84
+
85
+
86
def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    Word is represented as a sequence of symbols (symbols being variable-length strings), typically a tuple. A
    plain string is treated as a sequence of single characters.

    Args:
        word: Sliceable sequence of symbols.

    Returns:
        `set`: every `(left, right)` tuple of neighbouring symbols. The set is empty when `word` holds fewer than
        two symbols, including when it is empty.
    """
    # Zipping the sequence with itself shifted by one yields each neighbouring pair once and, unlike reading
    # `word[0]` up front, does not raise on an empty word.
    return set(zip(word, word[1:]))
98
+
99
+
100
class MuPTTokenizer(PreTrainedTokenizer):
    """
    Construct a MuPT tokenizer. Based on byte-level Byte-Pair-Encoding, implemented the same way as the GPT-2
    tokenizer.

    This tokenizer treats spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded
    differently whether it is at the beginning of the sentence (without space) or not. The example below
    illustrates the effect with the GPT-2 tokenizer; the actual ids depend on the vocabulary files that are loaded:

    ```python
    >>> from transformers import GPT2Tokenizer

    >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    >>> tokenizer("Hello world")["input_ids"]
    [15496, 995]

    >>> tokenizer(" Hello world")["input_ids"]
    [18435, 995]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but if the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (The tokenizer detects the beginning of words by the preceding space).
        add_bos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
            word just as any other word.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        add_prefix_space=False,
        add_bos_token=False,
        **kwargs,
    ):
        """Load the vocabulary and merge rules from disk and set up the BPE state (see the class docstring)."""
        # Special tokens given as plain strings are wrapped so that surrounding whitespace is never stripped.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        self.add_bos_token = add_bos_token

        # token (str) -> id (int), and the reverse table used for decoding.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        with open(merges_file, encoding="utf-8") as merges_handle:
            # The first line is the "#version" header (see `save_vocabulary`) and is skipped.
            merge_lines = merges_handle.read().split("\n")[1:]
        # Blank lines (notably the empty string after a trailing newline) are ignored. Filtering them, instead of
        # unconditionally dropping the last element, keeps the final merge rule of a file that does not end with
        # a newline.
        bpe_merges = [tuple(merge.split()) for merge in merge_lines if merge.strip()]
        # Lower rank == higher merge priority (rules are listed in the order they must be applied).
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}  # memoizes `bpe()` results per pre-token
        self.add_prefix_space = add_prefix_space

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary file (tokens added afterwards are not counted)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the full token -> id mapping: the base vocabulary merged with the added tokens."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        """
        Apply the learned merge rules to a single (byte-encoded) pre-token.

        Args:
            token (`str`): Pre-token whose characters are the symbols produced by `byte_encoder`.

        Returns:
            `str`: The resulting BPE symbols joined by single spaces. A token made of a single symbol is returned
            unchanged.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the best (lowest) rank; pairs without a rule rank as +inf.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                # No remaining pair has a merge rule: the word is fully merged.
                break
            first, second = bigram
            new_word = []
            i = 0
            # Rebuild the word left to right, fusing every occurrence of (first, second).
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur any more: copy the tail as is.
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from one sequence or a pair of sequences.

        When `add_bos_token` is set, the BOS id is placed in front of each sequence; otherwise the ids are simply
        concatenated.

        Args:
            token_ids_0 (`List[int]`): Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*): Ids of the second sequence, for sequence pairs.

        Returns:
            `List[int]`: The input ids including the special tokens.
        """
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is None:
            return output

        return output + bos_token_ids + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if not self.add_bos_token:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
            )

        # Mirrors `build_inputs_with_special_tokens`: one BOS in front of each sequence.
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))

    def _tokenize(self, text):
        """Tokenize a string: split it with the pre-tokenization pattern, then run BPE on each piece."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab; unknown tokens get the id of the unknown token."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab; returns `None` for an unknown index."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string by undoing the byte -> unicode mapping."""
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary and the merge rules to `save_directory`.

        Args:
            save_directory (`str`): Existing directory to write into.
            filename_prefix (`str`, *optional*): When given, `"<prefix>-"` is put in front of both file names.

        Returns:
            `Tuple[str]`: Paths of the vocabulary file and the merges file. If `save_directory` is not a directory,
            an error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            # Header line; `__init__` skips it when reading the file back.
            writer.write("#version: 0.2\n")
            # Rules are written in rank order; a gap in the ranks is reported but does not stop the save.
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Prepend a space to `text` when the input is pre-split into words or `add_prefix_space` is requested.

        `add_prefix_space` is taken from `kwargs` when present (and removed from them), otherwise from the value
        given at construction time.

        Returns:
            `tuple`: the (possibly prefixed) text and the remaining keyword arguments.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    @property
    def default_chat_template(self):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"