jasonfang3900 committed on
Commit 3f812d1
1 Parent(s): 1902962

Upload tokenization_flm.py with huggingface_hub

Files changed (1)
  1. tokenization_flm.py +403 -0
tokenization_flm.py ADDED
@@ -0,0 +1,403 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization classes for FLM."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm
import re
from transformers.convert_slow_tokenizer import import_protobuf
from transformers import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
from transformers.tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "flm-tokenizer": 8192,
}
SPIECE_UNDERLINE = "▁"


class FLMTokenizer(PreTrainedTokenizer):
    """
    Construct a FLM tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.

    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=False,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        spaces_between_special_tokens=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )
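
    # Sketch of how `sp_model_kwargs` reaches `SentencePieceProcessor`, following the
    # subword-regularization options described in the class docstring. The model path and the
    # chosen values are only an illustration, not settings shipped with the model:
    #
    #   tok = FLMTokenizer(
    #       "tokenizer.model",
    #       sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
    #   )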

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
    def get_spm_processor(self, from_slow=False):
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            # Keep SentencePiece's dummy prefix ("▁") enabled in the normalizer spec before loading.
            normalizer_spec.add_dummy_prefix = True
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
        (BPE/SentencePiece/WordPiece). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        """
        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
        remove_dummy_prefix = kwargs.pop("remove_dummy_prefix", False)

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if kwargs:
            logger.warning(f"Keyword arguments {kwargs} not recognized.")

        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # convert non-special tokens to lowercase. Might be super slow as well?
            escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
            escaped_special_toks += [
                re.escape(s_tok.content)
                for s_tok in (self._added_tokens_decoder.values())
                if not s_tok.special and s_tok.normalized
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        if split_special_tokens:
            no_split_token = []
            tokens = [text]
        else:
            no_split_token = self._added_tokens_encoder.keys()  # don't split on any of the added tokens
            # "This is something<special_token_1> else"
            tokens = self.tokens_trie.split(text)

        # ["This is something", "<special_token_1>", " else"]
        for i, token in enumerate(tokens):
            if token in no_split_token:
                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
                left = tokens[i - 1] if i > 0 else None
                right = tokens[i + 1] if i < len(tokens) - 1 else None
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.rstrip and right:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                        tokens[i + 1] = right.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and left:
                        tokens[i - 1] = left.rstrip()  # Opposite here
                    if tok_extended.single_word and left and left[-1] != " ":
                        tokens[i - 1] += token
                        tokens[i] = ""
                    elif tok_extended.single_word and right and right[0] != " ":
                        tokens[i + 1] = token + tokens[i + 1]
                        tokens[i] = ""
                else:
                    raise ValueError(
                        f"{tok_extended} cannot be tokenized because it was not properly added"
                        f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
                    )
        # ["This is something", "<special_token_1>", "else"]
        tokenized_text = []
        for token in tokens:
            # Need to skip eventual empty (fully stripped) tokens
            if not token:
                continue
            if token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token, remove_dummy_prefix=remove_dummy_prefix))
        # ["This", " is", " something", "<special_token_1>", "else"]
        return tokenized_text
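
    # Illustrative walk-through of the flow above, with a hypothetical added special token and
    # hypothetical SentencePiece pieces (the exact pieces depend on the actual `tokenizer.model`):
    #
    #   tok = FLMTokenizer("tokenizer.model")
    #   tok.add_tokens(["<special_token_1>"], special_tokens=True)
    #   tok.tokenize("This is something<special_token_1> else")
    #   # trie split:        ["This is something", "<special_token_1>", " else"]
    #   # after _tokenize:   ["▁This", "▁is", "▁something", "<special_token_1>", "▁else"]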

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We add an option to remove the dummy prefix during tokenization instead of changing the default behaviour of
        the SentencePiece tokenizer. This is useful when two tokenized sentences are merged into one: without it, the
        second sentence carries an extra dummy prefix, which results in an inconsistent pattern.
        """
        tokens = self.sp_model.encode(text, out_type=str)
        if text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens
        if len(tokens) > 0 and kwargs.get("remove_dummy_prefix") is True:
            tokens[0] = tokens[0].replace(SPIECE_UNDERLINE, "", 1)
        return tokens
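
    # Sketch of the `remove_dummy_prefix` option (hypothetical pieces; real output depends on the
    # vocabulary in `tokenizer.model`):
    #
    #   tok._tokenize(", world")                            # ["▁,", "▁world"]  -- dummy "▁" prepended
    #   tok._tokenize(", world", remove_dummy_prefix=True)  # [",", "▁world"]
    #
    # The second form lets `tokenize("Hello") + tokenize(", world")` line up with tokenizing
    # "Hello, world" in a single pass, where no space precedes the comma.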

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        # prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                # if not prev_is_special and i != 0 and self.legacy:
                #     out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                # prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                # prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output
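
    # Layout produced above, assuming the tokenizer was created with add_bos_token=True and
    # add_eos_token=False (illustrative only):
    #
    #   single sequence:  [bos] + token_ids_0
    #   pair:             [bos] + token_ids_0 + [bos] + token_ids_1
    #
    # With both flags enabled the pattern becomes [bos] ids_0 [eos] [bos] ids_1 [eos].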

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )
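
    # Example mask, assuming add_bos_token=True and add_eos_token=False:
    #
    #   token_ids_0 = [11, 12, 13]
    #   token_ids_1 = [21, 22]
    #   get_special_tokens_mask(token_ids_0, token_ids_1)  # -> [1, 0, 0, 0, 1, 0, 0]
    #
    # i.e. a 1 for each inserted <s> and a 0 for every ordinary token.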

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
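

# Minimal usage sketch, assuming a SentencePiece model file named "tokenizer.model" sits next to
# this script (the filename follows VOCAB_FILES_NAMES above; the printed pieces and ids depend on
# the actual vocabulary and are only illustrative).
if __name__ == "__main__":
    tokenizer = FLMTokenizer("tokenizer.model", add_bos_token=True)

    text = "Hello, world!"
    pieces = tokenizer.tokenize(text)  # e.g. ["▁Hello", ",", "▁world", "!"]
    ids = tokenizer.encode(text)  # bos id prepended because add_bos_token=True
    print(pieces)
    print(ids)
    print(tokenizer.decode(ids, skip_special_tokens=True))  # should recover the original text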