dqnguyen committed on
Commit c82753d
1 Parent(s): b32e9f3

Upload 3 files

__init__.py ADDED
File without changes
tokenization_bartpho.py ADDED
@@ -0,0 +1,329 @@
+ # coding=utf-8
+ # Copyright 2021 VinAI Research and the HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tokenization classes for the BARTpho-syllable model."""
+
+
+ import os
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import sentencepiece as spm
+
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ SPIECE_UNDERLINE = "▁"
+
+ VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "monolingual_vocab_file": "dict.txt"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/sentencepiece.bpe.model",
+     },
+     "monolingual_vocab_file": {
+         "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/dict.txt",
+     },
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
+
+
+ class BartphoTokenizer(PreTrainedTokenizer):
+     """
+     Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+     this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the
+             multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types.
+         monolingual_vocab_file (`str`):
+             Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized
+             types extracted from the multilingual vocabulary vocab_file of 250K types.
+         bos_token (`str`, *optional*, defaults to `"<s>"`):
+             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+             <Tip>
+
+             When building a sequence using special tokens, this is not the token that is used for the beginning of
+             sequence. The token used is the `cls_token`.
+
+             </Tip>
+
+         eos_token (`str`, *optional*, defaults to `"</s>"`):
+             The end of sequence token.
+
+             <Tip>
+
+             When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+             The token used is the `sep_token`.
+
+             </Tip>
+
+         sep_token (`str`, *optional*, defaults to `"</s>"`):
+             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+             sequence classification or for a text and a question for question answering. It is also used as the last
+             token of a sequence built with special tokens.
+         cls_token (`str`, *optional*, defaults to `"<s>"`):
+             The classifier token which is used when doing sequence classification (classification of the whole sequence
+             instead of per-token classification). It is the first token of the sequence when built with special tokens.
+         unk_token (`str`, *optional*, defaults to `"<unk>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         pad_token (`str`, *optional*, defaults to `"<pad>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         mask_token (`str`, *optional*, defaults to `"<mask>"`):
+             The token used for masking values. This is the token used when training this model with masked language
+             modeling. This is the token which the model will try to predict.
+         additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
+             Additional special tokens used by the tokenizer.
+         sp_model_kwargs (`dict`, *optional*):
+             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+             to set:
+
+             - `enable_sampling`: Enable subword regularization.
+             - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+               - `nbest_size = {0,1}`: No sampling is performed.
+               - `nbest_size > 1`: samples from the nbest_size results.
+               - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+                 using the forward-filtering-and-backward-sampling algorithm.
+
+             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+               BPE-dropout.
+
+     Attributes:
+         sp_model (`SentencePieceProcessor`):
+             The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         monolingual_vocab_file,
+         bos_token="<s>",
+         eos_token="</s>",
+         sep_token="</s>",
+         cls_token="<s>",
+         unk_token="<unk>",
+         pad_token="<pad>",
+         mask_token="<mask>",
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs
+     ) -> None:
+         # The mask token behaves like a normal word, i.e. it includes the space before it
+         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             pad_token=pad_token,
+             mask_token=mask_token,
+             sp_model_kwargs=self.sp_model_kwargs,
+             **kwargs,
+         )
+
+         self.vocab_file = vocab_file
+         self.monolingual_vocab_file = monolingual_vocab_file
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(str(vocab_file))
+
+         # Load the reduced vocab
+
+         # Keep order of special tokens for backward compatibility
+         self.fairseq_tokens_to_ids = {}
+         cnt = 0
+         for token in [bos_token, pad_token, eos_token, unk_token, sep_token, cls_token]:
+             if str(token) not in self.fairseq_tokens_to_ids:
+                 self.fairseq_tokens_to_ids[str(token)] = cnt
+                 cnt += 1
+         with open(monolingual_vocab_file, "r", encoding="utf-8") as f:
+             for line in f.readlines():
+                 token = line.strip().split()[0]
+                 self.fairseq_tokens_to_ids[token] = len(self.fairseq_tokens_to_ids)
+         if str(mask_token) not in self.fairseq_tokens_to_ids:
+             self.fairseq_tokens_to_ids[str(mask_token)] = len(self.fairseq_tokens_to_ids)
+
+         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["sp_model"] = None
+         state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+
+         # for backward compatibility
+         if not hasattr(self, "sp_model_kwargs"):
+             self.sp_model_kwargs = {}
+
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+         adding special tokens. A BARTpho sequence has the following format:
+
+         - single sequence: `<s> X </s>`
+         - pair of sequences: `<s> A </s></s> B </s>`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is None:
+             return [1] + ([0] * len(token_ids_0)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTpho does not
+         make use of token type ids, therefore a list of zeros is returned.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of zeros.
+
+         """
+
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+     @property
+     def vocab_size(self):
+         return len(self.fairseq_ids_to_tokens)
+
+     def get_vocab(self):
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text: str) -> List[str]:
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) into an id using the vocab."""
+         if token in self.fairseq_tokens_to_ids:
+             return self.fairseq_tokens_to_ids[token]
+         else:
+             return self.unk_token_id
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) into a token (str) using the vocab."""
+         return self.fairseq_ids_to_tokens[index]
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (strings for sub-words) into a single string."""
+         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+         return out_string
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+         out_monolingual_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["monolingual_vocab_file"],
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath(
+             out_monolingual_vocab_file
+         ) and os.path.isfile(self.monolingual_vocab_file):
+             copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file)
+         elif not os.path.isfile(self.monolingual_vocab_file):
+             with open(out_monolingual_vocab_file, "w", encoding="utf-8") as fp:
+                 for token in self.fairseq_tokens_to_ids:
+                     if token not in self.all_special_tokens:
+                         fp.write(f"{str(token)} \n")
+
+         return out_vocab_file, out_monolingual_vocab_file
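
A minimal usage sketch for the slow tokenizer above (not part of the uploaded files). It assumes local copies of sentencepiece.bpe.model and dict.txt, the file names declared in VOCAB_FILES_NAMES, and a transformers release contemporary with this code; the Vietnamese sentences are illustrative only.

# Sketch only: exercises BartphoTokenizer as defined in tokenization_bartpho.py above.
# Assumes sentencepiece.bpe.model and dict.txt have been downloaded into the working directory.
from tokenization_bartpho import BartphoTokenizer

tokenizer = BartphoTokenizer(
    vocab_file="sentencepiece.bpe.model",
    monolingual_vocab_file="dict.txt",
)

# A single sequence is wrapped as <s> X </s> by build_inputs_with_special_tokens.
encoded = tokenizer("Chúng tôi là những nghiên cứu viên.")
print(encoded["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))

# A pair of sequences is wrapped as <s> A </s></s> B </s>.
pair = tokenizer("Câu thứ nhất.", "Câu thứ hai.")
print(tokenizer.decode(pair["input_ids"]))
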
tokenization_bartpho_fast.py ADDED
@@ -0,0 +1,334 @@
+ # coding=utf-8
+ # Copyright 2021 VinAI Research and the HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tokenization classes for the BARTpho-syllable model."""
+
+ import os
+ from collections import defaultdict
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from transformers.tokenization_utils import AddedToken
+ from transformers.tokenization_utils_base import EncodingFast
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+ from transformers.utils import is_sentencepiece_available, logging
+
+
+ if is_sentencepiece_available():
+     from .tokenization_bartpho import BartphoTokenizer
+ else:
+     BartphoTokenizer = None
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {
+     "vocab_file": "sentencepiece.bpe.model",
+     "monolingual_vocab_file": "dict.txt",
+     "tokenizer_file": "tokenizer.json",
+ }
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/sentencepiece.bpe.model",
+     },
+     "monolingual_vocab_file": {
+         "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/dict.txt",
+     },
+     "tokenizer_file": {
+         "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/tokenizer.json",
+     },
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
+
+
+ class BartphoTokenizerFast(PreTrainedTokenizerFast):
+     """
+     Construct a "fast" BARTpho tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+     [`XLMRobertaTokenizerFast`]. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+     refer to this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file.
+         bos_token (`str`, *optional*, defaults to `"<s>"`):
+             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+             <Tip>
+
+             When building a sequence using special tokens, this is not the token that is used for the beginning of
+             sequence. The token used is the `cls_token`.
+
+             </Tip>
+
+         eos_token (`str`, *optional*, defaults to `"</s>"`):
+             The end of sequence token.
+
+             <Tip>
+
+             When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+             The token used is the `sep_token`.
+
+             </Tip>
+
+         sep_token (`str`, *optional*, defaults to `"</s>"`):
+             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+             sequence classification or for a text and a question for question answering. It is also used as the last
+             token of a sequence built with special tokens.
+         cls_token (`str`, *optional*, defaults to `"<s>"`):
+             The classifier token which is used when doing sequence classification (classification of the whole sequence
+             instead of per-token classification). It is the first token of the sequence when built with special tokens.
+         unk_token (`str`, *optional*, defaults to `"<unk>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         pad_token (`str`, *optional*, defaults to `"<pad>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         mask_token (`str`, *optional*, defaults to `"<mask>"`):
+             The token used for masking values. This is the token used when training this model with masked language
+             modeling. This is the token which the model will try to predict.
+         additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
+             Additional special tokens used by the tokenizer.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+     slow_tokenizer_class = BartphoTokenizer
+
+     def __init__(
+         self,
+         vocab_file=None,
+         monolingual_vocab_file=None,
+         tokenizer_file=None,
+         bos_token="<s>",
+         eos_token="</s>",
+         sep_token="</s>",
+         cls_token="<s>",
+         unk_token="<unk>",
+         pad_token="<pad>",
+         mask_token="<mask>",
+         **kwargs
+     ):
+         # The mask token behaves like a normal word, i.e. it includes the space before it
+         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+         super().__init__(
+             vocab_file,
+             monolingual_vocab_file,
+             tokenizer_file=tokenizer_file,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             mask_token=mask_token,
+             **kwargs,
+         )
+
+         self.vocab_file = vocab_file
+         self.monolingual_vocab_file = monolingual_vocab_file
+         self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+     def get_added_vocab_hacking(self):
+         """
+         Returns the added tokens in the vocabulary as a dictionary of token to index.
+
+         Returns:
+             `Dict[str, int], Dict[int, int]`: The added tokens with their remapped ids, and a mapping from the
+             remapped ids back to the original ids.
+         """
+         base_vocab_size = self._tokenizer.get_vocab_size(with_added_tokens=False)
+         full_vocab_size = self._tokenizer.get_vocab_size(with_added_tokens=True)
+         if full_vocab_size == base_vocab_size:
+             return {}, {}
+
+         # Tokens in added_vocab should have ids that are equal to or larger than the size of base_vocab
+         added_vocab = dict(
+             (self._tokenizer.id_to_token(index), index + 1 - base_vocab_size + self.mask_token_id)
+             for index in range(base_vocab_size, full_vocab_size)
+         )
+
+         id_mapping = dict((index, self._tokenizer.token_to_id(tok)) for tok, index in added_vocab.items())
+
+         return added_vocab, id_mapping
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         skip_special_tokens: bool = False,
+         clean_up_tokenization_spaces: bool = True,
+         **kwargs
+     ) -> str:
+         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+
+         # Map ids back to their original values
+         _, id_mapping = self.get_added_vocab_hacking()
+         if len(id_mapping) > 0:
+             token_ids = [id_mapping[id] if id in id_mapping else id for id in token_ids]
+
+         text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
+         if clean_up_tokenization_spaces:
+             clean_text = self.clean_up_tokenization(text)
+             return clean_text
+         else:
+             return text
+
+     def _convert_encoding(
+         self,
+         encoding: EncodingFast,
+         return_token_type_ids: Optional[bool] = None,
+         return_attention_mask: Optional[bool] = None,
+         return_overflowing_tokens: bool = False,
+         return_special_tokens_mask: bool = False,
+         return_offsets_mapping: bool = False,
+         return_length: bool = False,
+         verbose: bool = True,
+     ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
+         """
+         Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
+         of encodings, taking care of building a batch from overflowing tokens.
+
+         Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
+         lists (overflows) of lists (tokens).
+
+         Output shape: (overflows, sequence length)
+         """
+         if return_token_type_ids is None:
+             return_token_type_ids = "token_type_ids" in self.model_input_names
+         if return_attention_mask is None:
+             return_attention_mask = "attention_mask" in self.model_input_names
+
+         if return_overflowing_tokens and encoding.overflowing is not None:
+             encodings = [encoding] + encoding.overflowing
+         else:
+             encodings = [encoding]
+
+         encoding_dict = defaultdict(list)
+         added_vocab, _ = self.get_added_vocab_hacking()
+         for e in encodings:
+             # encoding_dict["input_ids"].append(e.ids)
+             # Reassign ids of tokens due to the hacking strategy
+             ids = []
+             for id, token in zip(e.ids, e.tokens):
+                 if id <= self.mask_token_id:
+                     ids.append(id)
+                 else:
+                     if token.strip() in added_vocab:
+                         ids.append(added_vocab[token.strip()])
+                     else:
+                         ids.append(self.unk_token_id)
+
+             encoding_dict["input_ids"].append(ids)
+
+             if return_token_type_ids:
+                 encoding_dict["token_type_ids"].append(e.type_ids)
+             if return_attention_mask:
+                 encoding_dict["attention_mask"].append(e.attention_mask)
+             if return_special_tokens_mask:
+                 encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
+             if return_offsets_mapping:
+                 encoding_dict["offset_mapping"].append(e.offsets)
+             if return_length:
+                 # encoding_dict["length"].append(len(e.ids))
+                 encoding_dict["length"].append(len(ids))
+
+         return encoding_dict, encodings
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+         adding special tokens. A BARTpho sequence has the following format:
+
+         - single sequence: `<s> X </s>`
+         - pair of sequences: `<s> A </s></s> B </s>`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTpho does not
+         make use of token type ids, therefore a list of zeros is returned.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of zeros.
+
+         """
+
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not self.can_save_slow_tokenizer:
+             raise ValueError(
+                 "Your fast tokenizer does not have the necessary information to save the vocabulary for a "
+                 "slow tokenizer."
+             )
+
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
+             return
+
+         out_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
+         )
+
+         out_monolingual_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["monolingual_vocab_file"],
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath(out_monolingual_vocab_file):
+             copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file)
+
+         return (out_vocab_file, out_monolingual_vocab_file)
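
A matching sketch for the fast tokenizer (again not part of the uploaded files). It additionally assumes a prebuilt tokenizer.json sits alongside the two vocabulary files, mirroring the entries in PRETRAINED_VOCAB_FILES_MAP. Note how the two custom hooks interact: _convert_encoding remaps the ids of added tokens at encode time, and _decode maps them back to the underlying tokenizer's ids before decoding.

# Sketch only: exercises BartphoTokenizerFast as defined in tokenization_bartpho_fast.py above.
# Assumes sentencepiece.bpe.model, dict.txt and tokenizer.json are available in the working directory.
from tokenization_bartpho_fast import BartphoTokenizerFast

fast_tokenizer = BartphoTokenizerFast(
    vocab_file="sentencepiece.bpe.model",
    monolingual_vocab_file="dict.txt",
    tokenizer_file="tokenizer.json",
)

encoded = fast_tokenizer("Chúng tôi là những nghiên cứu viên.")
print(encoded["input_ids"])  # ids already remapped by _convert_encoding
print(fast_tokenizer.decode(encoded["input_ids"]))  # _decode maps them back before decoding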