superdocker committed on
Commit bc7c04c
1 Parent(s): 2fc4a43

Upload tokenizer

midm_bitext_tokenization.py ADDED
@@ -0,0 +1,307 @@
+ # coding=utf-8
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Tokenization class for model Midm_bitext_Tokenizer."""
+ import os
+ import re
+ import warnings
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import sentencepiece as spm
+
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "midm_bitext_tokenizer.model"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {}
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+
+ class Midm_bitext_Tokenizer(PreTrainedTokenizer):
+     """
+     Construct a Midm bitext tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+     this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+             contains the vocabulary necessary to instantiate a tokenizer.
+         eos_token (`str`, *optional*, defaults to `"</s>"`):
+             The end of sequence token.
+
+             <Tip>
+
+             When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+             The token used is the `sep_token`.
+
+             </Tip>
+
+         unk_token (`str`, *optional*, defaults to `"<unk>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         pad_token (`str`, *optional*, defaults to `"<pad>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         extra_ids (`int`, *optional*, defaults to 100):
+             Adds a number of extra ids to the end of the vocabulary for use as sentinels. These tokens are
+             accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
+             indexed from the end of the vocabulary up to the beginning.
+         additional_special_tokens (`List[str]`, *optional*):
+             Additional special tokens used by the tokenizer.
+         sp_model_kwargs (`dict`, *optional*):
+             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+             to set:
+
+             - `enable_sampling`: Enable subword regularization.
+             - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+               - `nbest_size = {0,1}`: No sampling is performed.
+               - `nbest_size > 1`: samples from the nbest_size results.
+               - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+                 using the forward-filtering-and-backward-sampling algorithm.
+
+             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+               BPE-dropout.
+
+     Attributes:
+         sp_model (`SentencePieceProcessor`):
+             The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         eos_token="</s>",
+         unk_token="<unk>",
+         pad_token="<pad>",
+         extra_ids=100,
+         additional_special_tokens=None,
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs,
+     ) -> None:
+         # Add extra_ids to the special token list
+         if extra_ids > 0 and additional_special_tokens is None:
+             additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
+         elif extra_ids > 0 and additional_special_tokens is not None:
+             # Check that we have the right number of extra_id special tokens
+             extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
+             if extra_tokens != extra_ids:
+                 raise ValueError(
+                     f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are provided to Midm_bitext_Tokenizer. "
+                     "In this case the additional_special_tokens must include the extra_ids tokens."
+                 )
+
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+         # custom special tokens
+         # convert \n, \t in input text -> <[!newline]>, <[!tab]>
+         self.newline_token = "<[!newline]>"
+         self.tab_token = "<[!tab]>"
+
+         self.vocab_file = vocab_file
+         self._extra_ids = extra_ids
+
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+         super().__init__(
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             extra_ids=extra_ids,
+             additional_special_tokens=additional_special_tokens,
+             sp_model_kwargs=self.sp_model_kwargs,
+             **kwargs,
+         )
+
+     @property
+     def vocab_size(self):
+         return self.sp_model.get_piece_size() + self._extra_ids
+
+     def get_vocab(self):
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         # normal case: some special tokens
+         if token_ids_1 is None:
+             return ([0] * len(token_ids_0)) + [1]
+         return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+     def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
+         """Do not add eos again if user already added it."""
+         if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
+             warnings.warn(
+                 f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
+             )
+             return token_ids
+         else:
+             return token_ids
+             # return token_ids + [self.eos_token_id]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. Midm does not
+         make use of token type ids, therefore a list of zeros is returned.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of zeros.
+         """
+         eos = [self.eos_token_id]
+
+         if token_ids_1 is None:
+             return len(token_ids_0 + eos) * [0]
+         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+         and adding special tokens. A sequence has the following format:
+
+         - single sequence: `X </s>`
+         - pair of sequences: `A </s> B </s>`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+         token_ids_0 = self._add_eos_if_not_present(token_ids_0)
+         if token_ids_1 is None:
+             return token_ids_0
+         else:
+             token_ids_1 = self._add_eos_if_not_present(token_ids_1)
+             return token_ids_0 + token_ids_1
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["sp_model"] = None
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+
+         # for backward compatibility
+         if not hasattr(self, "sp_model_kwargs"):
+             self.sp_model_kwargs = {}
+
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(self.vocab_file)
+
+     def _tokenize(self, text: str) -> List[str]:
+         """Take as input a string and return a list of strings (tokens) for words/sub-words."""
+         text = text.replace("\n", self.newline_token)
+         text = text.replace("\t", self.tab_token)
+
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         if token.startswith("<extra_id_"):
+             match = re.match(r"<extra_id_(\d+)>", token)
+             num = int(match.group(1))
+             return self.vocab_size - num - 1
+         return self.sp_model.piece_to_id(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         if index < self.sp_model.get_piece_size():
+             token = self.sp_model.IdToPiece(index)
+         else:
+             token = f"<extra_id_{self.vocab_size - 1 - index}>"
+         return token
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (strings) to a single string."""
+         current_sub_tokens = []
+         out_string = ""
+         for token in tokens:
+             # make sure that special tokens are not decoded using sentencepiece model
+             if token in self.all_special_tokens:
+                 out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " "
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+         out_string += self.sp_model.decode_pieces(current_sub_tokens)
+
+         # str.replace returns a new string; assign the result so the custom
+         # newline/tab markers are actually mapped back to "\n" and "\t".
+         out_string = out_string.replace(self.newline_token, "\n")
+         out_string = out_string.replace(self.tab_token, "\t")
+
+         return out_string.strip()
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
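For reference, a minimal usage sketch (not part of this commit) that exercises the class above. It assumes midm_bitext_tokenization.py and midm_bitext_tokenizer.model sit in the current working directory, and that the SentencePiece model knows the custom <[!newline]> and <[!tab]> pieces; extra_ids=0 mirrors the setting in tokenizer_config.json below.

# Hypothetical local sketch; file names taken from this commit.
from midm_bitext_tokenization import Midm_bitext_Tokenizer

tok = Midm_bitext_Tokenizer("midm_bitext_tokenizer.model", extra_ids=0)

# _tokenize rewrites "\n" -> "<[!newline]>" and "\t" -> "<[!tab]>" before
# SentencePiece encoding, so these markers (or their sub-pieces) show up
# among the returned tokens.
text = "first line\n\tsecond line"
tokens = tok.tokenize(text)
ids = tok.convert_tokens_to_ids(tokens)

# convert_tokens_to_string decodes with sp_model.decode_pieces and maps the
# markers back to real newline/tab characters.
print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(ids)))

Note that in this version `_add_eos_if_not_present` returns the ids unchanged, so `build_inputs_with_special_tokens` does not actually append `</s>`.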
midm_bitext_tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98789fa1bf89a1f9692889fb4a0029d3d096a9109cebf4f6bce1a255f2701378
+ size 1457356
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "auto_map": {
+     "AutoTokenizer": [
+       "midm_bitext_tokenization.Midm_bitext_Tokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "extra_ids": 0,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "Midm_bitext_Tokenizer",
+   "unk_token": "<unk>"
+ }
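The `auto_map` entry above wires `AutoTokenizer` to `midm_bitext_tokenization.Midm_bitext_Tokenizer`, so the tokenizer can be loaded through the custom code shipped with this commit. A hedged sketch follows; "<repo-id-or-local-path>" is a placeholder, since the repository id is not stated in the commit itself.

from transformers import AutoTokenizer

# trust_remote_code=True is required so that midm_bitext_tokenization.py
# from the repository is executed to build the tokenizer class.
tok = AutoTokenizer.from_pretrained(
    "<repo-id-or-local-path>",  # placeholder, not specified here
    trust_remote_code=True,
)
print(tok("Hello\tworld\n")["input_ids"])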