kkmkorea committed on
Commit
5989d8a
1 Parent(s): 9292d0c

Upload tokenization_korscideberta_v2.py

Files changed (1)
  1. tokenization_korscideberta_v2.py +580 -0
tokenization_korscideberta_v2.py ADDED
@@ -0,0 +1,580 @@
+ # coding=utf-8
+ # Copyright 2020 Microsoft and the HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Tokenization class for model DeBERTa."""
+
+ import os
+ import unicodedata
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import sentencepiece as sp
+
+ from transformers import AddedToken, PreTrainedTokenizer
+ from transformers import logging
+ # 2023. 7. 28. Added morpheme segmentation (Mecab) and Unicode normalization
+ from konlpy.tag import Mecab
+ from unicode import join_jamos
+ from normalize import MosesPunctNormalizer
+ nor = MosesPunctNormalizer()
+
+ def has_coda(word):
+     return (ord(word[-1]) -44032)%28==0
+ def _replace_unicode(line):
+     if(line==None):
+         return ""
+     line = line.replace("—",'-').replace("―","-").replace("–","-").replace("＂",'"').replace("＇","'").replace("‹","<").replace("›",">").replace("‚","'").replace("‛","'").replace("„",'"').replace("‟",'"').replace("«",'<').replace("»",'>').replace("˝",'"').replace("（",'(').replace("）",')').replace("『",'"').replace("』",'"').replace("“",'"').replace("”",'"').replace("‘","'").replace("’","'").replace("《","<").replace("》",">").replace("〈","<").replace("〉",">").replace("「","'").replace("」","'").replace("【","[").replace("】","]").replace("〔","[").replace("〕","]").replace("［","[").replace("］","]").replace("｛","{").replace("｝","}")
+     line=nor.replace_unicode_punct(line)
+     return line
+ def _mecab(line):
+     mecab = Mecab()
+     # Reference: VV verb, VA adjective, VX auxiliary predicate, VCP positive copula, VCN negative copula, JKS nominative case particle, JKC complement case particle, … XSN noun-derivational suffix, XSV verb-derivational suffix, XSA adjective-derivational suffix, EP pre-final ending, EF sentence-final ending, EC connective ending, ETN nominalizing ending, ETM adnominalizing ending
+
+     pdoc = []
+     morphs = []
+
+     poss = mecab.pos(line)
+     for pos in poss:
+         morphs.append(pos[0])
+     '''
+     pdoc.append(" ".join(morphs))
+     return pdoc
+     '''
+     return " ".join(morphs)
+
+ logger = logging.get_logger(__name__)
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
+         "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
+         "microsoft/deberta-v2-xlarge-mnli": (
+             "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model"
+         ),
+         "microsoft/deberta-v2-xxlarge-mnli": (
+             "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model"
+         ),
+     }
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+     "microsoft/deberta-v2-xlarge": 512,
+     "microsoft/deberta-v2-xxlarge": 512,
+     "microsoft/deberta-v2-xlarge-mnli": 512,
+     "microsoft/deberta-v2-xxlarge-mnli": 512,
+ }
+
+ PRETRAINED_INIT_CONFIGURATION = {
+     "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
+     "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
+     "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
+     "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
+ }
+
+ VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
+
+
+ class DebertaV2Tokenizer(PreTrainedTokenizer):
+     r"""
+     Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+     Args:
+         vocab_file (`str`):
+             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+             contains the vocabulary necessary to instantiate a tokenizer.
+         do_lower_case (`bool`, *optional*, defaults to `False`):
+             Whether or not to lowercase the input when tokenizing.
+         bos_token (`string`, *optional*, defaults to `"[CLS]"`):
+             The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
+             When building a sequence using special tokens, this is not the token that is used for the beginning of
+             sequence. The token used is the `cls_token`.
+         eos_token (`string`, *optional*, defaults to `"[SEP]"`):
+             The end of sequence token. When building a sequence using special tokens, this is not the token that is
+             used for the end of sequence. The token used is the `sep_token`.
+         unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+             sequence classification or for a text and a question for question answering. It is also used as the last
+             token of a sequence built with special tokens.
+         pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+             The classifier token which is used when doing sequence classification (classification of the whole sequence
+             instead of per-token classification). It is the first token of the sequence when built with special tokens.
+         mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+             The token used for masking values. This is the token used when training this model with masked language
+             modeling. This is the token which the model will try to predict.
+         sp_model_kwargs (`dict`, *optional*):
+             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+             to set:
+
+             - `enable_sampling`: Enable subword regularization.
+             - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+               - `nbest_size = {0,1}`: No sampling is performed.
+               - `nbest_size > 1`: samples from the nbest_size results.
+               - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+                 using the forward-filtering-and-backward-sampling algorithm.
+
+             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+               BPE-dropout.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+     def __init__(
+         self,
+         vocab_file,
+         do_lower_case=False,
+         split_by_punct=False,
+         bos_token="[CLS]",
+         eos_token="[SEP]",
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs,
+     ) -> None:
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+         if not os.path.isfile(vocab_file):
+             raise ValueError(
+                 f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                 " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+             )
+         self.do_lower_case = do_lower_case
+         self.split_by_punct = split_by_punct
+         self.vocab_file = vocab_file
+         self._tokenizer = SPMTokenizer(
+             vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
+         )
+         unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token
+         super().__init__(
+             do_lower_case=do_lower_case,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             split_by_punct=split_by_punct,
+             sp_model_kwargs=self.sp_model_kwargs,
+             **kwargs,
+         )
+         self._tokenizer.special_tokens = self.all_special_tokens
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
+
+     @property
+     def vocab(self):
+         return self._tokenizer.vocab
+
+     def get_vocab(self):
+         vocab = self.vocab.copy()
+         vocab.update(self.get_added_vocab())
+         return vocab
+
+     def _tokenize(self, text: str) -> List[str]:
+         """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+         if self.do_lower_case:
+             text = text.lower()
+         return self._tokenizer.tokenize(text)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         return self._tokenizer.spm.PieceToId(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) into a single string."""
+         return self._tokenizer.decode(tokens)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+         adding special tokens. A DeBERTa sequence has the following format:
+
+         - single sequence: [CLS] X [SEP]
+         - pair of sequences: [CLS] A [SEP] B [SEP]
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+         """
+         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is not None:
+             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+         sequence pair mask has the following format:
+
+         ```
+         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+         | first sequence    | second sequence |
+         ```
+
+         If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+         add_prefix_space = kwargs.pop("add_prefix_space", False)
+         if is_split_into_words or add_prefix_space:
+             text = " " + text
+         return (text, kwargs)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)
+
+
+ class SPMTokenizer:
+     r"""
+     Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
+
+     Args:
+         vocab_file (`str`):
+             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+             contains the vocabulary necessary to instantiate a tokenizer.
+         sp_model_kwargs (`dict`, *optional*):
+             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+             to set:
+
+             - `enable_sampling`: Enable subword regularization.
+             - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+               - `nbest_size = {0,1}`: No sampling is performed.
+               - `nbest_size > 1`: samples from the nbest_size results.
+               - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+                 using the forward-filtering-and-backward-sampling algorithm.
+
+             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+               BPE-dropout.
+     """
+
+     def __init__(
+         self, vocab_file, special_tokens, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None
+     ):
+         self.split_by_punct = split_by_punct
+         self.vocab_file = vocab_file
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+         spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+         if not os.path.exists(vocab_file):
+             raise FileNotFoundError(f"{vocab_file} does not exist!")
+         spm.load(vocab_file)
+         bpe_vocab_size = spm.GetPieceSize()
+         # Token map
+         # <unk> 0+1
+         # <s> 1+1
+         # </s> 2+1
+         self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
+         self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
+         # self.vocab['[PAD]'] = 0
+         # self.vocab['[CLS]'] = 1
+         # self.vocab['[SEP]'] = 2
+         # self.vocab['[UNK]'] = 3
+
+         self.spm = spm
+         self.special_tokens = special_tokens
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["spm"] = None
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+
+         # for backward compatibility
+         if not hasattr(self, "sp_model_kwargs"):
+             self.sp_model_kwargs = {}
+
+         self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.spm.Load(self.vocab_file)
+
+     def tokenize(self, text):
+         text = _replace_unicode(text)  # Unicode normalization
+         text = _mecab(text)  # morpheme segmentation
+         return self._encode_as_pieces(text)
+
+     def convert_ids_to_tokens(self, ids):
+         tokens = []
+         for i in ids:
+             tokens.append(self.ids_to_tokens[i])
+         return tokens
+
+     def decode(self, tokens, start=-1, end=-1, raw_text=None):
+         if raw_text is None:
+             current_sub_tokens = []
+             out_string = ""
+             prev_is_special = False
+             for token in tokens:
+                 # make sure that special tokens are not decoded using sentencepiece model
+                 if token in self.special_tokens:
+                     if not prev_is_special:
+                         out_string += " "
+                     out_string += self.spm.decode_pieces(current_sub_tokens) + token
+                     prev_is_special = True
+                     current_sub_tokens = []
+                 else:
+                     current_sub_tokens.append(token)
+                     prev_is_special = False
+             out_string += self.spm.decode_pieces(current_sub_tokens)
+             return out_string.strip()
+         else:
+             words = self.split_to_words(raw_text)
+             word_tokens = [self.tokenize(w) for w in words]
+             token2words = [0] * len(tokens)
+             tid = 0
+             for i, w in enumerate(word_tokens):
+                 for k, t in enumerate(w):
+                     token2words[tid] = i
+                     tid += 1
+             word_start = token2words[start]
+             word_end = token2words[end] if end < len(tokens) else len(words)
+             text = "".join(words[word_start:word_end])
+             return text
+
+     # TODO add a deprecation cycle as this can have different behaviour from our API
+     def add_special_token(self, token):
+         if token not in self.special_tokens:
+             self.special_tokens.append(token)
+             if token not in self.vocab:
+                 self.vocab[token] = len(self.vocab) - 1
+                 self.ids_to_tokens.append(token)
+         return self.id(token)
+
+     def part_of_whole_word(self, token, is_bos=False):
+         logger.warning_once(
+             "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
+         )
+         if is_bos:
+             return True
+         if (
+             len(token) == 1
+             and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))
+         ) or token in self.special_tokens:
+             return False
+
+         word_start = b"\xe2\x96\x81".decode("utf-8")
+         return not token.startswith(word_start)
+
+     def pad(self):
+         return "[PAD]"
+
+     def bos(self):
+         return "[CLS]"
+
+     def eos(self):
+         return "[SEP]"
+
+     def unk(self):
+         return "[UNK]"
+
+     def mask(self):
+         return "[MASK]"
+
+     def sym(self, id):
+         return self.ids_to_tokens[id]
+
+     def id(self, sym):
+         logger.warning_once(
+             "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
+         )
+         return self.vocab[sym] if sym in self.vocab else 1
+
+     def _encode_as_pieces(self, text):
+         text = convert_to_unicode(text)
+         if self.split_by_punct:
+             words = self._run_split_on_punc(text)
+             pieces = [self.spm.encode(w, out_type=str) for w in words]
+             return [p for w in pieces for p in w]
+         else:
+             return self.spm.encode(text, out_type=str)
+
+     def split_to_words(self, text):
+         pieces = self._encode_as_pieces(text)
+         word_start = b"\xe2\x96\x81".decode("utf-8")
+         words = []
+         offset = 0
+         prev_end = 0
+         for i, p in enumerate(pieces):
+             if p.startswith(word_start):
+                 if offset > prev_end:
+                     words.append(text[prev_end:offset])
+                     prev_end = offset
+                 w = p.replace(word_start, "")
+             else:
+                 w = p
+             try:
+                 s = text.index(w, offset)
+                 pn = ""
+                 k = i + 1
+                 while k < len(pieces):
+                     pn = pieces[k].replace(word_start, "")
+                     if len(pn) > 0:
+                         break
+                     k += 1
+
+                 if len(pn) > 0 and pn in text[offset:s]:
+                     offset = offset + 1
+                 else:
+                     offset = s + len(w)
+             except Exception:
+                 offset = offset + 1
+
+         if prev_end < offset:
+             words.append(text[prev_end:offset])
+
+         return words
+
+     def _run_split_on_punc(self, text):
+         """Splits punctuation on a piece of text."""
+         chars = list(text)
+         i = 0
+         start_new_word = True
+         output = []
+         while i < len(chars):
+             char = chars[i]
+             if _is_punctuation(char):
+                 output.append([char])
+                 start_new_word = True
+             else:
+                 if start_new_word:
+                     output.append([])
+                 start_new_word = False
+                 output[-1].append(char)
+             i += 1
+
+         return ["".join(x) for x in output]
+
+     def save_pretrained(self, path: str, filename_prefix: str = None):
+         filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
+         if filename_prefix is not None:
+             filename = filename_prefix + "-" + filename
+         full_path = os.path.join(path, filename)
+         with open(full_path, "wb") as fs:
+             fs.write(self.spm.serialized_model_proto())
+         return (full_path,)
+
+
+ def _is_whitespace(char):
+     """Checks whether `char` is a whitespace character."""
+     # \t, \n, and \r are technically control characters but we treat them
+     # as whitespace since they are generally considered as such.
+     if char == " " or char == "\t" or char == "\n" or char == "\r":
+         return True
+     cat = unicodedata.category(char)
+     if cat == "Zs":
+         return True
+     return False
+
+
+ def _is_control(char):
+     """Checks whether `char` is a control character."""
+     # These are technically control characters but we count them as whitespace
+     # characters.
+     if char == "\t" or char == "\n" or char == "\r":
+         return False
+     cat = unicodedata.category(char)
+     if cat.startswith("C"):
+         return True
+     return False
+
+
+ def _is_punctuation(char):
+     """Checks whether `char` is a punctuation character."""
+     cp = ord(char)
+     # We treat all non-letter/number ASCII as punctuation.
+     # Characters such as "^", "$", and "`" are not in the Unicode
+     # Punctuation class but we treat them as punctuation anyways, for
+     # consistency.
+     if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+         return True
+     cat = unicodedata.category(char)
+     if cat.startswith("P"):
+         return True
+     return False
+
+
+ def convert_to_unicode(text):
+     """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+     if isinstance(text, str):
+         return text
+     elif isinstance(text, bytes):
+         return text.decode("utf-8", "ignore")
+     else:
+         raise ValueError(f"Unsupported string type: {type(text)}")
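
A minimal usage sketch for the uploaded tokenizer (illustrative only, outside the diff above): it assumes the extra dependencies imported at the top of the file are available (`konlpy` with a working Mecab installation, plus the repo-local `unicode.py` and `normalize.py` modules) and that a trained SentencePiece model exists at the placeholder path `spm.model`; the sample sentence is likewise a placeholder.

```python
# Hedged sketch: the model path and sample sentence are placeholders, and this
# assumes konlpy/Mecab and the repo's unicode.py / normalize.py are importable.
from tokenization_korscideberta_v2 import DebertaV2Tokenizer

tokenizer = DebertaV2Tokenizer(vocab_file="spm.model")  # local SentencePiece model file

text = "한국어 과학기술 논문 텍스트를 토큰화한다."
tokens = tokenizer.tokenize(text)              # Unicode normalization + Mecab split, then SentencePiece pieces
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tokenizer.build_inputs_with_special_tokens(ids)  # [CLS] ... [SEP]

print(tokens)
print(input_ids)
```

If the model repository's `tokenizer_config.json` maps `AutoTokenizer` to this class, the same tokenizer should also be loadable with `AutoTokenizer.from_pretrained(<repo_id>, trust_remote_code=True)`.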