helboukkouri committed on
Commit
9e9c634
·
1 Parent(s): 925f56a

Create tokenization_character_bert.py

Browse files
Files changed (1) hide show
  1. tokenization_character_bert.py +930 -0
tokenization_character_bert.py ADDED
@@ -0,0 +1,930 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright Hicham EL BOUKKOURI, Olivier FERRET, Thomas LAVERGNE, Hiroshi NOJI,
3
+ # Pierre ZWEIGENBAUM, Junichi TSUJII and The HuggingFace Inc. team.
4
+ # All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """Tokenization classes for CharacterBERT."""
18
+ import json
19
+ import os
20
+ import unicodedata
21
+ from collections import OrderedDict
22
+ from typing import Dict, List, Optional, Tuple, Union
23
+
24
+ import numpy as np
25
+
26
+ from transformers.file_utils import is_tf_available, is_torch_available, to_py_obj
27
+ from transformers.tokenization_utils import (
28
+ BatchEncoding,
29
+ EncodedInput,
30
+ PaddingStrategy,
31
+ PreTrainedTokenizer,
32
+ TensorType,
33
+ _is_control,
34
+ _is_punctuation,
35
+ _is_whitespace,
36
+ )
37
+ from transformers.tokenization_utils_base import ADDED_TOKENS_FILE
38
+ from transformers.utils import logging
39
+
40
+
41
+ logger = logging.get_logger(__name__)
42
+
43
# Maps the tokenizer's vocabulary argument name to the file name it is stored under.
VOCAB_FILES_NAMES = dict(mlm_vocab_file="mlm_vocab.txt")

# Model identifiers shared as keys by every PRETRAINED_* table below.
_PRETRAINED_MODEL_NAMES = (
    "helboukkouri/character-bert",
    "helboukkouri/character-bert-medical",
)

# URL of the MLM vocabulary file for each model identifier.
PRETRAINED_VOCAB_FILES_MAP = {
    "mlm_vocab_file": {
        name: "https://huggingface.co/" + name + "/resolve/main/mlm_vocab.txt"
        for name in _PRETRAINED_MODEL_NAMES
    }
}

# Exposed on the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _PRETRAINED_MODEL_NAMES}

# Default constructor arguments for each model identifier.
PRETRAINED_INIT_CONFIGURATION = {
    name: {"max_word_length": 50, "do_lower_case": True} for name in _PRETRAINED_MODEL_NAMES
}

PAD_TOKEN_CHAR_ID = 0
65
+
66
+
67
def whitespace_tokenize(text):
    """Strip `text` and split it on whitespace; blank input yields an empty list."""
    stripped = text.strip()
    return stripped.split() if stripped else []
74
+
75
+
76
def build_mlm_ids_to_tokens_mapping(mlm_vocab_file):
    """
    Read a Masked Language Modeling vocabulary file into an id -> token mapping.

    Args:
        mlm_vocab_file (`str`):
            Path to a UTF-8 text file holding one token per line. Lines are stripped and blank lines are ignored.

    Returns:
        `OrderedDict`: Tokens keyed by their 0-based rank among the non-blank lines.
    """
    mapping = OrderedDict()
    with open(mlm_vocab_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            token = raw_line.strip()
            if not token:
                continue
            # Ids are consecutive, so the next id is the current size.
            mapping[len(mapping)] = token
    return mapping
85
+
86
+
87
+ class CharacterBertTokenizer(PreTrainedTokenizer):
88
+ """
89
+ Construct a CharacterBERT tokenizer. Based on characters.
90
+
91
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
92
+ Users should refer to this superclass for more information regarding those methods.
93
+
94
+ Args:
95
+ mlm_vocab_file (`str`, *optional*, defaults to `None`):
96
+ Path to the Masked Language Modeling vocabulary. This is used for converting the output (token ids) of the
97
+ MLM model into tokens.
98
+ max_word_length (`int`, *optional*, defaults to `50`):
99
+ The maximum token length in characters (actually, in bytes as any non-ascii characters will be converted to
100
+ a sequence of utf-8 bytes).
101
+ do_lower_case (`bool`, *optional*, defaults to `True`):
102
+ Whether or not to lowercase the input when tokenizing.
103
+ do_basic_tokenize (`bool`, *optional*, defaults to `True`):
104
+ Whether or not to run basic tokenization (via `BasicTokenizer`). When `False`, the text is only split on whitespace.
105
+ never_split (`Iterable`, *optional*):
106
+ Collection of tokens which will never be split during tokenization. Only has an effect when
107
+ `do_basic_tokenize=True`
108
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
109
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
110
+ token instead.
111
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
112
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
113
+ sequence classification or for a text and a question for question answering. It is also used as the last
114
+ token of a sequence built with special tokens.
115
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
116
+ The token used for padding, for example when batching sequences of different lengths.
117
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
118
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
119
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
120
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
121
+ The token used for masking values. This is the token used when training this model with masked language
122
+ modeling. This is the token which the model will try to predict.
123
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
124
+ Whether or not to tokenize Chinese characters.
125
+ strip_accents: (`bool`, *optional*):
126
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
127
+ value for `do_lower_case` (as in the original BERT).
128
+ """
129
+
130
+ vocab_files_names = VOCAB_FILES_NAMES
131
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
132
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
133
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
134
+
135
def __init__(
    self,
    mlm_vocab_file=None,
    max_word_length=50,
    do_lower_case=True,
    do_basic_tokenize=True,
    never_split=None,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    tokenize_chinese_chars=True,
    strip_accents=None,
    **kwargs
):
    """
    Set up the tokenizer: base-class state, optional MLM vocabulary, basic tokenizer and character mapper.

    Args:
        mlm_vocab_file (`str`, *optional*):
            Path to the MLM vocabulary. When `None`, `ids_to_tokens` is left as `None`.
        max_word_length (`int`, *optional*, defaults to `50`):
            Forwarded to `CharacterMapper`.
        do_lower_case, never_split, tokenize_chinese_chars, strip_accents:
            Forwarded to `BasicTokenizer` when `do_basic_tokenize` is true.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether a `BasicTokenizer` is created at all.
        unk_token, sep_token, pad_token, cls_token, mask_token (`str`):
            Special tokens, forwarded to the base class.

    Raises:
        ValueError: If `mlm_vocab_file` is given but is not an existing file.
    """
    super().__init__(
        max_word_length=max_word_length,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        never_split=never_split,
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        tokenize_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        **kwargs,
    )
    # Special tokens must survive tokenization as single, unsplit tokens.
    self.unique_no_split_tokens = [self.cls_token, self.mask_token, self.pad_token, self.sep_token, self.unk_token]

    # Optional id -> token table, only needed to decode MLM predictions.
    if mlm_vocab_file is None:
        mlm_mapping = None
    elif os.path.isfile(mlm_vocab_file):
        mlm_mapping = build_mlm_ids_to_tokens_mapping(mlm_vocab_file)
    else:
        raise ValueError(
            f"Can't find a vocabulary file at path '{mlm_vocab_file}'. "
            "To load the vocabulary from a pretrained model use "
            "`tokenizer = CharacterBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )
    self.ids_to_tokens = mlm_mapping

    # Text -> tokens: delegated to BasicTokenizer when enabled.
    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = BasicTokenizer(
            do_lower_case=do_lower_case,
            never_split=never_split,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
        )

    # Tokens -> character ids: delegated to CharacterMapper.
    self.max_word_length = max_word_length
    self._mapper = CharacterMapper(max_word_length=max_word_length)
190
+
191
def __repr__(self) -> str:
    """
    Build a printable summary of the tokenizer.

    Overridden because CharacterBERT has no `vocab_size`; the MLM vocabulary size is reported instead, and only
    when a non-empty MLM vocabulary is loaded.
    """
    pieces = [f"CharacterBertTokenizer(name_or_path='{self.name_or_path}', "]
    if self.ids_to_tokens:
        pieces.append(f"mlm_vocab_size={self.mlm_vocab_size}, ")
    pieces.append(f"model_max_len={self.model_max_length}, is_fast={self.is_fast}, ")
    pieces.append(f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})")
    return "".join(pieces)
199
+
200
def __len__(self):
    """
    Number of added tokens.

    There is no base token vocabulary, so only `added_tokens_encoder` contributes to the size.
    """
    return len(self.added_tokens_encoder)
206
+
207
@property
def do_lower_case(self):
    """Lower-casing flag, read from the underlying basic tokenizer."""
    # NOTE(review): `basic_tokenizer` is only created in `__init__` when
    # `do_basic_tokenize` is true, so this presumably raises AttributeError
    # otherwise -- confirm whether that is intended.
    tokenizer = self.basic_tokenizer
    return tokenizer.do_lower_case
210
+
211
@property
def vocab_size(self):
    """Always raises: there is no token vocabulary whose size could be reported."""
    message = "CharacterBERT does not use a token vocabulary."
    raise NotImplementedError(message)
214
+
215
@property
def mlm_vocab_size(self):
    """
    Number of entries in the MLM vocabulary.

    Raises:
        ValueError: If no MLM vocabulary was loaded (`ids_to_tokens` is `None`).
    """
    mapping = self.ids_to_tokens
    if mapping is not None:
        return len(mapping)
    raise ValueError(
        "CharacterBertTokenizer was initialized without a MLM "
        "vocabulary. You can either pass one manually or load a "
        "pre-trained tokenizer using: "
        "`tokenizer = CharacterBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
    )
225
+
226
def add_special_tokens(self, *args, **kwargs):
    """Always raises: adding special tokens is not supported. Arguments are ignored."""
    message = "Adding special tokens is not supported for now."
    raise NotImplementedError(message)
228
+
229
def add_tokens(self, *args, **kwargs):
    """
    Accept and ignore any request to add tokens.

    This is a deliberate no-op rather than an error, so that inherited methods which call it keep working
    without being re-implemented.
    """
    return None
233
+
234
def get_vocab(self):
    """Always raises: there is no token vocabulary to return."""
    message = "CharacterBERT does not have a token vocabulary."
    raise NotImplementedError(message)
236
+
237
def get_mlm_vocab(self):
    """
    Return the MLM vocabulary as a token -> id dictionary (the inverse of `ids_to_tokens`).

    Returns:
        `Dict[str, int]`: Mapping from each MLM token to its id.

    Raises:
        ValueError: If no MLM vocabulary was loaded (`ids_to_tokens` is `None`).
    """
    # Fail with the same explicit error as the other MLM accessors instead of
    # an AttributeError on `None.items()`.
    if self.ids_to_tokens is None:
        raise ValueError(
            "CharacterBertTokenizer was initialized without a MLM "
            "vocabulary. You can either pass one manually or load a "
            "pre-trained tokenizer using: "
            "`tokenizer = CharacterBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )
    return {token: i for i, token in self.ids_to_tokens.items()}
239
+
240
def _tokenize(self, text):
    """
    Split `text` into tokens.

    Uses the basic tokenizer (never splitting any special token) when `do_basic_tokenize` is set, and plain
    whitespace splitting otherwise.
    """
    if not self.do_basic_tokenize:
        return whitespace_tokenize(text)
    return self.basic_tokenizer.tokenize(text=text, never_split=self.all_special_tokens)
247
+
248
def convert_tokens_to_string(self, tokens):
    """Join a sequence of tokens (`str`) with single spaces and strip the ends of the result."""
    return " ".join(tokens).strip()
252
+
253
def _convert_token_to_id(self, token):
    """Convert a token (`str`) into its character ids by delegating to the character mapper."""
    mapper = self._mapper
    return mapper.convert_word_to_char_ids(token)
256
+
257
def _convert_id_to_token(self, index: List[int]):
    """
    Convert a sequence of character ids into a token (`str`) by delegating to the character mapper.

    The parameter keeps the base-class name `index`, although here it is always a sequence of indices.
    """
    mapper = self._mapper
    return mapper.convert_char_ids_to_word(index)
262
+
263
def convert_ids_to_tokens(
    self, ids: Union[List[int], List[List[int]]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
    """
    Converts a single sequence of character indices or a sequence of character id sequences in a token or a
    sequence of tokens.

    Args:
        ids (`List[int]` or `List[List[int]]`):
            The character ids of one token, or a list of such character id sequences.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding. Only applies when a list of character id
            sequences is given.

    Returns:
        `str` or `List[str]`: The decoded token(s). An empty list is returned for empty input.
    """

    def as_key(value):
        # Hashable form of a character id sequence; a scalar id becomes a 1-tuple.
        try:
            return tuple(int(v) for v in value)
        except TypeError:
            return (value,)

    if isinstance(ids, list):
        if not ids:
            # Nothing to decode; also avoids indexing into an empty list.
            return []
        if isinstance(ids[0], int):
            # A flat list of ints is the character ids of ONE token.
            key = tuple(ids)
            if key in self.added_tokens_decoder:
                return self.added_tokens_decoder[key]
            return self._convert_id_to_token(ids)

    # A tuple never compares equal to a list, so special ids are normalized
    # to tuples before the membership test.
    special_keys = {as_key(special) for special in self.all_special_ids} if skip_special_tokens else set()

    tokens = []
    for indices in ids:
        indices = list(map(int, indices))
        key = tuple(indices)
        if key in special_keys:
            continue
        if key in self.added_tokens_decoder:
            tokens.append(self.added_tokens_decoder[key])
        else:
            tokens.append(self._convert_id_to_token(indices))
    return tokens
294
+
295
def convert_mlm_id_to_token(self, mlm_id):
    """
    Look up the token (`str`) for an MLM id (`int`) in the MLM vocabulary.

    Raises:
        ValueError: If no MLM vocabulary was loaded (`ids_to_tokens` is `None`).
        AssertionError: If `mlm_id` is not smaller than the MLM vocabulary size.
    """
    mapping = self.ids_to_tokens
    if mapping is None:
        raise ValueError(
            "CharacterBertTokenizer was initialized without a MLM "
            "vocabulary. You can either pass one manually or load a "
            "pre-trained tokenizer using: "
            "`tokenizer = CharacterBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )
    too_large = "Attempting to convert a MLM id that is greater than the MLM vocabulary size."
    assert mlm_id < self.mlm_vocab_size, too_large
    return self.ids_to_tokens[mlm_id]
308
+
309
def build_inputs_with_special_tokens(
    self, token_ids_0: List[List[int]], token_ids_1: Optional[List[List[int]]] = None
) -> List[List[int]]:
    """
    Wrap one sequence, or a pair of sequences, with the CLS/SEP special tokens:

    - single sequence: `[CLS] X [SEP]`
    - pair of sequences: `[CLS] A [SEP] B [SEP]`

    Args:
        token_ids_0 (`List[List[int]]`):
            Ids of the first sequence (one character id list per token).
        token_ids_1 (`List[List[int]]`, *optional*):
            Ids of the optional second sequence.

    Returns:
        `List[List[int]]`: A new list of [input IDs](../glossary#input-ids) including the special tokens.
    """
    cls_ids = self.cls_token_id
    sep_ids = self.sep_token_id
    sequence = [cls_ids] + token_ids_0 + [sep_ids]
    if token_ids_1 is not None:
        sequence = sequence + token_ids_1 + [sep_ids]
    return sequence
333
+
334
def get_special_tokens_mask(
    self,
    token_ids_0: List[List[int]],
    token_ids_1: Optional[List[List[int]]] = None,
    already_has_special_tokens: bool = False,
) -> List[int]:
    """
    Build a mask flagging which positions hold special tokens.

    Args:
        token_ids_0 (`List[List[int]]`):
            Ids of the first sequence.
        token_ids_1 (`List[List[int]]`, *optional*):
            Ids of the optional second sequence.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether `token_ids_0` already contains the special tokens. In that case only entries equal to the
            SEP or CLS ids are flagged.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

    Raises:
        ValueError: If `already_has_special_tokens` is set and `token_ids_1` is also given.
    """
    if already_has_special_tokens:
        if token_ids_1 is not None:
            raise ValueError(
                "You should not supply a second sequence if the provided sequence of "
                "ids is already formatted with special tokens for the model."
            )
        return [1 if ids in [self.sep_token_id, self.cls_token_id] else 0 for ids in token_ids_0]

    # Layout without pre-existing special tokens: [CLS] A [SEP] (B [SEP]).
    mask = [1] + [0] * len(token_ids_0) + [1]
    if token_ids_1 is not None:
        mask += [0] * len(token_ids_1) + [1]
    return mask
366
+
367
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[List[int]], token_ids_1: Optional[List[List[int]]] = None
) -> List[int]:
    """
    Build the token type ids (segment ids) for one sequence or a pair of sequences:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    The first segment covers `[CLS] A [SEP]` (0s), the second covers `B [SEP]` (1s). If `token_ids_1` is
    `None`, only the first portion of the mask (0s) is returned.

    Args:
        token_ids_0 (`List[List[int]]`):
            Ids of the first sequence.
        token_ids_1 (`List[List[int]]`, *optional*):
            Ids of the optional second sequence.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    sep_ids = [self.sep_token_id]
    cls_ids = [self.cls_token_id]
    type_ids = [0] * len(cls_ids + token_ids_0 + sep_ids)
    if token_ids_1 is not None:
        type_ids.extend([1] * len(token_ids_1 + sep_ids))
    return type_ids
395
+
396
+ # def pad(
397
+ # self,
398
+ # encoded_inputs: Union[
399
+ # BatchEncoding,
400
+ # List[BatchEncoding],
401
+ # Dict[str, EncodedInput],
402
+ # Dict[str, List[EncodedInput]],
403
+ # List[Dict[str, EncodedInput]],
404
+ # ],
405
+ # padding: Union[bool, str, PaddingStrategy] = True,
406
+ # max_length: Optional[int] = None,
407
+ # pad_to_multiple_of: Optional[int] = None,
408
+ # return_attention_mask: Optional[bool] = None,
409
+ # return_tensors: Optional[Union[str, TensorType]] = None,
410
+ # verbose: bool = True,
411
+ # ) -> BatchEncoding:
412
+ # """
413
+ # Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
414
+ # in the batch.
415
+
416
+ # Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
417
+ # `self.pad_token_id` and `self.pad_token_type_id`)
418
+
419
+ # <Tip>
420
+
421
+ # If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
422
+ # result will use the same type unless you provide a different tensor type with `return_tensors`. In the
423
+ # case of PyTorch tensors, you will lose the specific device of your tensors however.
424
+
425
+ # </Tip>
426
+
427
+ # Args:
428
+ # encoded_inputs (:
429
+ # class:*~transformers.BatchEncoding*, list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): Tokenized inputs.
430
+ # Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a
431
+ # batch of tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]*
432
+ # or *List[Dict[str, List[int]]]*) so you can use this method during preprocessing as well as in a
433
+ # PyTorch Dataloader collate function.
434
+
435
+ # Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
436
+ # see the note above for the return type.
437
+ # padding (:
438
+ # obj:*bool*, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to
439
+ # `True`): Select a strategy to pad the returned sequences (according to the model's padding side
440
+ # and padding index) among:
441
+
442
+ # - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
443
+ # single sequence if provided).
444
+ # - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
445
+ # maximum acceptable input length for the model if that argument is not provided.
446
+ # - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
447
+ # different lengths).
448
+ # max_length (`int`, *optional*):
449
+ # Maximum length of the returned list and optionally padding length (see above).
450
+ # pad_to_multiple_of (`int`, *optional*):
451
+ # If set will pad the sequence to a multiple of the provided value.
452
+
453
+ # This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
454
+ # >= 7.5 (Volta).
455
+ # return_attention_mask (`bool`, *optional*):
456
+ # Whether to return the attention mask. If left to the default, will return the attention mask according
457
+ # to the specific tokenizer's default, defined by the `return_outputs` attribute.
458
+
459
+ # [What are attention masks?](../glossary#attention-mask)
460
+ # return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
461
+ # If set, will return tensors instead of list of python integers. Acceptable values are:
462
+
463
+ # - `'tf'`: Return TensorFlow `tf.constant` objects.
464
+ # - `'pt'`: Return PyTorch `torch.Tensor` objects.
465
+ # - `'np'`: Return Numpy `np.ndarray` objects.
466
+ # verbose (`bool`, *optional*, defaults to `True`):
467
+ # Whether or not to print more information and warnings.
468
+ # """
469
+ # # If we have a list of dicts, let's convert it in a dict of lists
470
+ # # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
471
+ # if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
472
+ # encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
473
+
474
+ # # The model's main input name, usually `input_ids`, has be passed for padding
475
+ # if self.model_input_names[0] not in encoded_inputs:
476
+ # raise ValueError(
477
+ # "You should supply an encoding or a list of encodings to this method "
478
+ # f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
479
+ # )
480
+
481
+ # required_input = encoded_inputs[self.model_input_names[0]]
482
+
483
+ # if not required_input:
484
+ # if return_attention_mask:
485
+ # encoded_inputs["attention_mask"] = []
486
+ # return encoded_inputs
487
+
488
+ # # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
489
+ # # and rebuild them afterwards if no return_tensors is specified
490
+ # # Note that we lose the specific device the tensor may be on for PyTorch
491
+
492
+ # first_element = required_input[0]
493
+ # if isinstance(first_element, (list, tuple)):
494
+ # # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
495
+ # index = 0
496
+ # while len(required_input[index]) == 0:
497
+ # index += 1
498
+ # if index < len(required_input):
499
+ # first_element = required_input[index][0]
500
+ # # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
501
+ # if not isinstance(first_element, (int, list, tuple)):
502
+ # if is_tf_available() and _is_tensorflow(first_element):
503
+ # return_tensors = "tf" if return_tensors is None else return_tensors
504
+ # elif is_torch_available() and _is_torch(first_element):
505
+ # return_tensors = "pt" if return_tensors is None else return_tensors
506
+ # elif isinstance(first_element, np.ndarray):
507
+ # return_tensors = "np" if return_tensors is None else return_tensors
508
+ # else:
509
+ # raise ValueError(
510
+ # f"type of {first_element} unknown: {type(first_element)}. "
511
+ # f"Should be one of a python, numpy, pytorch or tensorflow object."
512
+ # )
513
+
514
+ # for key, value in encoded_inputs.items():
515
+ # encoded_inputs[key] = to_py_obj(value)
516
+
517
+ # # Convert padding_strategy in PaddingStrategy
518
+ # padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
519
+ # padding=padding, max_length=max_length, verbose=verbose
520
+ # )
521
+
522
+ # required_input = encoded_inputs[self.model_input_names[0]]
523
+ # if required_input and not isinstance(required_input[0][0], (list, tuple)):
524
+ # encoded_inputs = self._pad(
525
+ # encoded_inputs,
526
+ # max_length=max_length,
527
+ # padding_strategy=padding_strategy,
528
+ # pad_to_multiple_of=pad_to_multiple_of,
529
+ # return_attention_mask=return_attention_mask,
530
+ # )
531
+ # return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
532
+
533
+ # batch_size = len(required_input)
534
+ # assert all(
535
+ # len(v) == batch_size for v in encoded_inputs.values()
536
+ # ), "Some items in the output dictionary have a different batch size than others."
537
+
538
+ # if padding_strategy == PaddingStrategy.LONGEST:
539
+ # max_length = max(len(inputs) for inputs in required_input)
540
+ # padding_strategy = PaddingStrategy.MAX_LENGTH
541
+
542
+ # batch_outputs = {}
543
+ # for i in range(batch_size):
544
+ # inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
545
+ # outputs = self._pad(
546
+ # inputs,
547
+ # max_length=max_length,
548
+ # padding_strategy=padding_strategy,
549
+ # pad_to_multiple_of=pad_to_multiple_of,
550
+ # return_attention_mask=return_attention_mask,
551
+ # )
552
+
553
+ # for key, value in outputs.items():
554
+ # if key not in batch_outputs:
555
+ # batch_outputs[key] = []
556
+ # batch_outputs[key].append(value)
557
+
558
+ # return BatchEncoding(batch_outputs, tensor_type=return_tensors)
559
+
560
+ # def _pad(
561
+ # self,
562
+ # encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
563
+ # max_length: Optional[int] = None,
564
+ # padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
565
+ # pad_to_multiple_of: Optional[int] = None,
566
+ # return_attention_mask: Optional[bool] = None,
567
+ # ) -> dict:
568
+ # """
569
+ # Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
570
+
571
+ # Args:
572
+ # encoded_inputs:
573
+ # Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
574
+ # max_length: maximum length of the returned list and optionally padding length (see below).
575
+ # Will truncate by taking into account the special tokens.
576
+ # padding_strategy: PaddingStrategy to use for padding.
577
+
578
+ # - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
579
+ # - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
580
+ # - PaddingStrategy.DO_NOT_PAD: Do not pad
581
+ # The tokenizer padding sides are defined in self.padding_side:
582
+
583
+ # - 'left': pads on the left of the sequences
584
+ # - 'right': pads on the right of the sequences
585
+ # pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
586
+ # This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
587
+ # >= 7.5 (Volta).
588
+ # return_attention_mask:
589
+ # (optional) Set to False to avoid returning attention mask (default: set to model specifics)
590
+ # """
591
+ # # Load from model defaults
592
+ # if return_attention_mask is None:
593
+ # return_attention_mask = "attention_mask" in self.model_input_names
594
+
595
+ # required_input = encoded_inputs[self.model_input_names[0]]
596
+
597
+ # if padding_strategy == PaddingStrategy.LONGEST:
598
+ # max_length = len(required_input)
599
+
600
+ # if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
601
+ # max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
602
+
603
+ # needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
604
+
605
+ # if needs_to_be_padded:
606
+ # difference = max_length - len(required_input)
607
+ # if self.padding_side == "right":
608
+ # if return_attention_mask:
609
+ # encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
610
+ # if "token_type_ids" in encoded_inputs:
611
+ # encoded_inputs["token_type_ids"] = (
612
+ # encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
613
+ # )
614
+ # if "special_tokens_mask" in encoded_inputs:
615
+ # encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
616
+ # encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
617
+ # elif self.padding_side == "left":
618
+ # if return_attention_mask:
619
+ # encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
620
+ # if "token_type_ids" in encoded_inputs:
621
+ # encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
622
+ # "token_type_ids"
623
+ # ]
624
+ # if "special_tokens_mask" in encoded_inputs:
625
+ # encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
626
+ # encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
627
+ # else:
628
+ # raise ValueError("Invalid padding strategy:" + str(self.padding_side))
629
+ # elif return_attention_mask and "attention_mask" not in encoded_inputs:
630
+ # if isinstance(encoded_inputs["token_type_ids"], list):
631
+ # encoded_inputs["attention_mask"] = [1] * len(required_input)
632
+ # else:
633
+ # encoded_inputs["attention_mask"] = 1
634
+
635
+ # return encoded_inputs
636
+
637
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Log a warning and save nothing: there is no token vocabulary to write.

    Both arguments are accepted for interface compatibility and ignored.

    Returns:
        `Tuple[str]`: Always the empty tuple.
    """
    message = "CharacterBERT does not have a token vocabulary. Skipping saving `vocab.txt`."
    logger.warning(message)
    return tuple()
640
+
641
def save_mlm_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the MLM vocabulary to disk, one token per line.

    CharacterBERT has no token vocabulary; this file exists so that the tokenizer configuration can be saved
    via `CharacterBertTokenizer.save_pretrained`.

    Args:
        save_directory (`str`):
            An existing directory (the file is named `mlm_vocab.txt` inside it), or otherwise the path of the
            file to write.
        filename_prefix (`str`, *optional*):
            Prefix prepended, followed by `-`, to the file name (or to `save_directory` when it is not a
            directory).

    Returns:
        `Tuple[str]`: The path of the written file, or an empty tuple when no MLM vocabulary is loaded.
    """
    if self.ids_to_tokens is None:
        # Without this guard an empty file was created and then
        # `None.items()` raised, which made saving such a tokenizer fail.
        logger.warning(
            "CharacterBertTokenizer was initialized without a MLM vocabulary. Skipping saving `mlm_vocab.txt`."
        )
        return ()

    prefix = filename_prefix + "-" if filename_prefix else ""
    if os.path.isdir(save_directory):
        vocab_file = os.path.join(save_directory, prefix + "mlm_vocab.txt")
    else:
        vocab_file = prefix + save_directory
    with open(vocab_file, "w", encoding="utf-8") as f:
        f.writelines(token + "\n" for token in self.ids_to_tokens.values())
    return (vocab_file,)
654
+
655
def _save_pretrained(
    self,
    save_directory: Union[str, os.PathLike],
    file_names: Tuple[str],
    legacy_format: Optional[bool] = None,
    filename_prefix: Optional[str] = None,
) -> Tuple[str]:
    """
    Save the tokenizer in the slow-tokenizer/legacy format: added tokens (if any) plus the MLM vocabulary.

    Args:
        save_directory (`str` or `os.PathLike`):
            Directory to write into; converted to `str` before use.
        file_names (`Tuple[str]`):
            Already-saved file paths, returned as the head of the result.
        legacy_format (`bool`, *optional*):
            Must not be `False`; only fast tokenizers can be saved in the non-legacy format.
        filename_prefix (`str`, *optional*):
            Prefix prepended, followed by `-`, to the written file names.

    Returns:
        `Tuple[str]`: `file_names`, followed by the MLM vocabulary path(s) and the added-tokens file path. The
        added-tokens path is included even when there were no added tokens to write.

    Raises:
        ValueError: If `legacy_format` is `False`.
    """
    if legacy_format is False:
        raise ValueError(
            "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
        )

    save_directory = str(save_directory)
    prefix = filename_prefix + "-" if filename_prefix else ""
    added_tokens_file = os.path.join(save_directory, prefix + ADDED_TOKENS_FILE)

    added_vocab = self.get_added_vocab()
    if added_vocab:
        with open(added_tokens_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(added_vocab, ensure_ascii=False))
            logger.info(f"added tokens file saved in {added_tokens_file}")

    vocab_files = self.save_mlm_vocabulary(save_directory, filename_prefix=filename_prefix)
    return file_names + vocab_files + (added_tokens_file,)
688
+
689
+
690
class BasicTokenizer(object):
    """
    Basic pre-tokenizer: cleans the text, splits it on whitespace and punctuation, and optionally lowercases
    tokens and strips their accents.

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase tokens when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which are kept intact: they are neither lowercased, accent-stripped nor split on
            punctuation.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to surround CJK ideographs with spaces so that each one becomes its own token.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. When left unspecified (`None`), accents are stripped exactly when
            `do_lower_case` is truthy (as in the original BERT).
    """

    # Inclusive code point ranges treated as "Chinese characters" (CJK ideograph blocks).
    # Hangul, Hiragana and Katakana live in other blocks: those scripts write space-separated
    # words, so they are handled like every other language.
    _CJK_RANGES = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )

    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
        self.do_lower_case = do_lower_case
        # Stored as a set so membership tests in `tokenize` are cheap.
        self.never_split = set([] if never_split is None else never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents

    def tokenize(self, text, never_split=None):
        """
        Run basic tokenization on `text` and return the list of resulting tokens.

        The text is cleaned, optionally padded with spaces around CJK characters, split on whitespace, and every
        token is then optionally lowercased / accent-stripped before being split on punctuation.

        Args:
            text: The string to tokenize.
            never_split (*optional*): Extra tokens to protect for this call, in addition to `self.never_split`.
                Kept for backward compatibility purposes.

        Returns:
            `List[str]`: The tokens.
        """
        # union() builds a new set, so `self.never_split` itself is never modified.
        protected = self.never_split.union(set(never_split)) if never_split else self.never_split

        text = self._clean_text(text)
        # Applied to every language: isolating CJK ideographs is harmless for text that has none.
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)

        lowercase = self.do_lower_case
        if lowercase:
            # When lowercasing, accents are stripped unless this was explicitly disabled.
            strip = self.strip_accents is not False
        else:
            strip = self.strip_accents

        pieces = []
        for token in whitespace_tokenize(text):
            if token not in protected:
                if lowercase:
                    token = token.lower()
                if strip:
                    token = self._run_strip_accents(token)
            pieces.extend(self._run_split_on_punc(token, protected))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Return `text` NFD-normalized with all combining marks (category "Mn") removed."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Split `text` so that every punctuation character becomes a separate piece."""
        if never_split is not None and text in never_split:
            return [text]

        groups = []
        open_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                # A punctuation mark is its own group and closes the current word.
                groups.append([ch])
                open_new_group = True
                continue
            if open_new_group:
                groups.append([])
                open_new_group = False
            groups[-1].append(ch)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Return `text` with a space inserted before and after every CJK character."""
        return "".join(f" {ch} " if self._is_chinese_char(ord(ch)) else ch for ch in text)

    def _is_chinese_char(self, cp):
        """Return `True` if code point `cp` falls in one of the CJK ideograph ranges, `False` otherwise."""
        return any(low <= cp <= high for low, high in self._CJK_RANGES)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters from `text` and turn each whitespace character into a space."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xFFFD) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
836
+
837
+
838
class CharacterMapper:
    """
    Maps individual tokens to fixed-length sequences of character ids, compatible with CharacterBERT, and back.

    NOTE: Adapted from ElmoCharacterMapper:
    https://github.com/allenai/allennlp/blob/main/allennlp/data/token_indexers/elmo_indexer.py

    Every word becomes a list of `max_word_length` ids laid out as
    `<begin word> byte_1 ... byte_k <end word> <padding> ...`, where the bytes are the UTF-8 encoding of the word.
    All ids are shifted by +1 on output so that 0 is left free for padding.

    Args:
        max_word_length (`int`, *optional*, defaults to 50):
            Length of every character id sequence. At most `max_word_length - 2` bytes of a word are kept.
    """

    # char ids 0-255 come from utf-8 encoding bytes
    # assign 256-300 to special chars
    beginning_of_sentence_character = 256  # <begin sentence>
    end_of_sentence_character = 257  # <end sentence>
    beginning_of_word_character = 258  # <begin word>
    end_of_word_character = 259  # <end word>
    padding_character = 260  # <padding> | short tokens are padded using this + 1
    mask_character = 261  # <mask>

    bos_token = "[CLS]"  # previously: bos_token = "<S>"
    eos_token = "[SEP]"  # previously: eos_token = "</S>"
    pad_token = "[PAD]"
    mask_token = "[MASK]"

    def __init__(
        self,
        max_word_length: int = 50,
    ):
        self.max_word_length = max_word_length
        self.beginning_of_sentence_characters = self._make_char_id_sequence(self.beginning_of_sentence_character)
        self.end_of_sentence_characters = self._make_char_id_sequence(self.end_of_sentence_character)
        self.mask_characters = self._make_char_id_sequence(self.mask_character)
        # This is the character id sequence for the pad token (i.e. [PAD]).
        # 1 is subtracted here because 1 is added back on output, which makes every
        # emitted id equal to PAD_TOKEN_CHAR_ID.
        self.pad_characters = [PAD_TOKEN_CHAR_ID - 1] * self.max_word_length

    def _make_char_id_sequence(self, character: int):
        """Build the (unshifted) id sequence `<begin word> character <end word> <padding>...` for a special symbol."""
        char_ids = [self.padding_character] * self.max_word_length
        char_ids[0] = self.beginning_of_word_character
        char_ids[1] = character
        char_ids[2] = self.end_of_word_character
        return char_ids

    def convert_word_to_char_ids(self, word: str) -> List[int]:
        """
        Convert a word into its sequence of `max_word_length` character ids.

        The special tokens `[CLS]`, `[SEP]`, `[MASK]` and `[PAD]` map to their dedicated sequences. Any other word
        is UTF-8 encoded and cut to `max_word_length - 2` bytes; note that this cut is made on bytes, so it may
        fall in the middle of a multi-byte character.

        Returns:
            `List[int]`: A new list of ids, each already shifted by +1.
        """
        if word == self.bos_token:
            char_ids = self.beginning_of_sentence_characters
        elif word == self.eos_token:
            char_ids = self.end_of_sentence_characters
        elif word == self.mask_token:
            char_ids = self.mask_characters
        elif word == self.pad_token:
            char_ids = self.pad_characters
        else:
            # Convert characters to indices
            word_encoded = word.encode("utf-8", "ignore")[: (self.max_word_length - 2)]
            # Initialize character_ids with padding
            char_ids = [self.padding_character] * self.max_word_length
            # First character is BeginningOfWord
            char_ids[0] = self.beginning_of_word_character
            # Populate character_ids with computed indices
            for k, chr_id in enumerate(word_encoded, start=1):
                char_ids[k] = chr_id
            # Last character is EndOfWord
            char_ids[len(word_encoded) + 1] = self.end_of_word_character

        # +1 one for masking so that character padding == 0
        # char_ids domain is therefore: (1, 256) for actual characters
        # and (257-262) for special symbols (BOS/EOS/BOW/EOW/padding/MLM Mask)
        # A new list is built, so the shared special-token sequences are never exposed for mutation.
        return [c + 1 for c in char_ids]

    def convert_char_ids_to_word(self, char_ids: List[int]) -> str:
        """
        Convert a sequence of (+1 shifted) character ids back into its corresponding word.

        Sequences equal to one of the special-token sequences yield that token. Otherwise the word-boundary and
        padding markers are dropped and the remaining ids are decoded as UTF-8 bytes. Bytes that do not form valid
        UTF-8 are skipped: `convert_word_to_char_ids` truncates on bytes, so a long word may end in an incomplete
        multi-byte character, and decoding such a sequence must not fail.

        Raises:
            AssertionError: If `char_ids` is longer than `max_word_length`.
        """

        assert len(char_ids) <= self.max_word_length, (
            f"Got character sequence of length {len(char_ids)} while `max_word_length={self.max_word_length}`"
        )

        # Undo the +1 shift applied on output by `convert_word_to_char_ids`.
        char_ids_ = [(i - 1) for i in char_ids]
        if char_ids_ == self.beginning_of_sentence_characters:
            return self.bos_token
        elif char_ids_ == self.end_of_sentence_characters:
            return self.eos_token
        elif char_ids_ == self.mask_characters:
            return self.mask_token
        elif char_ids_ == self.pad_characters:  # token padding
            return self.pad_token

        markers = (
            self.padding_character,
            self.beginning_of_word_character,
            self.end_of_word_character,
        )
        utf8_codes = [code for code in char_ids_ if code not in markers]
        # "ignore" mirrors the error handler used when encoding and tolerates a trailing
        # multi-byte character that was cut in half by the byte-level truncation.
        return bytes(utf8_codes).decode("utf-8", "ignore")