zhjohnchan committed on
Commit
31a894d
1 Parent(s): dfe4f8e

Upload tokenization_chexagent.py

Files changed (1)
  1. tokenization_chexagent.py +298 -296
tokenization_chexagent.py CHANGED
@@ -1,20 +1,8 @@
-import json
-from functools import lru_cache
-from typing import TYPE_CHECKING
-
-import regex as re
-from transformers.tokenization_utils_base import TextInput
-from transformers.utils import is_tf_available, is_torch_available, to_py_obj
-
-if TYPE_CHECKING:
-    if is_torch_available():
-        import torch
-    if is_tf_available():
-        import tensorflow as tf
-
 import os
 import random
-from typing import Dict, List, Tuple, Union, Any, Callable, Optional
+import unicodedata
+from shutil import copyfile
+from typing import TYPE_CHECKING, Dict, List, Tuple, Union, Any, Callable, Optional
 
 import matplotlib as mpl
 import matplotlib.colors as mcolors
@@ -22,77 +10,39 @@ import matplotlib.colors as mplc
 import matplotlib.figure as mplfigure
 import numpy as np
 import requests
+import sentencepiece as spm
 import torch
 from PIL import Image
 from matplotlib.backends.backend_agg import FigureCanvasAgg
 from transformers import PreTrainedTokenizer, AddedToken
+from transformers.convert_slow_tokenizer import import_protobuf
 from transformers.utils import logging
 
+if TYPE_CHECKING:
+    from transformers.tokenization_utils_base import TextInput
+
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/vocab.json",
+        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
     },
-    "merges_file": {
-        "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/merges.txt",
+    "tokenizer_file": {
+        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
     },
 }
-
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "Salesforce/codegen-350M-mono": 2048,
+    "hf-internal-testing/llama-tokenizer": 2048,
 }
+SPIECE_UNDERLINE = "▁"
 
-IMG_TOKEN_SPAN = 1024
+IMG_TOKEN_SPAN = 256
 
 DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['from'] == 'human' %}\n{{ '<|user|>\n' + message['value'] + eos_token }}\n{% elif message['from'] == 'system' %}\n{{ '<|system|>\n' + message['value'] + eos_token }}\n{% elif message['from'] == 'gpt' %}\n{{ '<|assistant|>\n' + message['value'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
 
 
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(
-            range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2 ** 8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2 ** 8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
 def _list_find(
     input_list: List[Any],
     candidates: Tuple[Any],
@@ -143,14 +93,18 @@ class CheXagentTokenizer(PreTrainedTokenizer):
     def __init__(
         self,
        vocab_file,
-        merges_file,
-        errors="replace",
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
         pad_token=None,
-        add_prefix_space=False,
-        add_bos_token=False,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        use_default_system_prompt=False,
+        spaces_between_special_tokens=False,
+        legacy=None,
+        errors="replace",
         image_start_tag='<|img|>',
        image_end_tag='<|/img|>',
        image_pad_tag='<|imgpad|>',
@@ -162,38 +116,43 @@ class CheXagentTokenizer(PreTrainedTokenizer):
        quad_end_tag='<|/quad|>',
        **kwargs,
     ):
-        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
-        self.add_bos_token = add_bos_token
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
+
+        if legacy is None:
+            logger.warning_once(
+                f"You are using the default legacy behaviour of the {self.__class__}. This is"
+                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
+                " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
+                " means, and thoroughly read the reason why this was added as explained in"
+                " https://github.com/huggingface/transformers/pull/24565"
+            )
+            legacy = True
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        self.add_prefix_space = add_prefix_space
-
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        self.legacy = legacy
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
         super().__init__(
-            errors=errors,
-            unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
+            unk_token=unk_token,
             pad_token=pad_token,
-            add_prefix_space=add_prefix_space,
             add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            use_default_system_prompt=use_default_system_prompt,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            legacy=legacy,
            **kwargs,
        )
-
+        self.errors = errors  # how to handle errors in decoding
        self.image_start_tag = image_start_tag
        self.image_end_tag = image_end_tag
        self.image_pad_tag = image_pad_tag
@@ -229,69 +188,55 @@ class CheXagentTokenizer(PreTrainedTokenizer):
        self.quad_end_id = self.convert_tokens_to_ids(self.quad_end_tag)
        self.chat_template = DEFAULT_CHAT_TEMPLATE
 
+    @property
+    def unk_token_length(self):
+        return len(self.sp_model.encode(str(self.unk_token)))
+
+    def get_spm_processor(self, from_slow=False):
+        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        if self.legacy or from_slow:  # no dependency on protobuf
+            tokenizer.Load(self.vocab_file)
+            return tokenizer
+
+        with open(self.vocab_file, "rb") as f:
+            sp_model = f.read()
+            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
+            model = model_pb2.ModelProto.FromString(sp_model)
+            normalizer_spec = model_pb2.NormalizerSpec()
+            normalizer_spec.add_dummy_prefix = False
+            model.normalizer_spec.MergeFrom(normalizer_spec)
+            sp_model = model.SerializeToString()
+            tokenizer.LoadFromSerializedProto(sp_model)
+        return tokenizer
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
     @property
     def vocab_size(self):
-        return len(self.encoder)
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
 
     def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        if self.add_bos_token:
-            bos_token_ids = [self.bos_token_id]
-        else:
-            bos_token_ids = []
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
 
-        output = bos_token_ids + token_ids_0
-
-        if token_ids_1 is None:
-            return output
-
-        return output + bos_token_ids + token_ids_1
+    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
+        """
+        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
+        first token is special.
+        """
 
-    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        def _encode_imgurl(img_tokens):
            assert img_tokens[0] == self.image_start_tag and img_tokens[-1] == self.image_end_tag
            img_tokens = img_tokens[1:-1]
@@ -303,126 +248,24 @@ class CheXagentTokenizer(PreTrainedTokenizer):
            out_img_tokens = [self.image_start_tag] + out_img_tokens + [self.image_end_tag]
            return out_img_tokens
 
-        tokens = super().tokenize(text, **kwargs)
-        tokens = _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
-        return tokens
+        if self.legacy or len(text) == 0:
+            tokens = super().tokenize(text, **kwargs)
+            tokens = _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
+            return tokens
 
-    def _tokenize(self, text):
-        """Tokenize a string."""
+        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
 
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if is_split_into_words or add_prefix_space:
-            text = " " + text
-        return (text, kwargs)
-
-    def decode(
-        self,
-        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = None,
-        truncate_before_pattern: Optional[List[str]] = None,
-        **kwargs,
-    ) -> str:
-        """
-        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
-        tokens and clean up tokenization spaces.
-
-        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
-
-        Args:
-            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
-                List of tokenized input ids. Can be obtained using the `__call__` method.
-            skip_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*):
-                Whether or not to clean up the tokenization spaces. If `None`, will default to
-                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
-            truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
-                A list of regular expression strings that will be used to truncate the returned string. This can be
-                used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
-                of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
-            kwargs (additional keyword arguments, *optional*):
-                Will be passed to the underlying model specific decode method.
-
-        Returns:
-            `str`: The decoded sentence.
-        """
-
-        token_ids = to_py_obj(token_ids)
-
-        decoded_text = self._decode(
-            token_ids=token_ids,
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-        if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
-            decoded_text = self.truncate(decoded_text, truncate_before_pattern)
-
-        return decoded_text
+        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
+            tokens = tokens[1:]
+        return _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
 
     def _decode(
        self,
-        token_ids: List[int],
+        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = None,
-        spaces_between_special_tokens: bool = True,
+        errors: str = None,
        **kwargs,
     ) -> str:
-
        def _decode_imgurl(img_token_ids):
            assert img_token_ids[0] == self.img_start_id and img_token_ids[-1] == self.img_end_id
            img_token_ids = img_token_ids[1:-1]
@@ -430,39 +273,37 @@ class CheXagentTokenizer(PreTrainedTokenizer):
            return [self.img_start_id] + img_token_ids + [self.img_end_id]
 
        token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
-
-        return super()._decode(
-            token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs
+        return super()._decode(token_ids, errors=errors or self.errors)
+
+    def to_list_format(self, text: str):
+        text = unicodedata.normalize("NFC", text)
+        token_ids = self.encode(text)[1:]
+
+        def _encode_vl_info(tokens):
+            if len(tokens) == 0:
+                return []
+            if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
+                key = 'image'
+                tokens = tokens[: tokens.index(self.img_pad_id)]
+            elif tokens[0] == self.ref_start_id and tokens[-1] == self.ref_end_id:
+                key = 'ref'
+            elif tokens[0] == self.box_start_id and tokens[-1] == self.box_end_id:
+                key = 'box'
+            elif tokens[0] == self.quad_start_id and tokens[-1] == self.quad_end_id:
+                key = 'quad'
+            else:
+                key = 'text'
+                return [{key: self.decode(tokens)}]
+            return [{key: self.decode(tokens[1:-1])}]
+
+        return _replace_closed_tag(
+            token_ids,
+            (self.img_start_id, self.ref_start_id, self.box_start_id, self.quad_start_id),
+            (self.img_end_id, self.ref_end_id, self.box_end_id, self.quad_end_id),
+            _encode_vl_info,
+            _encode_vl_info,
        )
 
-    def truncate(self, completion, truncate_before_pattern):
-        def find_re(string, pattern, start_pos):
-            m = pattern.search(string, start_pos)
-            return m.start() if m else -1
-
-        terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
-
-        prints = list(re.finditer("^print", completion, re.MULTILINE))
-
-        if len(prints) > 1:
-            completion = completion[: prints[1].start()]
-
-        defs = list(re.finditer("^def", completion, re.MULTILINE))
-
-        if len(defs) > 1:
-            completion = completion[: defs[1].start()]
-
-        start_pos = 0
-
-        terminals_pos = [
-            pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
-        ]
-
-        if len(terminals_pos) > 0:
-            return completion[: min(terminals_pos)]
-        else:
-            return completion
-
     def from_list_format(self, list_format: List[Dict]):
        text = ''
        num_images = 0
@@ -535,6 +376,167 @@ class CheXagentTokenizer(PreTrainedTokenizer):
            visualizer.draw_text(box['ref'], (x1, y1), color=color, horizontal_alignment="left")
        return visualizer.output
 
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+    def _tokenize(self, text, **kwargs):
+        """
+        Returns a tokenized string.
+
+        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
+        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
+        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
+        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
+        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
+        """
+        tokens = self.sp_model.encode(text, out_type=str)
+        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
+            return tokens
+
+        # 1. Encode string + prefix ex: "<unk> Hey"
+        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
+        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
+        return tokens[self.unk_token_length:] if len(tokens) >= self.unk_token_length else tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        # since we manually add the prefix space, we have to remove it when decoding
+        if tokens[0].startswith(SPIECE_UNDERLINE):
+            tokens[0] = tokens[0][1:]
+
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for i, token in enumerate(tokens):
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special and i != 0 and self.legacy:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = bos_token_id + token_ids_0 + eos_token_id
+
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+
+        return output
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+        return output
+
 
 class VisImage:
     def __init__(self, img, scale=1.0):
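
A minimal usage sketch (not part of the commit), assuming the repository that ships this file also contains the SentencePiece model referenced by VOCAB_FILES_NAMES ("tokenizer.model") and a tokenizer_config.json mapping to the custom class; the checkpoint path below is a placeholder, not the actual repo id.

# Hypothetical usage of the SentencePiece-based CheXagentTokenizer defined above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "path/to/chexagent-checkpoint",  # placeholder; must ship tokenization_chexagent.py + tokenizer.model
    trust_remote_code=True,          # needed so the custom CheXagentTokenizer class is loaded
)

# Text wrapped in <|img|> ... <|/img|> is padded with <|imgpad|> so that each
# image span occupies IMG_TOKEN_SPAN (256 in this revision) token positions.
tokens = tokenizer.tokenize("<|img|>chest_xray.png<|/img|>Describe the findings.")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(len(tokens), tokenizer.decode(ids))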