Text Generation
Transformers
Safetensors
llama
text-generation-inference
Inference Endpoints
mfromm committed on
Commit
da9d2c0
1 Parent(s): 9b69801

Update gptx_tokenizer.py

Files changed (1)
  1. gptx_tokenizer.py +4 -917
gptx_tokenizer.py CHANGED
@@ -9,7 +9,6 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
 import sentencepiece as spm
 import numpy as np
 import torch
-
 from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_cache
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
@@ -246,923 +245,11 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         Returns:
             str: Decoded string.
         """
- from __future__ import annotations
250
-
251
- import json
252
- import os
253
- import warnings
254
- from pathlib import Path
255
- from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
256
-
257
- import sentencepiece as spm
258
- from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_cache
259
- from transformers.tokenization_utils import PreTrainedTokenizer
260
- from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
261
-
262
-
263
- REPO_ID = "openGPT-X/Teuken-7B-instruct-commercial-v0.4"
264
-
265
- class HFGPTXTokenizer(PreTrainedTokenizer):
266
- """
267
- A custom tokenizer class that extends Hugging Face's PreTrainedTokenizer.
268
- It is specifically designed to work with SentencePiece models and integrates
269
- with Hugging Face's tokenizer utilities.
270
- """
271
-
272
- model_file_glob = "*tokenizer.json"
273
- vocab_files_names = {"tokenizer_file": "tokenizer.json"}
274
- decode_kwargs: List[str] = []
275
-
276
- def _encode(self, text: str, return_tokens: bool = False, is_continuation: bool = False):
277
- """
278
- Encode a given text using the tokenizer.
279
-
280
- Args:
281
- text (str): The text to encode.
282
- return_tokens (bool): If True, returns token strings instead of token IDs.
283
- is_continuation (bool): If True, uses a continuation tokenizer (if available).
284
- Returns:
285
- List[int] or List[str]: Encoded text as a list of token IDs or token strings.
286
- """
287
- assert self.tok is not None, "No tokenizer is currently loaded"
288
-
289
- # Variant with additional sp processor:
290
- tokenizer = self.continuation_tokenizer if is_continuation else self.tok
291
-
292
- if return_tokens:
293
- return tokenizer.encode_as_pieces(text)
294
- else:
295
- return tokenizer.encode(text)
296
-
297
- def create_list_of_special_tokens(self) -> List[str]:
298
- """
299
- Create a list of special tokens, including the BOS, EOS, PAD, EOD tokens,
300
- and 256 additional placeholder tokens.
301
- Returns:
302
- List[str]: List of special tokens.
303
- """
304
- return [self.bos_token, self.eos_token, self.pad_token, self.eod_token] + [
305
- f"<placeholder_tok_{i}>" for i in range(256)
306
- ]
307
-
308
- def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Optional[Path]:
309
- if not os.path.isfile(config_path):
310
- config_path = try_to_load_from_cache(repo_id=repo_id, filename=Path(config_path).name)
311
- if not config_path:
312
- config_path = self._download_config_from_hub(repo_id=repo_id)
313
-
314
- return config_path
315
-
316
-
317
- def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
318
- """
319
- Load the tokenizer model from a file or download it from a repository.
320
-
321
- Args:
322
- model_file_or_name (str): Path to the model file or the model name.
323
- repo_id (str, optional): Repository ID from which to download the model file.
324
-
325
- Returns:
326
- spm.SentencePieceProcessor: Loaded SentencePieceProcessor instance.
327
-
328
- Raises:
329
- ValueError: If repo_id is not provided when model_file_or_name is not a file.
330
- OSError: If the model file cannot be loaded or downloaded.
331
- """
332
- if not os.path.isfile(model_file_or_name):
333
- model_file_or_name = try_to_load_from_cache(repo_id=repo_id, filename=Path(model_file_or_name).name)
334
- if not model_file_or_name:
335
- model_file_or_name = self._download_model_from_hub(repo_id=repo_id)
336
-
337
- try:
338
- return spm.SentencePieceProcessor(model_file=model_file_or_name)
339
- except Exception as e:
340
- raise OSError(f"Failed to load tokenizer model: {str(e)}")
341
-
342
- def _download_model_from_hub(self, repo_id: str) -> Optional[str]:
343
- try:
344
- # List all files in the repo
345
- repo_files = list_repo_files(repo_id)
346
-
347
- # Find the tokenizer model file
348
- tokenizer_files = [f for f in repo_files if f.endswith('.model')]
349
- if not tokenizer_files:
350
- raise FileNotFoundError(f"No .model file found in repository {repo_id}")
351
-
352
- # Use the first .model file found
353
- model_file = tokenizer_files[0]
354
- print(f"Found tokenizer model file: {model_file}")
355
-
356
- # Download the file
357
- model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
358
- print(f"Downloaded tokenizer model to: {model_file_or_name}")
359
- except Exception as e:
360
- raise OSError(f"Failed to download tokenizer model: {str(e)}")
361
-
362
- return model_file_or_name
363
-
364
- def _download_config_from_hub(self, repo_id: str):
365
- if repo_id is None:
366
- raise ValueError("repo_id must be provided if config_path is not a local file")
367
-
368
- try:
369
- # List all files in the repo
370
- repo_files = list_repo_files(repo_id)
371
-
372
- # Find the tokenizer config file
373
- tokenizer_files = [f for f in repo_files if f.endswith('tokenizer_config.json')]
374
- if not tokenizer_files:
375
- raise FileNotFoundError(f"No tokenizer_config.json file found in repository {repo_id}")
376
-
377
- # Use the first tokenizer_config.json file found
378
- tokenizer_config_file = tokenizer_files[0]
379
- print(f"Found tokenizer config file: {tokenizer_config_file}")
380
-
381
- # Download the file
382
- tokenizer_config_file_or_name = hf_hub_download(repo_id=repo_id, filename=tokenizer_config_file)
383
- print(f"Downloaded tokenizer config file to: {tokenizer_config_file_or_name}")
384
- return tokenizer_config_file_or_name
385
- except Exception as e:
386
- raise OSError(f"Failed to download tokenizer model: {str(e)}")
387
- def __init__(
388
- self,
389
- model_path: Optional[str] = None,
390
- config_path: Optional[str] = None,
391
- **kwargs: Any,
392
- ) -> None:
393
- """
394
- Initialize the tokenizer.
395
- Args:
396
- model_path (Optional[str]): Path to the tokenizer model file.
397
- config_path (Optional[str]): Path to the tokenizer configuration file.
398
- **kwargs: Additional keyword arguments passed to the superclass.
399
- This method also ensures backward compatibility by setting
400
- `clean_up_tokenization_spaces` to False by default.
401
- """
402
- # Prevent cleanup of tokenization spaces to maintain backward compatibility
403
- self.clean_up_tokenization_spaces = kwargs.setdefault("clean_up_tokenization_spaces", False)
404
- self.vocab = None
405
- cp_path = kwargs.get("name_or_path", ".")
406
- if model_path is None:
407
- model_path = str(Path(cp_path) / self.vocab_files_names["tokenizer_file"])
408
- self.tok = self.instantiate_from_file_or_name(model_path, repo_id=REPO_ID)
409
-
410
- super().__init__(**kwargs)
411
-
412
- # Specify special tokens which we know the value of.
413
- # EOD from `tok` is used as what is called EOS in HuggingFace.
414
- # Since there is no corresponding mapping for EOS from `tok` in
415
- # HuggingFace, it is treated as an additional special token.
416
- # Same for all other special tokens.
417
-
418
-
419
- self.unk_token = "<unk>"
420
- self.eos_token = "</s>"
421
- self.bos_token = "<s>"
422
- self.pad_token = "<pad>"
423
- self.eod_token = "<eod>"
424
 
425
- self.additional_special_tokens = self.create_list_of_special_tokens()
426
-
427
- if config_path is None:
428
- config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
429
-
430
- if os.path.isfile(config_path):
431
- self.tokenizer_config = self.load_json(Path(config_path))
432
- else: # Load from repo
433
- self.tokenizer_config = self.load_json(Path(self.find_tokenizer_config(Path(config_path), repo_id=REPO_ID)))
434
-
435
- @property
436
- def vocab_size(self) -> int:
437
- """
438
- Get the size of the tokenizer vocabulary.
439
- Returns:
440
- int: The size of the vocabulary.
441
- """
442
- return self.tok.GetPieceSize()
443
-
444
- def get_vocab(self) -> Dict[str, int]:
445
- """
446
- Get the vocabulary as a dictionary mapping token strings to their IDs.
447
- Returns:
448
- Dict[str, int]: Vocabulary mapping.
449
- """
450
- if self.vocab is None:
451
- self.vocab = {self.tok.IdToPiece(i): i for i in range(self.vocab_size)}
452
- return self.vocab
453
-
454
- def _tokenize(self, text: str, **kwargs) -> List[int]:
455
- """
456
- Tokenize the input text.
457
- Args:
458
- text (str): Text to tokenize.
459
- **kwargs: Additional keyword arguments.
460
- Returns:
461
- List[int]: List of token IDs.
462
- """
463
- return_tokens = kwargs.pop("return_tokens", True)
464
- return self._encode(text, return_tokens=return_tokens, **kwargs)
465
-
466
- def _convert_token_to_id(self, token: str) -> int:
467
- """
468
- Convert a token string to its corresponding ID.
469
- Args:
470
- token (str): The token to convert.
471
- Returns:
472
- int: The token's ID.
473
- Raises:
474
- ValueError: If the token is unknown and cannot be encoded to a single ID.
475
- """
476
- return self.tok.PieceToId(token)
477
-
478
-
479
- def decode(
480
- self,
481
- token_ids: Union[List[int], List[List[int]]],
482
- num_threads: Optional[int] = None,
483
- skip_special_tokens: bool = False,
484
- clean_up_tokenization_spaces: bool = False,
485
- ) -> str:
486
- """
487
- Decode a list of token IDs into a string.
488
- Args:
489
- token_ids (Union[List[int], List[List[int]]]): List of token IDs or lists of token IDs.
490
- num_threads (Optional[int]): Number of threads to use for decoding.
491
- Returns:
492
- str: Decoded string.
493
- """
494
- from __future__ import annotations
495
-
496
- import json
497
- import os
498
- import warnings
499
- from pathlib import Path
500
- from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
501
-
502
- import sentencepiece as spm
503
- from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_cache
504
- from transformers.tokenization_utils import PreTrainedTokenizer
505
- from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
506
-
507
-
508
- REPO_ID = "openGPT-X/Teuken-7B-instruct-commercial-v0.4"
509
-
510
- class HFGPTXTokenizer(PreTrainedTokenizer):
511
- """
512
- A custom tokenizer class that extends Hugging Face's PreTrainedTokenizer.
513
- It is specifically designed to work with SentencePiece models and integrates
514
- with Hugging Face's tokenizer utilities.
515
- """
516
-
517
- model_file_glob = "*tokenizer.json"
518
- vocab_files_names = {"tokenizer_file": "tokenizer.json"}
519
- decode_kwargs: List[str] = []
520
-
521
- def _encode(self, text: str, return_tokens: bool = False, is_continuation: bool = False):
522
- """
523
- Encode a given text using the tokenizer.
524
-
525
- Args:
526
- text (str): The text to encode.
527
- return_tokens (bool): If True, returns token strings instead of token IDs.
528
- is_continuation (bool): If True, uses a continuation tokenizer (if available).
529
- Returns:
530
- List[int] or List[str]: Encoded text as a list of token IDs or token strings.
531
- """
532
- assert self.tok is not None, "No tokenizer is currently loaded"
533
-
534
- # Variant with additional sp processor:
535
- tokenizer = self.continuation_tokenizer if is_continuation else self.tok
536
-
537
- if return_tokens:
538
- return tokenizer.encode_as_pieces(text)
539
- else:
540
- return tokenizer.encode(text)
541
-
542
- def create_list_of_special_tokens(self) -> List[str]:
543
- """
544
- Create a list of special tokens, including the BOS, EOS, PAD, EOD tokens,
545
- and 256 additional placeholder tokens.
546
- Returns:
547
- List[str]: List of special tokens.
548
- """
549
- return [self.bos_token, self.eos_token, self.pad_token, self.eod_token] + [
550
- f"<placeholder_tok_{i}>" for i in range(256)
551
- ]
552
-
553
- def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Optional[Path]:
554
- if not os.path.isfile(config_path):
555
- config_path = try_to_load_from_cache(repo_id=repo_id, filename=Path(config_path).name)
556
- if not config_path:
557
- config_path = self._download_config_from_hub(repo_id=repo_id)
558
-
559
- return config_path
560
-
561
-
562
- def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
563
- """
564
- Load the tokenizer model from a file or download it from a repository.
565
-
566
- Args:
567
- model_file_or_name (str): Path to the model file or the model name.
568
- repo_id (str, optional): Repository ID from which to download the model file.
569
-
570
- Returns:
571
- spm.SentencePieceProcessor: Loaded SentencePieceProcessor instance.
572
-
573
- Raises:
574
- ValueError: If repo_id is not provided when model_file_or_name is not a file.
575
- OSError: If the model file cannot be loaded or downloaded.
576
- """
577
- if not os.path.isfile(model_file_or_name):
578
- model_file_or_name = try_to_load_from_cache(repo_id=repo_id, filename=Path(model_file_or_name).name)
579
- if not model_file_or_name:
580
- model_file_or_name = self._download_model_from_hub(repo_id=repo_id)
581
-
582
- try:
583
- return spm.SentencePieceProcessor(model_file=model_file_or_name)
584
- except Exception as e:
585
- raise OSError(f"Failed to load tokenizer model: {str(e)}")
586
-
587
- def _download_model_from_hub(self, repo_id: str) -> Optional[str]:
588
- try:
589
- # List all files in the repo
590
- repo_files = list_repo_files(repo_id)
591
-
592
- # Find the tokenizer model file
593
- tokenizer_files = [f for f in repo_files if f.endswith('.model')]
594
- if not tokenizer_files:
595
- raise FileNotFoundError(f"No .model file found in repository {repo_id}")
596
-
597
- # Use the first .model file found
598
- model_file = tokenizer_files[0]
599
- print(f"Found tokenizer model file: {model_file}")
600
-
601
- # Download the file
602
- model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
603
- print(f"Downloaded tokenizer model to: {model_file_or_name}")
604
- except Exception as e:
605
- raise OSError(f"Failed to download tokenizer model: {str(e)}")
606
-
607
- return model_file_or_name
608
-
609
- def _download_config_from_hub(self, repo_id: str):
610
- if repo_id is None:
611
- raise ValueError("repo_id must be provided if config_path is not a local file")
612
-
613
- try:
614
- # List all files in the repo
615
- repo_files = list_repo_files(repo_id)
616
-
617
- # Find the tokenizer config file
618
- tokenizer_files = [f for f in repo_files if f.endswith('tokenizer_config.json')]
619
- if not tokenizer_files:
620
- raise FileNotFoundError(f"No tokenizer_config.json file found in repository {repo_id}")
621
-
622
- # Use the first tokenizer_config.json file found
623
- tokenizer_config_file = tokenizer_files[0]
624
- print(f"Found tokenizer config file: {tokenizer_config_file}")
625
-
626
- # Download the file
627
- tokenizer_config_file_or_name = hf_hub_download(repo_id=repo_id, filename=tokenizer_config_file)
628
- print(f"Downloaded tokenizer config file to: {tokenizer_config_file_or_name}")
629
- return tokenizer_config_file_or_name
630
- except Exception as e:
631
- raise OSError(f"Failed to download tokenizer model: {str(e)}")
632
- def __init__(
633
- self,
634
- model_path: Optional[str] = None,
635
- config_path: Optional[str] = None,
636
- **kwargs: Any,
637
- ) -> None:
638
- """
639
- Initialize the tokenizer.
640
- Args:
641
- model_path (Optional[str]): Path to the tokenizer model file.
642
- config_path (Optional[str]): Path to the tokenizer configuration file.
643
- **kwargs: Additional keyword arguments passed to the superclass.
644
- This method also ensures backward compatibility by setting
645
- `clean_up_tokenization_spaces` to False by default.
646
- """
647
- # Prevent cleanup of tokenization spaces to maintain backward compatibility
648
- self.clean_up_tokenization_spaces = kwargs.setdefault("clean_up_tokenization_spaces", False)
649
- self.vocab = None
650
- cp_path = kwargs.get("name_or_path", ".")
651
- if model_path is None:
652
- model_path = str(Path(cp_path) / self.vocab_files_names["tokenizer_file"])
653
- self.tok = self.instantiate_from_file_or_name(model_path, repo_id=REPO_ID)
654
-
655
- super().__init__(**kwargs)
656
-
657
- # Specify special tokens which we know the value of.
658
- # EOD from `tok` is used as what is called EOS in HuggingFace.
659
- # Since there is no corresponding mapping for EOS from `tok` in
660
- # HuggingFace, it is treated as an additional special token.
661
- # Same for all other special tokens.
662
-
663
-
664
- self.unk_token = "<unk>"
665
- self.eos_token = "</s>"
666
- self.bos_token = "<s>"
667
- self.pad_token = "<pad>"
668
- self.eod_token = "<eod>"
669
-
670
- self.additional_special_tokens = self.create_list_of_special_tokens()
671
-
672
- if config_path is None:
673
- config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
674
-
675
- if os.path.isfile(config_path):
676
- self.tokenizer_config = self.load_json(Path(config_path))
677
- else: # Load from repo
678
- self.tokenizer_config = self.load_json(Path(self.find_tokenizer_config(Path(config_path), repo_id=REPO_ID)))
679
-
680
- @property
681
- def vocab_size(self) -> int:
682
- """
683
- Get the size of the tokenizer vocabulary.
684
- Returns:
685
- int: The size of the vocabulary.
686
- """
687
- return self.tok.GetPieceSize()
688
-
689
- def get_vocab(self) -> Dict[str, int]:
690
- """
691
- Get the vocabulary as a dictionary mapping token strings to their IDs.
692
- Returns:
693
- Dict[str, int]: Vocabulary mapping.
694
- """
695
- if self.vocab is None:
696
- self.vocab = {self.tok.IdToPiece(i): i for i in range(self.vocab_size)}
697
- return self.vocab
698
-
699
- def _tokenize(self, text: str, **kwargs) -> List[int]:
700
- """
701
- Tokenize the input text.
702
- Args:
703
- text (str): Text to tokenize.
704
- **kwargs: Additional keyword arguments.
705
- Returns:
706
- List[int]: List of token IDs.
707
- """
708
- return_tokens = kwargs.pop("return_tokens", True)
709
- return self._encode(text, return_tokens=return_tokens, **kwargs)
710
-
711
- def _convert_token_to_id(self, token: str) -> int:
712
- """
713
- Convert a token string to its corresponding ID.
714
- Args:
715
- token (str): The token to convert.
716
- Returns:
717
- int: The token's ID.
718
- Raises:
719
- ValueError: If the token is unknown and cannot be encoded to a single ID.
720
- """
721
- return self.tok.PieceToId(token)
722
-
723
-
724
- def decode(
725
- self,
726
- token_ids: Union[List[int], List[List[int]]],
727
- num_threads: Optional[int] = None,
728
- skip_special_tokens: bool = False,
729
- clean_up_tokenization_spaces: bool = False,
730
- ) -> str:
731
- """
732
- Decode a list of token IDs into a string.
733
- Args:
734
- token_ids (Union[List[int], List[List[int]]]): List of token IDs or lists of token IDs.
735
- num_threads (Optional[int]): Number of threads to use for decoding.
736
- Returns:
737
- str: Decoded string.
738
- """
739
- if isinstance(token_ids, torch.Tensor): # For PyTorch tensors
740
- token_ids = token_ids.tolist()
741
- elif isinstance(token_ids, np.ndarray): # For NumPy arrays
742
- token_ids = token_ids.tolist()
743
-
744
-
745
- output = self.tok.decode(input=token_ids, num_threads=num_threads)
746
- if skip_special_tokens:
747
- for substring in self.additional_special_tokens:
748
- output = output.replace(substring, "")
749
-
750
- if clean_up_tokenization_spaces:
751
- warnings.warn(
752
- "when cleaning up tokenization spaces, this will not behave "
753
- "like the original `GPTXTokenizer`., Please supply "
754
- "`clean_up_tokenization_spaces=False` for decoding."
755
- )
756
- output = self.clean_up_tokenization(output)
757
-
758
- return output
759
-
760
-
761
- def _convert_id_to_token(self, index: int) -> str:
762
- """
763
- Convert a token ID to its corresponding token string.
764
- Args:
765
- index (int): Token ID.
766
- Returns:
767
- str: Corresponding token string.
768
- """
769
- return self.tok.IdToPiece(index)
770
-
771
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
772
- """
773
- Convert a list of tokens into a single string.
774
- Args:
775
- tokens (List[str]): List of token strings.
776
- Returns:
777
- str: Concatenated string of tokens.
778
- """
779
- return self.tok.DecodePieces(tokens)
780
-
781
- def _tok_decode(self, token_ids: List[int], **kwargs: Any) -> str:
782
- """
783
- Internal method to decode token IDs with additional arguments.
784
- Args:
785
- token_ids (List[int]): List of token IDs.
786
- **kwargs: Additional arguments to pass to the decode method.
787
- Returns:
788
- str: Decoded string.
789
- This method also issues a warning if unsupported arguments are provided.
790
- """
791
- passed_kwargs = {key: value for (key, value) in kwargs.items() if key in self.decode_kwargs}
792
- if len(passed_kwargs) != len(kwargs):
793
- warnings.warn("silently ignoring some arguments to `decode` due to missing " "support from the tokenizer.")
794
- text = self.decode(token_ids, **passed_kwargs)
795
- return text
796
-
797
- def save_tokenizer(self, save_dir: str) -> None:
798
- if not os.path.isdir(save_dir):
799
- print(f"Vocabulary path ({save_dir}) should be a directory")
800
- return
801
- out_vocab_file = os.path.join(save_dir, "tokenizer.model")
802
-
803
- # if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
804
- # copyfile(self.vocab_file, out_vocab_file)
805
- # elif not os.path.isfile(self.vocab_file):
806
- with open(out_vocab_file, "wb") as f:
807
- content_spiece_model = self.tok.serialized_model_proto()
808
- f.write(content_spiece_model)
809
-
810
- return (out_vocab_file,)
811
-
812
- def _decode(
813
- self,
814
- token_ids: List[int],
815
- skip_special_tokens: bool = False,
816
- clean_up_tokenization_spaces: bool = None,
817
- spaces_between_special_tokens: bool = True,
818
- **kwargs: Any,
819
- ) -> str:
820
- text = self._tok_decode(
821
- token_ids,
822
- skip_special_tokens=skip_special_tokens,
823
- spaces_between_special_tokens=spaces_between_special_tokens,
824
- **kwargs,
825
- )
826
-
827
- clean_up_tokenization_spaces = (
828
- clean_up_tokenization_spaces
829
- if clean_up_tokenization_spaces is not None
830
- else self.clean_up_tokenization_spaces
831
- )
832
- if clean_up_tokenization_spaces:
833
- warnings.warn(
834
- "when cleaning up tokenization spaces, this will not behave "
835
- "like the original `GPTXTokenizer`., Please supply "
836
- "`clean_up_tokenization_spaces=False` for decoding."
837
- )
838
- clean_text = self.clean_up_tokenization(text)
839
- return clean_text
840
- else:
841
- return text
842
-
843
- def save_vocabulary(
844
- self,
845
- save_directory: str,
846
- filename_prefix: Optional[str] = None,
847
- ) -> Tuple[str]:
848
- filename_prefix = filename_prefix + "-" if filename_prefix else ""
849
- save_directory = Path(save_directory)
850
-
851
- self._save_tokenizer_config(save_directory, filename_prefix)
852
- tokenizer_file_path = self._save_tokenizer(save_directory, filename_prefix)
853
-
854
- return (tokenizer_file_path,)
855
-
856
- def _save_tokenizer_config(
857
- self,
858
- save_directory: Path,
859
- filename_prefix: str,
860
- ) -> str:
861
- self.save_tokenizer_config(save_directory)
862
- old_tokenizer_config_path = save_directory / TOKENIZER_CONFIG_FILE
863
- assert old_tokenizer_config_path.is_file(), "tokenizer config path changed"
864
- new_tokenizer_config_path = save_directory / (filename_prefix + old_tokenizer_config_path.name)
865
- old_tokenizer_config_path.replace(new_tokenizer_config_path)
866
- return str(new_tokenizer_config_path)
867
-
868
- def _find_tokenizer_files(self, save_directory: Path) -> List[Path]:
869
- files = list(Path(save_directory).glob(self.model_file_glob))
870
- return files
871
-
872
- def _get_tokenizer_file(self, files: List[Path]):
873
- assert files, "no saved tokenizer file found"
874
- assert len(files) <= 1, "cannot handle multiple saved tokenizer files"
875
- return files[0]
876
-
877
- def _save_tokenizer(
878
- self,
879
- save_directory: Path,
880
- filename_prefix: str,
881
- ) -> str:
882
- self.save_tokenizer(str(save_directory))
883
- tokenizer_files = self._find_tokenizer_files(save_directory)
884
- old_tokenizer_file_path = self._get_tokenizer_file(tokenizer_files)
885
- assert old_tokenizer_file_path.is_file(), "could not access saved tokenizer file"
886
- new_tokenizer_file_path = save_directory / (filename_prefix + self.vocab_files_names["tokenizer_file"])
887
- old_tokenizer_file_path.replace(new_tokenizer_file_path)
888
- return str(new_tokenizer_file_path)
889
-
890
- def save_tokenizer_config(self, save_dir: Path) -> None:
891
- # convert Path to str
892
- for k in self.tokenizer_config:
893
- if isinstance(self.tokenizer_config[k], Path):
894
- self.tokenizer_config[k] = str(self.tokenizer_config[k])
895
-
896
- info_file = save_dir / "tokenizer_config.json"
897
- with info_file.open("w") as f:
898
- json.dump(self.tokenizer_config, f, indent=4)
899
-
900
- def load_json(self, path: Path) -> dict:
901
- with path.open("r") as f:
902
- return json.load(f)
903
-
904
- class SPTokenizer(HFGPTXTokenizer):
905
- model_file_glob = "*tokenizer.model"
906
- vocab_files_names = {"tokenizer_file": "tokenizer.model"}
907
- decode_kwargs = ["num_threads"]
908
- # `is_continuation` does not work without this, but it doesn't
909
- # implement all APIs of `PreTrainedTokenizer`.
910
- def encode(self, text: str, **kwargs) -> List[int]:
911
- return_tokens = kwargs.pop('return_tokens', False)
912
- is_continuation = kwargs.pop('is_continuation', False)
913
- return self._encode(
914
- text,
915
- return_tokens=return_tokens,
916
- is_continuation=is_continuation,
917
- )
918
-
919
- def __init__(self, *args, **kwargs):
920
- super().__init__(*args, **kwargs)
921
-
922
- self.eos_token = "</s>"
923
- self.eos_token_id = 2
924
- self.system_messages_by_lang = { # translations by deepl / google translate
925
- "BG": "Чат между човек и асистент с изкуствен интелект. Асистентът дава полезни и учтиви отговори на въпросите на човека.", # noqa
926
- "CS": "Chat mezi člověkem a asistentem s umělou inteligencí. Asistent poskytuje vstřícné a zdvořilé odpovědi na otázky člověka.", # noqa
927
- "DA": "En chat mellem et menneske og en assistent med kunstig intelligens, som giver hjælpsomme og høflige svar på menneskets spørgsmål.", # noqa
928
- "DE": "Ein Gespräch zwischen einem Menschen und einem Assistenten mit künstlicher Intelligenz. Der Assistent gibt hilfreiche und höfliche Antworten auf die Fragen des Menschen.", # noqa
929
- "EL": "Μια συνομιλία μεταξύ ενός ανθρώπου και ενός βοηθού τεχνητής νοημοσύνης. Ο βοηθός δίνει χρήσιμες και ευγενικές απαντήσεις στις ερωτήσεις του ανθρώπου.", # noqa
930
- "EN": "A chat between a human and an artificial intelligence assistant.The assistant gives helpful and polite answers to the human's questions.", # noqa
931
- "ES": "Una conversación entre un humano y un asistente de inteligencia artificial. El asistente da respuestas útiles y amables a las preguntas del humano.", # noqa
932
- "ET": "Inimese ja tehisintellekti assistendi vaheline vestlus. Assistent annab inimese küsimustele abivalmis ja viisakaid vastuseid.", # noqa
933
- "FI": "Ihmisen ja tekoälyavustajan välinen keskustelu. Avustaja antaa avuliaita ja kohteliaita vastauksia ihmisen kysymyksiin.", # noqa
934
- "FR": "Conversation entre un humain et un assistant doté d'une intelligence artificielle. L'assistant donne des réponses utiles et polies aux questions de l'homme.", # noqa
935
- "GA": "Comhrá idir duine agus cúntóir hintleachta saorga. Tugann an cúntóir freagraí cabhracha dea-bhéasacha ar cheisteanna an duine.", # noqa
936
- "HR": "Razgovor između čovjeka i pomoćnika umjetne inteligencije. Pomoćnik daje korisne i ljubazne odgovore na ljudska pitanja.", # noqa
937
- "HU": "Egy ember és egy mesterséges intelligencia asszisztens közötti beszélgetés. Az asszisztens segítőkész és udvarias válaszokat ad az ember kérdéseire.", # noqa
938
- "IT": "Una chat tra un umano e un assistente di intelligenza artificiale. L'assistente fornisce risposte utili ed educate alle domande dell'uomo.", # noqa
939
- "LT": "Žmogaus ir dirbtinio intelekto asistento pokalbis. Asistentas naudingai ir mandagiai atsako į žmogaus klausimus.", # noqa
940
- "LV": "Cilvēka un mākslīgā intelekta asistenta tērzēšana. Asistents sniedz noderīgas un pieklājīgas atbildes uz cilvēka jautājumiem.", # noqa
941
- "MT": "Chat bejn bniedem u assistent ta' intelliġenza artifiċjali. L-assistent jagħti tweġibiet ta' għajnuna u edukat għall-mistoqsijiet tal-bniedem.", # noqa
942
- "NL": "Een chat tussen een mens en een assistent met kunstmatige intelligentie. De assistent geeft behulpzame en beleefde antwoorden op de vragen van de mens.", # noqa
943
- "PL": "Czat między człowiekiem a asystentem sztucznej inteligencji. Asystent udziela pomocnych i uprzejmych odpowiedzi na pytania człowieka.", # noqa
944
- "PT": "Uma conversa entre um ser humano e um assistente de inteligência artificial. O assistente dá respostas úteis e educadas às perguntas do utilizador.", # noqa
945
- "RO": "O conversație între un om și un asistent cu inteligență artificială. Asistentul oferă răspunsuri utile și politicoase la întrebările omului.", # noqa
946
- "SK": "Rozhovor medzi človekom a asistentom s umelou inteligenciou. Asistent poskytuje užitočné a zdvorilé odpovede na otázky človeka.", # noqa
947
- "SL": "Pogovor med človekom in pomočnikom z umetno inteligenco. Pomočnik človeku prijazno in vljudno odgovarja na njegova vprašanja.", # noqa
948
- "SV": "En chatt mellan en människa och en assistent med artificiell intelligens. Assistenten ger hjälpsamma och artiga svar på människans frågor.", # noqa
949
- }
950
- chat_template = "{%- for message in messages %}\n{%- if (message['role']|lower == 'user') != (loop.index0 % 2 == 0) %}\n{{- raise_exception('Roles must alternate User/Assistant/User/Assistant/...') }}\n{%- endif %}\n{%-if message['role']|lower == 'user' %}\n{{- message['role']|capitalize + ': ' + message['content'] + '\\n' }}\n{%- elif message['role']|lower == 'assistant' %}\n{{- message['role']|capitalize + ': ' + message['content'] + eos_token + '\\n' }}\n{%- else %}\n{{- raise_exception('Only user and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}{%-if add_generation_prompt %}\n{{- 'Assistant: '}}\n{%- endif %}\n"
951
- self.chat_template = {
952
- lang: f"System: {sys_msg}" + "{{- '\\n'}}\n" + chat_template
953
- for lang, sys_msg in self.system_messages_by_lang.items()
954
- }
955
-
956
- output = self.tok.decode(input=token_ids, num_threads=num_threads)
957
- if skip_special_tokens:
958
- for substring in self.additional_special_tokens:
959
- output = output.replace(substring, "")
960
-
961
- if clean_up_tokenization_spaces:
962
- warnings.warn(
963
- "when cleaning up tokenization spaces, this will not behave "
964
- "like the original `GPTXTokenizer`., Please supply "
965
- "`clean_up_tokenization_spaces=False` for decoding."
966
- )
967
- output = self.clean_up_tokenization(output)
968
-
969
- return output
970
-
971
-
972
- def _convert_id_to_token(self, index: int) -> str:
973
- """
974
- Convert a token ID to its corresponding token string.
975
- Args:
976
- index (int): Token ID.
977
- Returns:
978
- str: Corresponding token string.
979
- """
980
- return self.tok.IdToPiece(index)
981
-
982
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
983
- """
984
- Convert a list of tokens into a single string.
985
- Args:
986
- tokens (List[str]): List of token strings.
987
- Returns:
988
- str: Concatenated string of tokens.
989
- """
990
- return self.tok.DecodePieces(tokens)
991
-
992
- def _tok_decode(self, token_ids: List[int], **kwargs: Any) -> str:
993
- """
994
- Internal method to decode token IDs with additional arguments.
995
- Args:
996
- token_ids (List[int]): List of token IDs.
997
- **kwargs: Additional arguments to pass to the decode method.
998
- Returns:
999
- str: Decoded string.
1000
- This method also issues a warning if unsupported arguments are provided.
1001
- """
1002
- passed_kwargs = {key: value for (key, value) in kwargs.items() if key in self.decode_kwargs}
1003
- if len(passed_kwargs) != len(kwargs):
1004
- warnings.warn("silently ignoring some arguments to `decode` due to missing " "support from the tokenizer.")
1005
- text = self.decode(token_ids, **passed_kwargs)
1006
- return text
1007
-
1008
- def save_tokenizer(self, save_dir: str) -> None:
1009
- if not os.path.isdir(save_dir):
1010
- print(f"Vocabulary path ({save_dir}) should be a directory")
1011
- return
1012
- out_vocab_file = os.path.join(save_dir, "tokenizer.model")
1013
-
1014
- # if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
1015
- # copyfile(self.vocab_file, out_vocab_file)
1016
- # elif not os.path.isfile(self.vocab_file):
1017
- with open(out_vocab_file, "wb") as f:
1018
- content_spiece_model = self.tok.serialized_model_proto()
1019
- f.write(content_spiece_model)
1020
-
1021
- return (out_vocab_file,)
1022
-
1023
- def _decode(
1024
- self,
1025
- token_ids: List[int],
1026
- skip_special_tokens: bool = False,
1027
- clean_up_tokenization_spaces: bool = None,
1028
- spaces_between_special_tokens: bool = True,
1029
- **kwargs: Any,
1030
- ) -> str:
1031
- text = self._tok_decode(
1032
- token_ids,
1033
- skip_special_tokens=skip_special_tokens,
1034
- spaces_between_special_tokens=spaces_between_special_tokens,
1035
- **kwargs,
1036
- )
1037
-
1038
- clean_up_tokenization_spaces = (
1039
- clean_up_tokenization_spaces
1040
- if clean_up_tokenization_spaces is not None
1041
- else self.clean_up_tokenization_spaces
1042
- )
1043
- if clean_up_tokenization_spaces:
1044
- warnings.warn(
1045
- "when cleaning up tokenization spaces, this will not behave "
1046
- "like the original `GPTXTokenizer`., Please supply "
1047
- "`clean_up_tokenization_spaces=False` for decoding."
1048
- )
1049
- clean_text = self.clean_up_tokenization(text)
1050
- return clean_text
1051
- else:
1052
- return text
1053
-
1054
- def save_vocabulary(
1055
- self,
1056
- save_directory: str,
1057
- filename_prefix: Optional[str] = None,
1058
- ) -> Tuple[str]:
1059
- filename_prefix = filename_prefix + "-" if filename_prefix else ""
1060
- save_directory = Path(save_directory)
1061
-
1062
- self._save_tokenizer_config(save_directory, filename_prefix)
1063
- tokenizer_file_path = self._save_tokenizer(save_directory, filename_prefix)
1064
-
1065
- return (tokenizer_file_path,)
1066
-
1067
- def _save_tokenizer_config(
1068
- self,
1069
- save_directory: Path,
1070
- filename_prefix: str,
1071
- ) -> str:
1072
- self.save_tokenizer_config(save_directory)
1073
- old_tokenizer_config_path = save_directory / TOKENIZER_CONFIG_FILE
1074
- assert old_tokenizer_config_path.is_file(), "tokenizer config path changed"
1075
- new_tokenizer_config_path = save_directory / (filename_prefix + old_tokenizer_config_path.name)
1076
- old_tokenizer_config_path.replace(new_tokenizer_config_path)
1077
- return str(new_tokenizer_config_path)
1078
-
1079
- def _find_tokenizer_files(self, save_directory: Path) -> List[Path]:
1080
- files = list(Path(save_directory).glob(self.model_file_glob))
1081
- return files
1082
-
1083
- def _get_tokenizer_file(self, files: List[Path]):
1084
- assert files, "no saved tokenizer file found"
1085
- assert len(files) <= 1, "cannot handle multiple saved tokenizer files"
1086
- return files[0]
1087
-
1088
- def _save_tokenizer(
1089
- self,
1090
- save_directory: Path,
1091
- filename_prefix: str,
1092
- ) -> str:
1093
- self.save_tokenizer(str(save_directory))
1094
- tokenizer_files = self._find_tokenizer_files(save_directory)
1095
- old_tokenizer_file_path = self._get_tokenizer_file(tokenizer_files)
1096
- assert old_tokenizer_file_path.is_file(), "could not access saved tokenizer file"
1097
- new_tokenizer_file_path = save_directory / (filename_prefix + self.vocab_files_names["tokenizer_file"])
1098
- old_tokenizer_file_path.replace(new_tokenizer_file_path)
1099
- return str(new_tokenizer_file_path)
1100
-
1101
- def save_tokenizer_config(self, save_dir: Path) -> None:
1102
- # convert Path to str
1103
- for k in self.tokenizer_config:
1104
- if isinstance(self.tokenizer_config[k], Path):
1105
- self.tokenizer_config[k] = str(self.tokenizer_config[k])
1106
-
1107
- info_file = save_dir / "tokenizer_config.json"
1108
- with info_file.open("w") as f:
1109
- json.dump(self.tokenizer_config, f, indent=4)
1110
-
1111
- def load_json(self, path: Path) -> dict:
1112
- with path.open("r") as f:
1113
- return json.load(f)
1114
-
1115
- class SPTokenizer(HFGPTXTokenizer):
1116
- model_file_glob = "*tokenizer.model"
1117
- vocab_files_names = {"tokenizer_file": "tokenizer.model"}
1118
- decode_kwargs = ["num_threads"]
1119
- # `is_continuation` does not work without this, but it doesn't
1120
- # implement all APIs of `PreTrainedTokenizer`.
1121
- def encode(self, text: str, **kwargs) -> List[int]:
1122
- return_tokens = kwargs.pop('return_tokens', False)
1123
- is_continuation = kwargs.pop('is_continuation', False)
1124
- return self._encode(
1125
- text,
1126
- return_tokens=return_tokens,
1127
- is_continuation=is_continuation,
1128
- )
1129
-
1130
- def __init__(self, *args, **kwargs):
1131
- super().__init__(*args, **kwargs)
1132
-
1133
- self.eos_token = "</s>"
1134
- self.eos_token_id = 2
1135
- self.system_messages_by_lang = { # translations by deepl / google translate
1136
- "BG": "Чат между човек и асистент с изкуствен интелект. Асистентът дава полезни и учтиви отговори на въпросите на човека.", # noqa
1137
- "CS": "Chat mezi člověkem a asistentem s umělou inteligencí. Asistent poskytuje vstřícné a zdvořilé odpovědi na otázky člověka.", # noqa
1138
- "DA": "En chat mellem et menneske og en assistent med kunstig intelligens, som giver hjælpsomme og høflige svar på menneskets spørgsmål.", # noqa
1139
- "DE": "Ein Gespräch zwischen einem Menschen und einem Assistenten mit künstlicher Intelligenz. Der Assistent gibt hilfreiche und höfliche Antworten auf die Fragen des Menschen.", # noqa
1140
- "EL": "Μια συνομιλία μεταξύ ενός ανθρώπου και ενός βοηθού τεχνητής νοημοσύνης. Ο βοηθός δίνει χρήσιμες και ευγενικές απαντήσεις στις ερωτήσεις του ανθρώπου.", # noqa
1141
- "EN": "A chat between a human and an artificial intelligence assistant.The assistant gives helpful and polite answers to the human's questions.", # noqa
1142
- "ES": "Una conversación entre un humano y un asistente de inteligencia artificial. El asistente da respuestas útiles y amables a las preguntas del humano.", # noqa
1143
- "ET": "Inimese ja tehisintellekti assistendi vaheline vestlus. Assistent annab inimese küsimustele abivalmis ja viisakaid vastuseid.", # noqa
1144
- "FI": "Ihmisen ja tekoälyavustajan välinen keskustelu. Avustaja antaa avuliaita ja kohteliaita vastauksia ihmisen kysymyksiin.", # noqa
1145
- "FR": "Conversation entre un humain et un assistant doté d'une intelligence artificielle. L'assistant donne des réponses utiles et polies aux questions de l'homme.", # noqa
1146
- "GA": "Comhrá idir duine agus cúntóir hintleachta saorga. Tugann an cúntóir freagraí cabhracha dea-bhéasacha ar cheisteanna an duine.", # noqa
1147
- "HR": "Razgovor između čovjeka i pomoćnika umjetne inteligencije. Pomoćnik daje korisne i ljubazne odgovore na ljudska pitanja.", # noqa
1148
- "HU": "Egy ember és egy mesterséges intelligencia asszisztens közötti beszélgetés. Az asszisztens segítőkész és udvarias válaszokat ad az ember kérdéseire.", # noqa
1149
- "IT": "Una chat tra un umano e un assistente di intelligenza artificiale. L'assistente fornisce risposte utili ed educate alle domande dell'uomo.", # noqa
1150
- "LT": "Žmogaus ir dirbtinio intelekto asistento pokalbis. Asistentas naudingai ir mandagiai atsako į žmogaus klausimus.", # noqa
1151
- "LV": "Cilvēka un mākslīgā intelekta asistenta tērzēšana. Asistents sniedz noderīgas un pieklājīgas atbildes uz cilvēka jautājumiem.", # noqa
1152
- "MT": "Chat bejn bniedem u assistent ta' intelliġenza artifiċjali. L-assistent jagħti tweġibiet ta' għajnuna u edukat għall-mistoqsijiet tal-bniedem.", # noqa
1153
- "NL": "Een chat tussen een mens en een assistent met kunstmatige intelligentie. De assistent geeft behulpzame en beleefde antwoorden op de vragen van de mens.", # noqa
1154
- "PL": "Czat między człowiekiem a asystentem sztucznej inteligencji. Asystent udziela pomocnych i uprzejmych odpowiedzi na pytania człowieka.", # noqa
1155
- "PT": "Uma conversa entre um ser humano e um assistente de inteligência artificial. O assistente dá respostas úteis e educadas às perguntas do utilizador.", # noqa
1156
- "RO": "O conversație între un om și un asistent cu inteligență artificială. Asistentul oferă răspunsuri utile și politicoase la întrebările omului.", # noqa
1157
- "SK": "Rozhovor medzi človekom a asistentom s umelou inteligenciou. Asistent poskytuje užitočné a zdvorilé odpovede na otázky človeka.", # noqa
1158
- "SL": "Pogovor med človekom in pomočnikom z umetno inteligenco. Pomočnik človeku prijazno in vljudno odgovarja na njegova vprašanja.", # noqa
1159
- "SV": "En chatt mellan en människa och en assistent med artificiell intelligens. Assistenten ger hjälpsamma och artiga svar på människans frågor.", # noqa
1160
- }
1161
- chat_template = "{%- for message in messages %}\n{%- if (message['role']|lower == 'user') != (loop.index0 % 2 == 0) %}\n{{- raise_exception('Roles must alternate User/Assistant/User/Assistant/...') }}\n{%- endif %}\n{%-if message['role']|lower == 'user' %}\n{{- message['role']|capitalize + ': ' + message['content'] + '\\n' }}\n{%- elif message['role']|lower == 'assistant' %}\n{{- message['role']|capitalize + ': ' + message['content'] + eos_token + '\\n' }}\n{%- else %}\n{{- raise_exception('Only user and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}{%-if add_generation_prompt %}\n{{- 'Assistant: '}}\n{%- endif %}\n"
1162
- self.chat_template = {
1163
- lang: f"System: {sys_msg}" + "{{- '\\n'}}\n" + chat_template
1164
- for lang, sys_msg in self.system_messages_by_lang.items()
1165
- }
1166
+        if isinstance(token_ids, torch.Tensor):  # For PyTorch tensors
+            token_ids = token_ids.tolist()
+        elif isinstance(token_ids, np.ndarray):  # For NumPy arrays
+            token_ids = token_ids.tolist()
 
         output = self.tok.decode(input=token_ids, num_threads=num_threads)
         if skip_special_tokens:
             for substring in self.additional_special_tokens:
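
The four added lines let decode() accept framework outputs directly instead of only plain Python lists. Below is a minimal usage sketch, not part of the commit: it assumes the repository's custom tokenizer class in gptx_tokenizer.py is picked up by AutoTokenizer with trust_remote_code=True and use_fast=False (neither setting appears in this diff).

import torch
from transformers import AutoTokenizer

# Assumption: the Hub repo exposes gptx_tokenizer.py as a custom tokenizer that
# AutoTokenizer can load with trust_remote_code=True; use_fast=False keeps the
# SentencePiece-backed implementation shown in this file.
tokenizer = AutoTokenizer.from_pretrained(
    "openGPT-X/Teuken-7B-instruct-commercial-v0.4",
    use_fast=False,
    trust_remote_code=True,
)

# encode() returns a plain list of token IDs; model.generate() would return a tensor.
ids = tokenizer.encode("A chat between a human and an artificial intelligence assistant.")
tensor_ids = torch.tensor(ids)

# With this commit, decode() converts torch.Tensor and numpy.ndarray inputs to
# lists before handing them to SentencePiece, so both calls yield the same text.
print(tokenizer.decode(ids, skip_special_tokens=True))
print(tokenizer.decode(tensor_ids, skip_special_tokens=True))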