""" 封装 sentencepiece.SentencePieceProcessor,以便符合transformers中的tokenizer标准

## reference


## usage

- grok

"""

import sentencepiece as spm
from transformers import PreTrainedTokenizer


class SPTokenizerWrapper(PreTrainedTokenizer):
    """SentencePiece-backed tokenizer exposing the PreTrainedTokenizer API.

    ## impl in PreTrainedTokenizer
    - convert_ids_to_tokens (dispatches to ``_convert_id_to_token`` below)
    """

    def __init__(self, vocab_file):
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor(model_file=self.vocab_file)
        # The parent constructor may query the vocab, so sp_model must be
        # loaded before delegating to it.
        super().__init__()

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        return vocab

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.id_to_piece(index)

    # convert_ids_to_tokens(ids, skip_special_tokens=False) is implemented
    # in PreTrainedTokenizer and inherited here; no override is needed.

    def encode(self, *args, **kwargs):
        # Drop transformers-style kwargs that SentencePiece's Encode()
        # does not accept, then delegate directly to the processor.
        kwargs.pop("add_special_tokens", None)
        kwargs.pop("allowed_special", None)
        return self.sp_model.Encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        # SentencePiece's Decode() has no notion of special tokens to skip.
        kwargs.pop("skip_special_tokens", None)
        return self.sp_model.Decode(*args, **kwargs)
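

# Minimal usage sketch, not part of the original module: shows the wrapper's
# round trip and an inherited PreTrainedTokenizer method. The model path
# "tokenizer.model" is hypothetical; any trained SentencePiece model file
# would do.
if __name__ == "__main__":
    tokenizer = SPTokenizerWrapper("tokenizer.model")  # hypothetical path
    ids = tokenizer.encode("Hello world")  # list of ints from sp_model.Encode
    print(ids)
    print(tokenizer.decode(ids))
    # Inherited from PreTrainedTokenizer; dispatches to _convert_id_to_token.
    print(tokenizer.convert_ids_to_tokens(ids))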