NeoZ123 committed on
Commit
c30dc0f
1 Parent(s): a084b7a

Update tiktoken_tokenizer.py

Browse files
Files changed (1) hide show
  1. tiktoken_tokenizer.py +10 -13
tiktoken_tokenizer.py CHANGED
@@ -53,24 +53,21 @@ class BaseTokenizer(PreTrainedTokenizer):
53
  return NotImplemented
54
 
55
  class TikTokenizer(BaseTokenizer):
56
- @staticmethod
57
- def from_pretrained(path, *inputs, **kwargs):
58
- return TikTokenizer(vocab_file=os.path.join(path, "tokenizer.tiktoken"))
59
 
60
- def __init__(self, vocab_file=None):
61
  pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
62
  self.pat_str = re.compile(pat_str)
63
 
64
  self.b64_vocab = {}
65
- if vocab_file is not None:
66
- mergeable_ranks = {}
67
- with open(vocab_file) as f:
68
- for line in f:
69
- token, rank = line.strip().split()
70
- rank = int(rank)
71
- token = base64.b64decode(token)
72
- mergeable_ranks[token] = rank
73
- self.b64_vocab['%s' % token] = rank
74
 
75
  self.special_tokens = ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>", "<|system|>",
76
  "<|user|>", "<|assistant|>", "<|observation|>"]
 
53
  return NotImplemented
54
 
55
  class TikTokenizer(BaseTokenizer):
56
+ vocab_files_names = {"vocab_file": "tokenizer.tiktoken"}
 
 
57
 
58
+ def __init__(self, vocab_file, **kwargs):
59
  pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
60
  self.pat_str = re.compile(pat_str)
61
 
62
  self.b64_vocab = {}
63
+ mergeable_ranks = {}
64
+ with open(vocab_file) as f:
65
+ for line in f:
66
+ token, rank = line.strip().split()
67
+ rank = int(rank)
68
+ token = base64.b64decode(token)
69
+ mergeable_ranks[token] = rank
70
+ self.b64_vocab['%s' % token] = rank
 
71
 
72
  self.special_tokens = ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>", "<|system|>",
73
  "<|user|>", "<|assistant|>", "<|observation|>"]