File size: 592 Bytes
480ae5d
 
 
 
 
 
 
 
 
 
 
1b7fc74
480ae5d
 
 
 
1b7fc74
480ae5d
1b7fc74
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
"""

## reference
https://github.com/xai-org/grok-1/blob/main/run.py

        vocab_size=128 * 1024,
        pad_token=0,
        eos_token=2,
"""

import os
from patcher.sptokenizer_wrapper import SPTokenizerWrapper

# Resolve the model path relative to this file (not the CWD), so the module
# loads correctly no matter where the interpreter was started from.
CURRENT_DIR: str = os.path.dirname(os.path.abspath(__file__))
# Model file expected to sit next to this module on disk; presumably a
# SentencePiece model, matching the wrapper's name — TODO confirm.
MODEL_FILE: str = os.path.join(CURRENT_DIR, "tokenizer.model")

# Module-level singleton: the model is loaded once at import time and shared
# by every importer of this module. NOTE(review): import will raise if
# tokenizer.model is missing — verify that is the intended failure mode.
tokenizer = SPTokenizerWrapper(MODEL_FILE)

# Manual smoke checks kept for debugging; uncomment to sanity-check
# encode/decode round-trips against the loaded model.
# print(tokenizer.decode([1, 2, 3, 4], skip_special_tokens=True))
# print(tokenizer.decode([3124, 356, 1834, 402], skip_special_tokens=True))
# print(tokenizer.encode("nice to meet you", add_special_tokens=False))