File size: 1,187 Bytes
751936e
 
 
 
 
 
 
 
 
 
 
814ee6b
751936e
 
 
d27a756
 
814ee6b
 
751936e
 
 
814ee6b
751936e
 
 
 
 
 
428b731
 
 
 
f4973d4
751936e
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

https://github.com/openai/tiktoken

Vocabulary file locations: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

"""

import json
import tiktoken
# from tokenizer import tiktoken_patch


# Look up the encoding that tiktoken maps to the gpt-3.5-turbo model name.
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Sample inputs. Only the last assignment is live; the earlier ones are kept
# so a different sample can be selected by reordering/commenting.
text = "你好,请告诉我聚乙烯是什么"
# text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
text = "'<|endoftext|>"

# allowed_special="all" is passed so the special-token text in the sample is
# accepted by encode() (NOTE(review): see the tiktoken docs for exact semantics).
encoding = tokenizer.encode(text, allowed_special="all")
decoding_bytes = tokenizer.decode_tokens_bytes(encoding)

# Show the token ids first, then the per-token byte strings, one per line.
print(encoding, decoding_bytes, sep="\n")
# 100256

# Stale scratch code kept for reference; it refers to names (`tokens`,
# `encoding.decode`) that do not match the variables defined above.
# for token in tokens:
#     token_str = encoding.decode([token])
#     print(token, token_str, json.dumps(token_str))


# Scratch calls that exercise three byte-level decode entry points of the
# tokenizer on token id 10. The return values are neither stored nor printed,
# so these lines are only observable if one of the calls raises.
tokenizer.decode_tokens_bytes([10])
tokenizer.decode_single_token_bytes(10)
tokenizer.decode_bytes([10])


# Dump the vocabulary to vocab.jsonl, one JSON object per line:
#   {"id": <token id>, "token": <JSON-encoded decoded text, or "null">}
# The context manager guarantees the file is flushed and closed, even if the
# loop is interrupted (previously the handle was opened and never closed).
# The output is ASCII-only because json.dumps escapes non-ASCII by default;
# the explicit encoding just makes the result independent of the locale.
with open("vocab.jsonl", "w", encoding="utf-8") as f_out:
    # 100255
    for i in range(tokenizer.n_vocab):
        # Alternative decoders considered here:
        #   decode_bytes
        #   decode_single_token_bytes
        try:
            token_str = tokenizer.decode([i])
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or interpreter shutdown.
            raise
        except BaseException:
            # Best-effort: ids that cannot be decoded are recorded as null.
            # NOTE(review): BaseException (not Exception) is deliberate - some
            # tiktoken releases appear to report an unknown id through a Rust
            # panic, which is not an Exception subclass. Narrow this to
            # KeyError once the installed version is confirmed to raise it.
            token_str = None
        # The token text is intentionally JSON-encoded twice (inner dumps, then
        # the outer dumps of the record) to keep the existing file format.
        f_out.write(json.dumps({"id": i, "token": json.dumps(token_str)}) + "\n")