Spaces:
Running
Running
File size: 1,187 Bytes
751936e 814ee6b 751936e d27a756 814ee6b 751936e 814ee6b 751936e 428b731 f4973d4 751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
"""
https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
https://github.com/openai/tiktoken
Vocabulary path: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
"""
import json
import tiktoken
# from tokenizer import tiktoken_patch
# Exploratory script: encode a sample string with the gpt-3.5-turbo tiktoken
# encoding, print the token ids and their raw bytes, then dump every token id
# in the vocabulary to vocab.jsonl (one JSON object per line).
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

text = "你好,请告诉我聚乙烯是什么"
# The sample above is immediately replaced: the run below exercises a string
# containing the special-token text instead.
text = "'<|endoftext|>"

# allowed_special="all" is passed so the special-token text in the input is
# accepted by encode() rather than rejected.
encoding = tokenizer.encode(text, allowed_special="all")
decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
print(encoding)
print(decoding_bytes)

# Calls to the three byte-level decode entry points for token id 10. The
# return values are discarded; they only matter when run interactively.
tokenizer.decode_tokens_bytes([10])
tokenizer.decode_single_token_bytes(10)
tokenizer.decode_bytes([10])

# The context manager guarantees the file is flushed and closed even if the
# loop is interrupted; the original left the handle open.
with open("vocab.jsonl", "w", encoding="utf-8") as f_out:
    for token_id in range(tokenizer.n_vocab):
        try:
            token_str = tokenizer.decode([token_id])
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or interpreter shutdown.
            raise
        except BaseException:
            # Best-effort: ids that cannot be decoded are recorded as null.
            # NOTE(review): the catch is kept this broad on purpose — older
            # tiktoken builds presumably report unknown ids via a Rust panic,
            # which is not an Exception subclass. Confirm before narrowing.
            token_str = None
        # "token" is JSON-encoded twice (an escaped JSON string inside the
        # JSON record); kept as-is so the output format does not change.
        f_out.write(json.dumps({"id": token_id, "token": json.dumps(token_str)}) + "\n")
|