Spaces:

xu-song
/

tokenizer-arena

Running

File size: 1,187 Bytes

"""
https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

https://github.com/openai/tiktoken

Vocabulary path: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

"""

import json
import tiktoken
# from tokenizer import tiktoken_patch


# Load the encoding that tiktoken associates with the gpt-3.5-turbo model.
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Sample inputs. Only the uncommented assignment is encoded; the others are
# kept as ready-made alternatives (previously the first one was assigned and
# then immediately overwritten, which made it a dead store).
# text = "你好，请告诉我聚乙烯是什么"
# text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
text = "'<|endoftext|>"

# allowed_special="all" makes special-token text in the input (here
# "<|endoftext|>") encode as a special token instead of raising an error.
encoding = tokenizer.encode(text, allowed_special="all")
# One bytes object per token id, in the same order as `encoding`.
decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
print(encoding)
print(decoding_bytes)
# 100256
# NOTE(review): the number above is the original author's note; presumably a
# vocabulary size or token id observed in the output -- confirm.

# for token in tokens:
#     token_str = encoding.decode([token])
#     print(token, token_str, json.dumps(token_str))


# Scratch probes of three byte-level decode APIs, all applied to token id 10.
# The return values are discarded, so these lines produce no output when the
# file is run as a script; they are only informative in a REPL or debugger.
tokenizer.decode_tokens_bytes([10])  # takes a list of token ids
tokenizer.decode_single_token_bytes(10)  # takes a single token id
tokenizer.decode_bytes([10])  # takes a list of token ids


# Dump every token id in range(tokenizer.n_vocab) to vocab.jsonl, one JSON
# object per line: {"id": <int>, "token": <JSON-encoded str or "null">}.
# The context manager guarantees the file is flushed and closed, even if the
# loop stops early. The written text is pure ASCII (json.dumps escapes
# non-ASCII by default), so the explicit encoding only removes the dependency
# on the platform default.
# 100255
# NOTE(review): the number above is the original author's note; presumably the
# highest regular token id -- confirm.
with open("vocab.jsonl", "w", encoding="utf-8") as f_out:
    for i in range(tokenizer.n_vocab):
        # Candidate alternatives noted by the original author:
        # decode_bytes
        # decode_single_token_bytes
        try:
            token_str = tokenizer.decode([i])
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or interpreter shutdown.
            raise
        except BaseException:
            # Best effort: ids that cannot be decoded are recorded as null.
            # BaseException (not Exception) is kept deliberately --
            # NOTE(review): some tiktoken versions may report an unknown id
            # via a pyo3 PanicException, which derives from BaseException;
            # confirm and narrow once the supported versions are known.
            token_str = None
        # NOTE: the token is JSON-encoded twice (the inner json.dumps yields
        # a string that the outer json.dumps quotes again). Kept as-is so the
        # output format does not change for existing consumers.
        f_out.write(json.dumps({"id": i, "token": json.dumps(token_str)}) + "\n")