""" https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb https://github.com/openai/tiktoken 词典路径: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py """ import json import tiktoken # from tokenizer import tiktoken_patch tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo') text = "你好,请告诉我聚乙烯是什么" # text = "a bcjik今天天气颗粒剂范大将军发卡卡萨" text = "'<|endoftext|>" encoding = tokenizer.encode(text, allowed_special="all") decoding_bytes = tokenizer.decode_tokens_bytes(encoding) print(encoding) print(decoding_bytes) # 100256 # for token in tokens: # token_str = encoding.decode([token]) # print(token, token_str, json.dumps(token_str)) tokenizer.decode_tokens_bytes([10]) tokenizer.decode_single_token_bytes(10) tokenizer.decode_bytes([10]) f_out = open("vocab.jsonl", "w") # 100255 for i in range(tokenizer.n_vocab): # decode_bytes # decode_single_token_bytes try: token_str = tokenizer.decode([i]) except: token_str = None f_out.write(json.dumps({"id": i, "token": json.dumps(token_str)}) + "\n")