Spaces:
Running
Running
"""
The simplest tokenizer.
"""
import json | |
from tokenizers import Tokenizer | |
# Build the tokenizer from the serialized definition file.
tokenizer = Tokenizer.from_file("20B_tokenizer.json")

# Report the vocabulary size, first counting added tokens and then without them.
print(
    "vocab_size with added_tokens:",
    tokenizer.get_vocab_size(with_added_tokens=True),
)
print(
    "vocab_size without added_tokens:",
    tokenizer.get_vocab_size(with_added_tokens=False),
)

# Vocabulary as returned by the tokenizer; kept as a module-level name.
vocab = tokenizer.get_vocab()
def test_single_token():
    """
    Encode characters one at a time and print every id each one produces.

    A single character may be encoded into more than one id. For each id the
    output line holds: the character, the id, the decoded string and its JSON
    form, then the vocabulary token and its JSON form.
    """
    sample_chars = "发大厦三分赛中国解决方法黑白侗鸩,。!?;"
    for char in sample_chars:
        for tid in tokenizer.encode(char).ids:
            # Special characters all decode to the same �, i.e. "\ufffd".
            piece = tokenizer.decode([tid])
            vocab_entry = tokenizer.id_to_token(tid)
            print(
                char,
                tid,
                piece,
                json.dumps(piece),
                vocab_entry,
                json.dumps(vocab_entry),
            )
def test_long_token():
    """
    Encode several long runs of dashes and print every id each one produces.

    For each id the output line holds: the input string, the id, the decoded
    string and its JSON form, then the vocabulary token and its JSON form.
    """
    dash_runs = [
        "//----------------------------------------------------------------",  # appears in source code
        "--------------------------",
        "-------------------------",
        "-----------------------",
    ]
    for run in dash_runs:
        for tid in tokenizer.encode(run).ids:
            piece = tokenizer.decode([tid])
            vocab_entry = tokenizer.id_to_token(tid)
            print(
                run,
                tid,
                piece,
                json.dumps(piece),
                vocab_entry,
                json.dumps(vocab_entry),
            )
def test_encode():
    """
    Encode one sample sentence and print a line for every resulting id.

    Each output line holds: the id, the decoded string and its JSON form,
    then the vocabulary token and its JSON form.
    """
    sentence = "中国解决方法黑白侗鸩,。!?;一个人去哪里 一 个"
    for tid in tokenizer.encode(sentence).ids:
        # Special characters all decode to the same �, i.e. "\ufffd".
        piece = tokenizer.decode([tid])
        vocab_entry = tokenizer.id_to_token(tid)
        print(tid, piece, json.dumps(piece), vocab_entry, json.dumps(vocab_entry))
# Only the single-character demo runs by default; uncomment a line below to
# run one of the other demos as well.
test_single_token()
# test_long_token()
# test_encode()