# Source: Hugging Face Space "eson/tokenizer-arena" (status: Running).
# File size: 5,000 bytes.


"""

special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
                              what?      what?                  bos      eos


[MASK] for short blank filling - 150000
[sMASK] for sentence filling -
[gMASK] for left-to-right generation. - 150001


text.replace("\t", f"<|tab|>")
text.replace(" " * i, f"<|blank_{i}|>")
text.replace("\n", "<n>")


"bos_token": "<sop>",   startofpiece
"eop_token": "<eop>",
"eos_token": "</s>",

## Confirmed

130005 = <eop>

## Source code:

- https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L32

"""
import os
from transformers import AutoTokenizer

# Select the pure-Python protobuf backend before the tokenizer is loaded.
# NOTE(review): presumably needed because the tokenizer's remote code parses
# the SentencePiece model through protobuf — confirm it is still required.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
# Alternative: load by hub id instead of from a local directory.
# tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# Module-level tokenizer shared by every helper below, loaded from the local
# "chatglm_6b/" directory with trust_remote_code enabled.
tokenizer = AutoTokenizer.from_pretrained("chatglm_6b/", trust_remote_code=True)


def encode_text(text):
    """
    Tokenize ``text`` with the module-level tokenizer and print the result.

    Three views of the same input are printed on one line: the token pieces,
    the token ids (encoded without special tokens), and the text obtained by
    decoding those ids again. Nothing is returned.
    """
    pieces = tokenizer.tokenize(text)
    ids = tokenizer.encode(text=text, add_special_tokens=False)
    round_trip = tokenizer.decode(ids)
    print(
        "tokens: ", pieces,
        ";\tid: ", ids,
        ";\ttext: ", round_trip,
    )


def test_space():
    """Print the encoding of samples that contain spaces, tabs and newlines."""
    # Author's observation: a string made only of spaces encodes to nothing.
    samples = (
        "  ",
        "\t",
        "你是谁",
        "你是\n谁",
        "你是 谁",
        "你是  谁",
        "'[Round 0]\n问：你是谁\n答：我是一个名为 ChatGLM-6B 的人工智能助手,是基于清华大学 KEG 实验室和智谱 AI 公司于 2023 年共同训练的语言模型开发的。我的任务是针对用户的问题和要求提供适当的答复和支持。\n[Round 1]\n问：你会干什么\n答：",
    )
    for sample in samples:
        encode_text(sample)


def test_case():
    """Print the encoding of phrases that differ in letter case and spacing."""
    samples = (
        "Good morning",
        "good morning",
        "good  morning",
        "goog morningabc",
    )
    for sample in samples:
        encode_text(sample)

def export():
    """
    Write every piece of the tokenizer's SentencePiece model to ``chatglm.vocab``.

    The output file is UTF-8 text with one piece per line, in the order the
    pieces appear in the model proto (so the zero-based line number is the
    index into ``proto.pieces``). An existing file is overwritten.
    """
    # Resolve the piece list first: if the attribute chain is missing, this
    # fails before the output file is opened (and truncated).
    pieces = tokenizer.sp_tokenizer.text_tokenizer.proto.pieces
    with open("chatglm.vocab", "w", encoding="utf-8") as f_out:
        f_out.writelines(entry.piece + "\n" for entry in pieces)


# export()


def test_tokens():
    """
    Print the SentencePiece piece behind each id of a recorded token sequence.

    For every position, one line is printed with the position, the token id,
    and the piece found at ``token - start_idx`` in the tokenizer's model
    proto. Nothing is returned.
    """
    # Swap in a short list such as [43435] to inspect a single id.
    tokens = [
        53, 6945, 5, 8, 42, 4, 64286, 12, 74874,
        4, 67342, 12, 74874, 130328, 130247, 130233, 130227, 35,
        65806, 68241, 75890, 14132, 5388, 340, 11, 21, 222,
        6, 76693, 66877, 63852, 6, 66430, 68747, 102501, 63823,
        4, 52, 6945, 5, 9, 42, 4, 64286, 12,
        65450, 83400, 64213, 66846, 4, 67342, 12, 130001, 130004,
        74747, 83400, 66115, 90478, 70597, 63826, 68076, 6, 63873,
        68684, 64113, 120922, 73129, 63823, 65056, 63829, 63948, 64124,
        79727, 64447, 12, 4, 4, 9, 7, 5, 64716,
        93067, 95119, 64560, 12, 66524, 63827, 70682, 63944, 89160,
        63826, 71304, 6, 79553, 67155, 63826, 68668, 63843, 91351,
        96846, 63823, 4, 4, 10, 7, 5, 95472, 74107,
        66625, 64285, 12, 64442, 67201, 69609, 63824, 81548, 63824,
        70870, 63826, 66800, 6, 94824, 63959, 65195, 65515, 63824,
        64392, 69584, 63824, 81198, 63914, 63835, 63823, 4, 4,
        13, 7, 5, 66544, 69656, 12, 66533, 63891, 63948,
        66544, 69726, 6, 63906, 86089, 63824, 88419, 63824, 69765,
        63853, 64369, 102753, 64736, 63823, 4, 4, 16, 7,
        5, 65073, 63827, 72151, 64020, 67491, 66469, 63853, 68168,
        12, 65289, 95128, 63826, 68819, 6, 118679, 66115, 64174,
        66625, 63823, 4, 4, 15, 7, 5, 86790, 12,
        70666, 89266, 63878, 66544, 69656, 6, 67623, 73129, 63823,
        4, 4, 21, 7, 71210, 79856, 63912, 63831, 66625,
        69204, 64659, 12, 66312, 63922, 64984, 67427, 63824, 63959,
        65419, 63853, 64384, 63835, 63823, 4, 4, 63976, 106490,
        65921, 64542, 73129, 6, 63852, 80917, 65207, 64678, 63853,
        66625, 64427, 6, 89385, 64124, 79727, 64447, 63823, 130005,
    ]
    # print(tokenizer.decode(tokens))

    # Token ids in chatglm start from 0, so no offset is subtracted.
    start_idx = 0
    # start_idx = 20000  # default vocabulary: the first 20000 ids are image tokens

    # The piece table does not change inside the loop; look it up once.
    pieces = tokenizer.sp_tokenizer.text_tokenizer.proto.pieces
    for i, token in enumerate(tokens):
        # Alternative views kept for debugging:
        # print(i, token, tokenizer.decode([token - start_idx]))
        # print(pieces[token - start_idx].piece, end=" ")
        print(i, token, pieces[token - start_idx].piece)


# Executed at import/run time: dump the recorded token sequence piece by piece,
# then print the encoding of a short mixed English/Chinese sample.
test_tokens()
encode_text("good job d的 算法")

# tokenizer.sp_tokenizer.text_tokenizer.convert_token_to_id(x) + tokenizer.sp_tokenizer.num_image_tokens

# test_case()
# test_space()




# s