import importlib
from enum import Enum, auto

"""Interface:
tokenizer.encode
tokenizer.decode
tokenizer.convert_ids_to_tokens

tokenizer.parent = ""
tokenizer.vocab_size   
tokenizer.get_vocab()   # gpt-neox-20b, llama
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name   # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
  "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py

  - bert
    - Features:
    - Examples:
  - gpt2
    - Features:
  - sentencepiece:
    - Features: .sp_model is a SentencePieceProcessor; sp_model.id_to_piece; ships tokenizer.json and tokenizer.model; vocab pieces use the ▁ prefix
    - Examples: llama, baichuan
  - icetk
  - hf_tokenizer
    - Features:
      - .model is a tokenizers.models.BPE instance
      - vocab entries start with Ġ ("\u0120")
      - a single tokenizer.json (containing merges and vocab), or separate files
      - .model.from_file  .model.save   .model.token_to_id  .model.tokenize
    - Examples: gpt_neox_20b, moss, bloom
  - tiktoken
    - Features: spaces are kept as literal spaces
    - Examples: gpt3.5, gpt4
tokenizer.comments = "split all numbers into individual digits, " \
                     "and fallback to bytes to decompose unknown UTF-8 characters"

tokenizer.all_special_tokens  # baichuan
tokenizer.special_tokens_set   # gpt3.5_turbo
tokenizer.special_tokens_map   

tokenizer.dependency [sentencepiece, tiktoken, icetk] 
"""

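
# A minimal sketch of the interface documented in the module docstring above.
# It assumes a HuggingFace-style tokenizer object; the `getattr`/`hasattr`
# fallbacks are defensive guesses, since not every tokenizer exposes every attribute.
def describe_tokenizer(tokenizer, sample_text="hello world"):
    """Print the common attributes a tokenizer in this project is expected to expose."""
    ids = tokenizer.encode(sample_text)
    print("encode:", ids)
    print("decode:", tokenizer.decode(ids))
    print("tokens:", tokenizer.convert_ids_to_tokens(ids))
    print("vocab_size:", getattr(tokenizer, "vocab_size", None))
    if hasattr(tokenizer, "get_vocab"):
        print("get_vocab entries:", len(tokenizer.get_vocab()))
    print("special tokens:", getattr(tokenizer, "all_special_tokens", None))
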

uniq_tokenizers = [
    ""
]

all_tokenizers = [
    "gpt_35_turbo",
    "gpt_4",
    "gpt2",
    "gpt2_chinese",
    "bert_base_cased",
    "bert_base_uncased",
    "bert_base_chinese",
    "kplug",
    "moss",
    #
    # ######
    "chatyuan_large_v2",
    "prompt_clue",
    #
    # #### bloom family
    "bloom",
    # "bloomz_6b4_zh",
    # "belle_7b_2m",   # model and vocab are both based on bloom
    #
    "gpt_nexo_20b",
    # "gpt_neox_chinese_v1",
    #
    # ##### glm family
    "glm_chinese",
    "chatglm_6b",
    "chatglm2_6b",
    #
    # #### llama / alpaca family
    "llama",  # single-character Chinese tokens: 700, multi-character Chinese tokens: 0
    "chinese_llama",  #
    "chinese_llama2",  #
    # "chinese_alpaca_lora_7b",  # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data.
    # "belle_llama_ext_7b",
    # "alpaca_7b",
    "baichuan",
    "baichuan2",
    "qwen",
    "internlm_chat_7b",
    "falcon_180b",
    # "goat",
]

class TokenizerType(Enum):
    """
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
      - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
      - BPE = 2;      // Byte Pair Encoding
      - WORD = 3;     // Delimitered by whitespace.
      - CHAR = 4;     // tokenizes into character sequence
    """
    BPE = auto()
    ByteBPE = auto()  # BBPE  Byte-Level BPE
    GPT2BPETokenizer = auto()  #
    BERTTokenizer = auto()
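

# A hedged sketch of reading the sentencepiece model_type referenced in the
# docstring above straight from a tokenizer.model file. It assumes the
# `sentencepiece` package ships its generated protobuf module
# (sentencepiece_model_pb2); the function name and argument are illustrative,
# not part of this project's API.
def sp_model_type(model_file):
    """Return the trainer model_type name (UNIGRAM/BPE/WORD/CHAR) of a sentencepiece model file."""
    from sentencepiece import sentencepiece_model_pb2 as sp_pb2
    proto = sp_pb2.ModelProto()
    with open(model_file, "rb") as f:
        proto.ParseFromString(f.read())
    # TrainerSpec.ModelType is the enum from sentencepiece_model.proto (UNIGRAM=1, BPE=2, WORD=3, CHAR=4)
    return sp_pb2.TrainerSpec.ModelType.Name(proto.trainer_spec.model_type)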


# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # depends on a single txt file
#
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # depends on a single json file, Tokenizer.from_file(vocab_file)
#     # example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
#     # examples:
#     SentencePieceTokenizer = auto()
#
#
#     # depends on 3 files: vocab.json, merges.txt, special_tokens.txt
#     # source:
#     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()


class TokenizerImpl(Enum):
    """
    Concrete tokenizer implementations (backends) used by the tokenizers in this project.
    """
    SentencePiece = auto()  #

    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # vocab construction:
    #
    GPT2Tokenizer = auto()
    BertTokenizer = auto()  #
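

# A rough heuristic sketch, based on the characteristics listed in the module
# docstring, for guessing which implementation backs a loaded tokenizer.
# The attribute checks are assumptions about typical HuggingFace-style wrappers,
# not a guaranteed classification.
def guess_impl(tokenizer):
    if hasattr(tokenizer, "sp_model"):  # a SentencePieceProcessor under the hood (llama, baichuan, ...)
        return TokenizerImpl.SentencePiece
    vocab = tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else {}
    if any(token.startswith("\u0120") for token in vocab):  # Ġ prefix marks byte-level BPE vocabs
        return TokenizerImpl.GPT2Tokenizer
    return None  # unknown / not covered by these two checks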


def load_tokener(model_name):
    """Dynamically import vocab.<model_name> and return its `tokenizer` attribute."""
    tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
    return tokenizer
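

# Example usage sketch: iterate the registry above and print each tokenizer's
# vocab size. It assumes every name in `all_tokenizers` has a module under the
# `vocab` package exposing a `tokenizer` attribute, as load_tokener expects.
def print_vocab_sizes(names=all_tokenizers):
    for name in names:
        tokenizer = load_tokener(name)
        size = tokenizer.vocab_size if hasattr(tokenizer, "vocab_size") else len(tokenizer.get_vocab())
        print(name, size)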


if __name__ == "__main__":
    pass