import importlib
from enum import Enum, auto


"""
Interface:
- 

tokenizer.parent = ""
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name   # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
tokenizer.comments = "split all numbers into individual digits, " \
                     "and fallback to bytes to decompose unknown UTF-8 characters"
"""

Animal = Enum('Animal', 'ANT BEE CAT DOG')  # unused example of the Enum functional API

uniq_tokenizers = [
    ""  # placeholder entry; no tokenizer names registered here yet
]

all_tokenizers = [
    "gpt_35_turbo",
    "gpt2",
    "gpt2_chinese",
    "bert_chinese",
    "moss",
    #
    # ######
    # "chatyuan_large_v2",
    # "prompt_clue",
    #
    # #### bloom family
    # "bloom",
    # "bloomz_6b4_zh",
    # "belle_7b_2m",   # both the model and the vocab are based on bloom
    #
    "gpt_nexo_20b",
    # "gpt_neox_chinese_v1",
    #
    # ##### glm family
    # "glm_chinese",
    "chatglm_6b",
    #
    # #### llama / alpaca family
    "llama",  # 'single Chinese characters': 700, 'multi-character Chinese words': 0
    "chinese_llama_lora_7b",  #
    # "chinese_alpaca_lora_7b",  # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data
    # "belle_llama_ext_7b",
    # "alpaca_7b",
    "baichuan_7b",
    "qwen",
]



class TokenizerType(Enum):
    """
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
      - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
      - BPE = 2;      // Byte Pair Encoding
      - WORD = 3;     // Delimitered by whitespace.
      - CHAR = 4;     // tokenizes into character sequence
    """
    BPE = auto()
    ByteBPE = auto()  # BBPE, Byte-Level BPE
    GPT2BPETokenizer = auto()  # tokenizer class name used in Megatron-LM / gpt-neox (byte-level BPE)
    BERTTokenizer = auto()  # WordPiece vocab
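

# Hedged illustration (not used elsewhere in this module): how a SentencePiece
# model trained with byte fallback decomposes a character missing from its
# vocabulary into <0xXX> byte pieces, as described in the module docstring.
# The model path and sample text are placeholders, and the sentencepiece
# dependency is an assumption.
def _illustrate_byte_fallback(model_file="tokenizer.model", text="鬱"):
    import sentencepiece as spm  # imported lazily; optional dependency
    sp = spm.SentencePieceProcessor(model_file=model_file)
    # With byte fallback enabled, an unseen character comes back as byte
    # tokens such as ['<0xE9>', '<0xAC>', '<0xB1>'] instead of a single <unk>.
    return sp.encode(text, out_type=str)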


# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # depends on a single txt vocab file
#
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # depends on a single json file, Tokenizer.from_file(vocab_file)
#     # example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
#     # example:
#     SentencePieceTokenizer = auto()
#
#
#     # depends on 3 files: vocab.json, merges.txt, special_tokens.txt
#     # source:
#     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()

class TokenizerImpl(Enum):
    """
    Concrete tokenizer implementations backing the types above.
    """
    SentencePiece = auto()  # e.g. the LLaMA tokenizer linked in the module docstring

    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # vocab construction: byte-level BPE over a unicode-mapped byte alphabet
    # (see _bytes_to_unicode_sketch below)
    GPT2Tokenizer = auto()
    BertTokenizer = auto()  # WordPiece vocab
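

# Hedged sketch of the byte-to-unicode mapping behind GPT-2 style byte-level
# BPE vocab construction (see the tokenization_gpt2.py link above). This is a
# simplified reimplementation for illustration only, not code this project
# relies on.
def _bytes_to_unicode_sketch():
    # Keep printable bytes as themselves; remap the remaining bytes onto
    # unused code points so that every one of the 256 byte values has a
    # visible, reversible character form for BPE merges to operate on.
    bs = (list(range(ord("!"), ord("~") + 1))
          + list(range(ord("¡"), ord("¬") + 1))
          + list(range(ord("®"), ord("ÿ") + 1)))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))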



def load_tokener(model_name):
    """Dynamically import vocab.<model_name> and return its `tokenizer` object."""
    tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
    return tokenizer

if __name__ == "__main__":
    pass
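
    # Hedged usage sketch: try to load a couple of registered tokenizers and
    # print whatever vocab-size information they expose. Assumes each
    # vocab.<name> module follows the interface in the module docstring and
    # wraps a Hugging Face-style tokenizer; neither assumption is guaranteed,
    # so failures are reported rather than raised.
    for name in all_tokenizers[:2]:
        try:
            tok = load_tokener(name)
            print(name, getattr(tok, "vocab_size", "vocab_size not exposed"))
        except Exception as exc:  # missing optional dependencies, files, etc.
            print(f"failed to load {name}: {exc}")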