# NOTE(review): removed three stray non-Python lines ("Spaces:", "Running",
# "Running") left over from a copy/paste; as bare statements they would have
# raised NameError at import time.
import importlib
from enum import Enum, auto
"""Interface: | |
tokenizer.encode | |
tokenizer.decode | |
tokenizer.convert_ids_to_tokens | |
tokenizer.parent = "" | |
tokenizer.vocab_size | |
tokenizer.get_vocab() # gpt-neox-20b, llama | |
tokenizer.type = TokenizerType.ByteBPE.name | |
tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py | |
- bert | |
- 特征 | |
- 示例: | |
- gpt2 | |
- 特征: | |
- sentencepiece: | |
- 特征:.sp_model 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,词典字符有 ▁, | |
- 示例:llama,baichuan | |
- tiktoken | |
- icetk | |
- hf_tokenizer | |
- 特征:.model 是 tokenizer.models.BPE 类型,词典有 Ġ "\u0120" 开头,有 merge.txt | |
- 示例:gpt_neox_20b, moss | |
- gpt3.5 gpt4 | |
- 特征:tiktoken | |
tokenizer.comments = "split all numbers into individual digits, " \ | |
"and fallback to bytes to decompose unknown UTF-8 characters" | |
tokenizer.all_special_tokens # baichuan | |
tokenizer.special_tokens_set # gpt3.5_turbo | |
tokenizer.special_tokens_map | |
tokenizer.dependency [sentencepiece, tiktoken, icetk] | |
""" | |
# NOTE(review): leftover example enum from prototyping; kept because other
# modules might import it — TODO confirm it is unused and remove.
Animal = Enum('Animal', 'ANT BEE CAT DOG')

# Tokenizers kept out of the main registry (placeholder, currently empty name).
uniq_tokenizers = [
    ""
]

# Registry of tokenizer module names; each entry is importable as
# ``vocab.<name>`` via load_tokener(). Commented entries are known but
# currently disabled.
all_tokenizers = [
    "gpt_35_turbo",
    "gpt4",
    "gpt2",
    "gpt2_chinese",
    "bert_base_cased",
    "bert_base_uncased",
    "bert_base_chinese",
    "kplug",
    "moss",
    #
    # ######
    # "chatyuan_large_v2",
    # "prompt_clue",
    #
    # #### BLOOM family
    "bloom",
    # "bloomz_6b4_zh",
    # "belle_7b_2m",  # model and vocab both based on bloom
    #
    "gpt_nexo_20b",
    # "gpt_neox_chinese_v1",
    #
    # ##### GLM family
    # "glm_chinese",
    "chatglm_6b",
    "chatglm2-6b",
    #
    # #### LLaMA / Alpaca family
    "llama",  # Chinese single-char tokens: 700, multi-char Chinese tokens: 0
    "chinese_llama_lora_7b",  #
    # "chinese_alpaca_lora_7b",  # Chinese Alpaca: further instruction-tuned on top of the Chinese LLaMA model above.
    # "belle_llama_ext_7b",
    # "alpaca_7b",
    "baichuan_7b",
    "qwen",
    "internlm_chat_7b",
    "goat",
]
class TokenizerType(Enum):
    """Coarse classification of the tokenization *algorithm* a model uses.

    References:
      - https://huggingface.co/docs/transformers/tokenizer_summary
      - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
      - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
        - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
        - BPE = 2;      // Byte Pair Encoding
        - WORD = 3;     // Delimitered by whitespace.
        - CHAR = 4;     // tokenizes into character sequence
    """
    BPE = auto()
    ByteBPE = auto()  # BBPE, Byte-Level BPE
    GPT2BPETokenizer = auto()
    BERTTokenizer = auto()
# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # depends on a single txt vocab file
#
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # depends on a single json file: Tokenizer.from_file(vocab_file)
#     # example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
#     # example:
#     SentencePieceTokenizer = auto()
#
#
#     # depends on 3 files: vocab.json, merges.txt, special_tokens.txt
#     # source:
#     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()
class TokenizerImpl(Enum):
    """Concrete library/implementation backing a model's tokenizer
    (as opposed to :class:`TokenizerType`, which names the algorithm).
    """
    # e.g. llama, baichuan — .sp_model is a SentencePieceProcessor (see module notes above)
    SentencePiece = auto()

    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # vocab construction: TODO document
    GPT2Tokenizer = auto()
    BertTokenizer = auto()
def load_tokener(model_name):
    """Dynamically import ``vocab.<model_name>`` and return its ``tokenizer``.

    NOTE(review): the name looks like a typo for ``load_tokenizer``; kept
    unchanged for backward compatibility with existing callers.

    :param model_name: a module name from ``all_tokenizers``
    :return: the ``tokenizer`` attribute of the imported module
    :raises ModuleNotFoundError: if ``vocab.<model_name>`` cannot be imported
    """
    module = importlib.import_module("." + model_name, 'vocab')
    return module.tokenizer
if __name__ == "__main__": | |
pass | |