import importlib
from enum import Enum, auto
"""Interface:
tokenizer.encode
tokenizer.decode
tokenizer.convert_ids_to_tokens
tokenizer.parent = ""
tokenizer.vocab_size
tokenizer.get_vocab() # gpt-neox-20b, llama
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
"HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
- bert
- 特征
- 示例:
- gpt2
- 特征:
- sentencepiece:
- 特征:.sp_model 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,词典字符有 ▁,
- 示例:llama,baichuan
- tiktoken
- icetk
- hf_tokenizer
- 特征:.model 是 tokenizer.models.BPE 类型,词典有 Ġ "\u0120" 开头,有1个tokenizer.json(包括 merge vocab),或者分开独立文件
- 示例:gpt_neox_20b, moss
- tiktoken
- 特征:空格就是空格,
- 示例:gpt3.5 gpt4
tokenizer.comments = "split all numbers into individual digits, " \
"and fallback to bytes to decompose unknown UTF-8 characters"
tokenizer.all_special_tokens # baichuan
tokenizer.special_tokens_set # gpt3.5_turbo
tokenizer.special_tokens_map
tokenizer.dependency [sentencepiece, tiktoken, icetk]
"""
uniq_tokenizers = [
""
]
all_tokenizers = [
"gpt_35_turbo",
"gpt_4",
"gpt2",
"gpt2_chinese",
"bert_base_cased",
"bert_base_uncased",
"bert_base_chinese",
"kplug",
"moss",
#
# ######
# "chatyuan_large_v2",
# "prompt_clue",
#
    # #### bloom family
    "bloom",
    # "bloomz_6b4_zh",
    # "belle_7b_2m",  # both the model and the vocab are based on bloom
#
"gpt_nexo_20b",
# "gpt_neox_chinese_v1",
#
    # ##### glm family
# "glm_chinese",
"chatglm_6b",
"chatglm2-6b",
#
    # #### llama / alpaca family
    "llama",  # Chinese single-character tokens: 700, Chinese multi-character tokens: 0
    "chinese_llama",
    "chinese_llama2",
    # "chinese_alpaca_lora_7b",  # the Chinese Alpaca model is further fine-tuned with instruction data on top of the Chinese LLaMA model above
# "belle_llama_ext_7b",
# "alpaca_7b",
"baichuan_7b",
"qwen",
"internlm_chat_7b",
"goat",
]
class TokenizerType(Enum):
"""
- https://huggingface.co/docs/transformers/tokenizer_summary
- https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
- https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
- UNIGRAM = 1; // Unigram language model with dynamic algorithm
- BPE = 2; // Byte Pair Encoding
- WORD = 3; // Delimitered by whitespace.
- CHAR = 4; // tokenizes into character sequence
"""
BPE = auto()
ByteBPE = auto() # BBPE Byte-Level BPE
GPT2BPETokenizer = auto() #
BERTTokenizer = auto()
# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # depends on a single txt vocab file
#
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # depends on a single json file, loaded via Tokenizer.from_file(vocab_file)
#     # example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # depends on: model_file, loaded via sentencepiece.SentencePieceProcessor(model_file)
#     # examples:
#     SentencePieceTokenizer = auto()
#
#
#     # depends on three files: vocab.json, merges.txt, special_tokens.txt
#     # source:
#     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()
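
# A minimal sketch (assumptions, not from the original module) of the three loading
# paths described in the commented-out enum above. The file-path arguments are
# placeholders supplied by the caller; only the library entry points themselves
# (tokenizers.Tokenizer.from_file, sentencepiece.SentencePieceProcessor,
# transformers.GPT2Tokenizer) are real APIs.
def load_hf_tokenizer(vocab_file):
    # e.g. gpt-neox-20B: one tokenizer.json read by the `tokenizers` library
    from tokenizers import Tokenizer
    return Tokenizer.from_file(vocab_file)

def load_sentencepiece_tokenizer(model_file):
    # e.g. llama, baichuan: a single .model file read by sentencepiece
    import sentencepiece as spm
    return spm.SentencePieceProcessor(model_file=model_file)

def load_gpt2_bpe_tokenizer(vocab_file, merges_file):
    # GPT-2 style byte-level BPE: vocab.json + merges.txt
    from transformers import GPT2Tokenizer
    return GPT2Tokenizer(vocab_file=vocab_file, merges_file=merges_file)
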
class TokenizerImpl(Enum):
"""
"""
    SentencePiece = auto()
    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # vocab construction:
    #
    GPT2Tokenizer = auto()
    BertTokenizer = auto()
def load_tokener(model_name):
    """Dynamically import vocab.<model_name> and return the `tokenizer` it defines."""
    tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
    return tokenizer
if __name__ == "__main__":
pass
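    # A minimal usage sketch (an assumption, not part of the original file):
    # load one of the registered tokenizers and print its vocab size. This
    # relies on each vocab.<model_name> submodule exposing a `tokenizer`
    # attribute, which is what load_tokener above expects.
    demo_tokenizer = load_tokener("gpt2")
    print("gpt2 vocab_size:", demo_tokenizer.vocab_size)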