# chatmlTest/train_tokenizer.py
import os
import pandas as pd
import sentencepiece as spm
import tokenizers
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Punctuation, Digits, Metaspace
from tokenizers.normalizers import NFKC
from transformers import PreTrainedTokenizerFast
from config import PROJECT_ROOT


def check_dir_exists(dir_path: str) -> None:
    '''
    Check whether the directory exists; create it if it does not.
    '''
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)


def train_my_huggingface_wiki_tokenizer(corpus_file: str, max_train_line: int = None, vocab_size: int = 40960, token_type: str = 'char') -> None:
    '''
    Train a tokenizer on the wiki corpus with Hugging Face `tokenizers`.
    Requires at least 32 GB of RAM and takes roughly half an hour to run.
    '''
    tokenizer_slow_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer_slow/hf_bpe_tokenizer.json'
    tokenizer_fast_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer'

    check_dir_exists(PROJECT_ROOT + '/model_save/hf_tokenizer_slow')
    check_dir_exists(tokenizer_fast_save_path)

    def get_training_corpus(buffer_size: int = 1000, chunk_len: int = 2048) -> list:
        '''
        Yield batches of text chunks; each chunk holds roughly `chunk_len` (2048) characters.
        '''
        line_cnt = 0
        buffer = []
        with open(corpus_file, 'r', encoding='utf-8') as f_read:
            cur_chunk_txt, txt_len = [], 0
            for line in f_read:
                cur_chunk_txt.append(line)
                txt_len += len(line)
                line_cnt += 1

                if txt_len >= chunk_len:
                    buffer.append(
                        ''.join(cur_chunk_txt)
                    )
                    cur_chunk_txt, txt_len = [], 0

                if len(buffer) >= buffer_size:
                    yield buffer
                    buffer = []

                if isinstance(max_train_line, int) and line_cnt > max_train_line:
                    break

            # yield the last, possibly smaller, buffer
            if len(buffer) > 0:
                yield buffer

    special_tokens = ["[PAD]", "[EOS]", "[SEP]", "[BOS]", "[CLS]", "[MASK]", "[UNK]"]

    if token_type == 'char':
        model = BPE(unk_token="[UNK]")
        tokenizer = Tokenizer(model)

        # NFKC normalization merges compatibility-equivalent Unicode code points,
        # e.g. a full-width 'A' becomes a half-width 'A'.
        tokenizer.normalizer = tokenizers.normalizers.Sequence([NFKC()])

        # Pre-split on punctuation, individual digits, and Metaspace
        # (without Metaspace the decoded text has no spaces).
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
            [Punctuation(), Digits(individual_digits=True), Metaspace()]
        )
        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.Metaspace()
    elif token_type == 'byte':
        # Byte-level BPE does not need an unk_token.
        model = BPE()
        tokenizer = Tokenizer(model)
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)
    else:
        raise Exception(f'token type must be `char` or `byte`, but got {token_type}')

    trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=100, show_progress=True, special_tokens=special_tokens)
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # make sure \t and \n are in the vocabulary
    if '\t' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\t'])
    if '\n' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\n'])

    tokenizer.save(tokenizer_slow_save_path)

    # Convert the trained tokenizer to a PreTrainedTokenizerFast and save it,
    # so it can be passed to other Hugging Face components via `AutoTokenizer`.
    # The special tokens (`pad_token`, `eos_token`, etc.) must be specified manually,
    # because the conversion cannot know which tokens in the trained tokenizer play these roles.
    slow_tokenizer = tokenizer
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=slow_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        bos_token='[BOS]',
        eos_token='[EOS]',
    )
    fast_tokenizer.save_pretrained(tokenizer_fast_save_path)

    print(f'slow tokenizer saved in path: {tokenizer_slow_save_path}')
    print(f'fast tokenizer saved in path: {tokenizer_fast_save_path}')
    print(f"\ntrain tokenizer finished. you can use `AutoTokenizer.from_pretrained('{tokenizer_fast_save_path}')` to load and test your tokenizer.")


def train_my_BPE_tokenizer() -> None:
    '''
    Train a BPE tokenizer with sentencepiece. Drawback: it can only load about
    3 million lines; with 16 GB of RAM it runs out of memory.
    '''
    txt_corpus_file = PROJECT_ROOT + '/data/my_corpus.txt'
    special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

    spm.SentencePieceTrainer.train(
        input=txt_corpus_file,
        model_prefix='my_tokenizer',
        vocab_size=40960,
        user_defined_symbols=special_tokens,
        max_sentence_length=1024,
        shuffle_input_sentence=True,
        # character_coverage=1.0,
        model_type='bpe',
    )
    # Model files are saved under the `my_tokenizer` prefix.
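

# A minimal sketch (not part of the original script): the helper name
# `check_my_BPE_tokenizer` and the sample sentence are illustrative assumptions.
# It loads the sentencepiece model produced by `train_my_BPE_tokenizer` above
# and prints the BPE pieces of a sample sentence.
def check_my_BPE_tokenizer(model_file: str = 'my_tokenizer.model') -> None:
    sp = spm.SentencePieceProcessor(model_file=model_file)
    print(sp.encode('这是一个测试句子。', out_type=str))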


if __name__ == '__main__':
    corpus_file = PROJECT_ROOT + '/data/wiki.simple.txt'

    # token_type must be 'char' or 'byte'
    train_my_huggingface_wiki_tokenizer(corpus_file=corpus_file, token_type='char')
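
    # A minimal sanity check (not part of the original flow): reload the fast
    # tokenizer that was just saved, as suggested by the print message above.
    # The sample sentence is an illustrative assumption.
    from transformers import AutoTokenizer
    loaded_tokenizer = AutoTokenizer.from_pretrained(PROJECT_ROOT + '/model_save/hf_tokenizer')
    sample_text = '这是一个测试句子。This is a test sentence.'
    print(loaded_tokenizer.tokenize(sample_text))
    print(loaded_tokenizer(sample_text)['input_ids'])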