import os

import pandas as pd
import sentencepiece as spm
import tokenizers
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Punctuation, Digits, Metaspace
from tokenizers.normalizers import NFKC
from transformers import PreTrainedTokenizerFast

from config import PROJECT_ROOT


def check_dir_exits(dir: str) -> None:
    '''
    Check whether the directory exists; create it if it does not.
    '''
    if not os.path.exists(dir):
        os.makedirs(dir)


def train_my_huggingface_wiki_tokenizer(cropus_file: str, max_train_line: int=None, vocab_size: int=40960, token_type: str='char') -> None:
    '''
    Train a tokenizer with huggingface `tokenizers`; needs at least 32GB of RAM and takes roughly half an hour to run.
    '''
    tokenizer_slow_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer_slow/hf_bpe_tokenizer.json'
    tokenizer_fast_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer'

    check_dir_exits(PROJECT_ROOT + '/model_save/hf_tokenizer_slow')
    check_dir_exits(tokenizer_fast_save_path)

    def get_training_corpus(buffer_size: int=1000, chunk_len: int=2048) -> list:
        '''
        Yield batches of `buffer_size` text chunks, each chunk about `chunk_len` (2048) characters long.
        '''
        line_cnt = 0
        buffer = []
        with open(cropus_file, 'r', encoding='utf-8') as f_read:
            cur_chunk_txt, txt_len = [], 0
            for line in f_read:
                cur_chunk_txt.append(line)
                txt_len += len(line)
                line_cnt += 1

                if txt_len >= chunk_len:
                    buffer.append(
                        ''.join(cur_chunk_txt)
                    )
                    cur_chunk_txt, txt_len = [], 0

                if len(buffer) >= buffer_size:
                    yield buffer
                    buffer = []

                if isinstance(max_train_line, int) and line_cnt > max_train_line:
                    break

            # yield the last (possibly partial) buffer
            if len(buffer) > 0:
                yield buffer

    special_tokens = ["[PAD]", "[EOS]", "[SEP]", "[BOS]", "[CLS]", "[MASK]", "[UNK]"]

    if token_type == 'char':
        model = BPE(unk_token="[UNK]")
        tokenizer = Tokenizer(model)

        # NFKC normalization merges compatibility-equivalent code points,
        # e.g. a full-width 'A' is converted to a half-width 'A'.
        tokenizer.normalizer = tokenizers.normalizers.Sequence([NFKC()])

        # Pre-tokenize on punctuation, individual digits, and Metaspace
        # (without Metaspace, decoded text would come back without spaces).
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
            [Punctuation(), Digits(individual_digits=True), Metaspace()]
        )
        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.Metaspace()
    elif token_type == 'byte':
        # byte-level BPE does not need an unk_token
        model = BPE()
        tokenizer = Tokenizer(model)
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)
    else:
        raise Exception(f'token type must be `char` or `byte`, but got {token_type}')

    trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=100, show_progress=True, special_tokens=special_tokens)
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # make sure '\t' and '\n' are in the vocabulary
    if '\t' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\t'])
    if '\n' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\n'])

    tokenizer.save(tokenizer_slow_save_path)

    # Convert the trained tokenizer to a PreTrainedTokenizerFast and save it,
    # so it can be passed to other `huggingface` components via `AutoTokenizer`.
    # The special tokens (`pad_token`, `eos_token`, ...) must be specified explicitly here,
    # because the conversion cannot know which tokens in the original tokenizer play these roles.
    slow_tokenizer = tokenizer
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=slow_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        bos_token='[BOS]',
        eos_token='[EOS]',
    )
    fast_tokenizer.save_pretrained(tokenizer_fast_save_path)

    print(f'slow tokenizer saved to: {tokenizer_slow_save_path}')
    print(f'fast tokenizer saved to: {tokenizer_fast_save_path}')

    print(f"\ntrain tokenizer finished. you can use `AutoTokenizer.from_pretrained('{tokenizer_fast_save_path}')` to load and test your tokenizer.")


def train_my_BPE_tokenizer() -> None:
    '''
    Train a BPE tokenizer with sentencepiece.
    Drawback: it can only handle about 3 million lines of input; 16GB of RAM will OOM.
    '''
    txt_corpus_file = PROJECT_ROOT + '/data/my_corpus.txt'
    special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

    spm.SentencePieceTrainer.train(
        input=txt_corpus_file,
        model_prefix='my_tokenizer',
        vocab_size=40960,
        user_defined_symbols=special_tokens,
        max_sentence_length=1024,
        shuffle_input_sentence=True,
        # character_coverage=1.0,
        model_type='bpe',
    )

    # The model files are saved with the `my_tokenizer` prefix
    # (my_tokenizer.model and my_tokenizer.vocab in the working directory).
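

# A minimal sketch of loading the sentencepiece model trained above; illustrative only.
# It assumes the `my_tokenizer.model` file written by `train_my_BPE_tokenizer` sits in the
# current working directory and that a recent sentencepiece release is installed.
def check_spm_tokenizer(model_file: str = 'my_tokenizer.model', text: str = 'hello world') -> None:
    '''
    Load the trained sentencepiece model and round-trip a sample sentence.
    '''
    sp = spm.SentencePieceProcessor(model_file=model_file)
    pieces = sp.encode(text, out_type=str)
    ids = sp.encode(text, out_type=int)
    print('pieces :', pieces)
    print('ids    :', ids)
    print('decoded:', sp.decode(ids))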


if __name__ == '__main__':
    cropus_file = PROJECT_ROOT + '/data/wiki.simple.txt'

    train_my_huggingface_wiki_tokenizer(cropus_file=cropus_file, token_type='char')  # token_type must be 'char' or 'byte'
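
    # The byte-level variant could be trained the same way (illustrative, commented out
    # so the default 'char' run above stays unchanged):
    # train_my_huggingface_wiki_tokenizer(cropus_file=cropus_file, token_type='byte')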