In [None]:
import pyarrow.parquet as pq 
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from rich import progress

#### 使用训练集数据训练 tokenizer(内存小于 16GB 的机器容易 OOM)

In [None]:
# Location of the shuffled training dataset (parquet file).
pq_file = '../data/my_dataset.shuffle.parquet'
# Read the parquet file into `pf`; its 'prompt' and 'response' columns and
# `num_rows` are used by get_training_corpus() below.
pf = pq.read_table(pq_file)

def get_training_corpus(batch_size: int = 1000):
    """Yield the dataset as batches of training texts.

    Every row of the module-level table ``pf`` becomes one text sample: the
    row's prompt and response joined by a single newline character.

    Args:
        batch_size: Number of samples collected before a batch is yielded.
            Expected to be a positive integer. Defaults to 1000, the value
            that was previously hard-coded.

    Yields:
        list[str]: A batch of ``batch_size`` samples. The final batch holds
        whatever is left over and may be shorter; an empty batch is never
        yielded.
    """
    batch = []
    rows = zip(pf['prompt'], pf['response'])
    # `total` is passed explicitly because zip() has no length for the
    # progress bar to read.
    for prompt, response in progress.track(rows, total=pf.num_rows):
        batch.append(f"{prompt.as_py()}\n{response.as_py()}")

        if len(batch) >= batch_size:
            yield batch
            batch = []

    # Emit the remaining samples that did not fill a whole batch.
    if batch:
        yield batch


# A generator can be consumed only once; call get_training_corpus() again
# whenever a fresh pass over the corpus is needed.
iter_training_corpus = get_training_corpus()

## step 1: 加载T5模型自带的tokenizer

In [None]:
# Load the pretrained 't5-base' tokenizer; step 4 calls
# train_new_from_iterator() on it to train the new tokenizer.
old_tokenizer = AutoTokenizer.from_pretrained('t5-base')

## step 2: 加载Wiki中文语料,1.6GB
备注: 全量预训练语料文本大小约7GB

In [None]:
# Load the whole wiki corpus into memory: one list entry per line of the file.
lines = []
with open('../data/raw_data/wiki.simple.txt', 'r', encoding='utf-8') as wiki_file:
    lines = list(wiki_file)

In [None]:
# Display how many lines were read from the wiki corpus file.
len(lines)

## step 3: 定义一个语料的迭代生成器
文本块(段落)按行累积到至少2048个字符后输出,迭代一次返回1000个文本块(最后一次迭代可能不足1000个)

In [None]:
def get_training_corpus(min_block_chars: int = 2048, batch_size: int = 1000):
    """Yield the wiki corpus as batches of text blocks.

    Consecutive entries of the module-level ``lines`` list are concatenated
    into one text block until the block holds at least ``min_block_chars``
    characters; blocks are then grouped into batches.

    Args:
        min_block_chars: Number of characters a block must reach before it is
            closed. Defaults to 2048.
        batch_size: Number of blocks collected before a batch is yielded.
            Expected to be a positive integer. Defaults to 1000.

    Yields:
        list[str]: A batch of ``batch_size`` text blocks. The final batch may
        be shorter, and its last block may hold fewer than
        ``min_block_chars`` characters, because the lines left over at the end
        of the corpus are flushed instead of being dropped.
    """
    batch = []
    pending = []        # lines of the block currently being assembled
    pending_chars = 0   # total character count of `pending`

    for line in progress.track(lines):
        pending.append(line)
        pending_chars += len(line)

        if pending_chars >= min_block_chars:
            batch.append(''.join(pending))
            pending = []
            pending_chars = 0

        if len(batch) >= batch_size:
            yield batch
            batch = []

    # Flush the trailing lines that never reached min_block_chars; without
    # this the end of the corpus (or a whole short corpus) is silently lost.
    if pending:
        batch.append(''.join(pending))

    # Yield the last, possibly partial, batch.
    if batch:
        yield batch


# A generator can be consumed only once; call get_training_corpus() again
# whenever a fresh pass over the corpus is needed.
iter_training_corpus = get_training_corpus()

In [None]:
# Peek at the first batch only: print how many text blocks it holds and the
# character length of each of its first 20 blocks.
for batch in get_training_corpus():
    print(len(batch))
    print([len(block) for block in batch[:20]])
    break
# Example output:
# 1000
# [2104, 2053, 2176, 2224, 2172, 2068, 2054, 2258, 2058, 2085, 2142, 2274, 2184, 2246, 2144, 2223, 2075, 2058, 2164, 2178]

## step 4: 训练tokenizer

In [None]:
# Train a new tokenizer with a 40960-entry vocabulary from the corpus iterator,
# starting from old_tokenizer.
# NOTE: iter_training_corpus is a generator, so it is exhausted once training
# has read it; create a new one with get_training_corpus() before training again.
tokenizer = old_tokenizer.train_new_from_iterator(iter_training_corpus, vocab_size=40960)

# CPU-bound task: takes roughly one hour on a 13600K, with peak memory usage of about 20 GB.

## step 5: 保存训练好的tokenizer

In [None]:
# Save the trained tokenizer to ../model_save/my_tokenizer_wiki.
tokenizer.save_pretrained('../model_save/my_tokenizer_wiki')

# 补充内容: 自定义模型及特殊字符训练

In [None]:
from transformers import PreTrainedTokenizerFast
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Digits, ByteLevel, Metaspace
from tokenizers.normalizers import NFKC
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
import tokenizers

### 字符级别的 BPE tokenizer

In [None]:
# Character-level BPE model that uses "[UNK]" as its unknown token.
model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(model)

# NFKC compatibility normalization merges equivalent Unicode forms,
# e.g. a full-width "A" is converted to a half-width "A".
tokenizer.normalizer = tokenizers.normalizers.Sequence([NFKC()])

# Pre-split on punctuation, individual digits and Metaspace
# (without Metaspace the decoded text has no spaces).
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
 [Punctuation(), Digits(individual_digits=True), Metaspace()])

# Register the special tokens on the tokenizer.
tokenizer.add_special_tokens(["[PAD]","[EOS]","[SEP]","[BOS]", "[CLS]", "[MASK]", "[UNK]"])
# Decoder counterpart of the Metaspace pre-tokenizer used above.
tokenizer.decoder = decoders.Metaspace()

### 字节级别(ByteLevel) BPE tokenizer

In [None]:
# Byte-level BPE does not need an unk_token.
model = BPE()
tokenizer = Tokenizer(model)

# Byte-level pre-tokenization, configured not to add a prefix space.
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)

# Register the special tokens on the tokenizer.
tokenizer.add_special_tokens(["[PAD]","[EOS]","[SEP]","[BOS]", "[CLS]", "[MASK]", "[UNK]"])
# Byte-level decoder and post-processor to match the byte-level pre-tokenizer.
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, use_regex=True)
tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)

In [None]:
# PreTrainedTokenizerFast cannot infer from the raw tokenizer object which
# token is the mask token, the [CLS] token, etc., so they are named explicitly.
# The old_tokenizer created earlier via from_pretrained('t5-base') already
# carries its special tokens and does not need this mapping.
# From this point on the flow is the same as step 4 above.
old_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    bos_token='[BOS]',
    eos_token='[EOS]',
)

# Build a fresh corpus generator before training. A generator can be consumed
# only once, and the one created earlier is already exhausted if step 4 has
# been run, which would leave this training run with no text at all.
iter_training_corpus = get_training_corpus()
tokenizer = old_tokenizer.train_new_from_iterator(iter_training_corpus, vocab_size=40960)

In [None]:
# add \t \n if char level tokenizer
# if '\t' not in tokenizer.vocab:
#     tokenizer.add_tokens(['\t'])
# if '\n' not in tokenizer.vocab:
#     tokenizer.add_tokens(['\n'])

In [None]:
# Save the custom-trained tokenizer. Note: this is the same output directory
# that step 5 writes to.
tokenizer.save_pretrained('../model_save/my_tokenizer_wiki')

In [None]:
# Sample sentence mixing Chinese and English, used to inspect the trained tokenizer.
txt = '这是一段中英混输的句子, (chinese and English, here are words.)'
# tokenize
tokens = tokenizer.tokenize(txt)
print(tokens)
# Char-level tokenizer output:
# ['▁这是', '一段', '中英', '混', '输', '的', '句子', '▁,', '▁(', '▁ch', 'inese', '▁and', '▁Eng', 'lish', '▁,', '▁h', 'ere', '▁', 'are', '▁w', 'ord', 's', '▁.', '▁)']

# Byte-level tokenizer output:
# ['Ġè¿Ļæĺ¯', 'ä¸Ģ段', 'ä¸Ńèĭ±', 'æ··', 'è¾ĵ', 'çļĦ', 'åı¥åŃIJ', 'Ġ,', 'Ġ(', 'Ġch', 'inese', 'Ġand', 'ĠEng', 'lish', 'Ġ,', 'Ġh', 'ere', 'Ġare', 'Ġw', 'ord', 's', 'Ġ.', 'Ġ)']

# decode: encode the text to ids, then turn the ids back into text
ids = tokenizer.encode(txt)
tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)