from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    Regex,
)
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizerBase
from tqdm import tqdm
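
# Load the parquet shards of the bilingual zh/en corpus, then shuffle once so
# the train/validation slices below come from a mixed distribution.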
dataset = load_dataset(
    "parquet", data_dir="Mxode/IndustryCorpus-Subset-zh-en", split="train")
dataset = dataset.shuffle(seed=3407)
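
# First 1M rows for training, last 10K for validation; char_len is the total
# character count of the validation texts, used to score compression later.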
ds = dataset[:1000000]
ds_val = dataset[-10000:]
char_len = sum(len(x) for x in ds_val['text'])


def get_training_corpus():
    # Slicing a Dataset returns a plain dict of columns, so len(ds) counts
    # columns, not rows; iterate over the length of the "text" list instead.
    for i in range(0, len(ds["text"]), 1000):
        yield ds["text"][i: i + 1000]


def train():
    tokenizer = Tokenizer(models.BPE())
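    # NFC-normalize so canonically equivalent Unicode sequences (common in
    # Chinese and accented text) map to the same bytes before BPE.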
    tokenizer.normalizer = normalizers.NFC()
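    # Split pattern resembling GPT-4's cl100k regex: English contractions,
    # letter runs (with one optional leading non-letter byte), single digits,
    # punctuation runs, and whitespace; digits are split one at a time here.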
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Split(
            pattern=Regex(
                "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"),
            behavior="isolated",
            invert=False,
        ),
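        # ByteLevel here only remaps raw bytes to printable symbols;
        # use_regex=False because the Split above already segments the text.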
        pre_tokenizers.ByteLevel(
            add_prefix_space=False,
            use_regex=False,
            trim_offsets=False
        )
    ])
    trainer = trainers.BpeTrainer(
        vocab_size=16000,
        special_tokens=["<|endoftext|>", "<|im_start|>", "<|im_end|>"],
        # Seed the vocabulary with all 256 byte symbols so the byte-level BPE
        # can encode any input, even bytes that never appear in the corpus.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )
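    # Single streaming pass over the corpus: the trainer gathers word counts
    # from the yielded batches, then learns the BPE merges.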
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    # trim_offsets is the only option the ByteLevel post-processor takes, and
    # the ByteLevel decoder takes none; the extra kwargs passed before were
    # silently ignored.
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
    tokenizer.decoder = decoders.ByteLevel()
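    # Wrap the raw tokenizer so it exposes the standard transformers API
    # (padding, special tokens, save_pretrained, chat templates).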
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token="<|endoftext|>",
        eos_token="<|im_end|>",
        pad_token="<|endoftext|>",
        model_max_length=4096,
        clean_up_tokenization_spaces=False,
        errors="replace",
        split_special_tokens=False,
    )
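    # ChatML-style template: injects a default system prompt when the first
    # message is not a system turn, and can open an assistant turn for
    # generation.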
    wrapped_tokenizer.chat_template = """{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"""
    wrapped_tokenizer.save_pretrained(
        'Mxode/Bilingual-Tokenizer/BilingualTokenizer-16K')
    return wrapped_tokenizer


def evaluate(tokenizer: PreTrainedTokenizerBase):
    def get_compress_len(tokenizer):
        return sum(len(tokenizer(x, return_tensors=None)['input_ids'])
                   for x in tqdm(ds_val['text']))

    # Tokens emitted per 100 characters of validation text; lower means the
    # tokenizer compresses the corpus better.
    compress_len = get_compress_len(tokenizer)
    compression_rate = compress_len / char_len * 100
    # Print the vocab size (left-aligned for easy comparison across runs)
    # next to the compression figure.
    print(f'{len(tokenizer):<40} {compression_rate:.2f}%')


if __name__ == "__main__":
    tokenizer = train()
    evaluate(tokenizer)