|
# Earlier attempt (disabled): train a BPE tokenizer from scratch with the Hugging Face
# tokenizers library on wiki_mrph.txt (whitespace-separated morphemes from Juman++).
'''
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Byte-pair-encoding model with an explicit unknown token.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
# The corpus is already segmented, so splitting on whitespace is enough.
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=50000,
    min_frequency=1,
    special_tokens=["<unk>", "<s>", "</s>"],
    limit_alphabet=8000,  # large base alphabet to cover kanji
)

files = ["wiki_mrph.txt"]
tokenizer.train(files, trainer)
tokenizer.save("juman-bpe-wiki.json")
'''
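
# Usage sketch (also disabled): how the JSON tokenizer saved above could be loaded back and
# used to encode a pre-segmented sentence; the sample sentence here is illustrative only.
'''
from tokenizers import Tokenizer

loaded = Tokenizer.from_file("juman-bpe-wiki.json")
encoding = loaded.encode("これ は テスト です")  # whitespace-separated morphemes
print(encoding.tokens)
print(encoding.ids)
'''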
|
|
|
# Second attempt (disabled): retrain the GPT-2 tokenizer on a new corpus with
# train_new_from_iterator, keeping GPT-2's byte-level base alphabet, then push it to the Hub.
'''
from transformers import AutoTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
from datasets import load_dataset, DownloadConfig

# GPT-2's byte-level BPE maps each of the 256 byte values to a printable unicode
# character; pass those characters as the initial alphabet so no byte is ever unknown.
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())
base_vocab = list(unicode_to_byte_map.keys())

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# dataset_name and download_config are assumed to be defined elsewhere before this runs.
dataset = load_dataset(dataset_name, split="train", download_config=download_config)

def batch_iterator(batch_size=10):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["content"]  # assumes a "content" text column

new_tokenizer_larger = tokenizer.train_new_from_iterator(batch_iterator(),
                                                         vocab_size=32768,
                                                         initial_alphabet=base_vocab)

# Saving a custom tokenizer on the Hub
model_ckpt = "test-001"
#org = "transformersbook"
org = ""
#new_tokenizer_larger.push_to_hub(model_ckpt, organization=org)
new_tokenizer_larger.push_to_hub(model_ckpt)
'''
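
# Sanity-check sketch (disabled; assumes the block above has run): compare how the original
# GPT-2 tokenizer and the retrained one split the same sample string.
'''
sample = "def batch_iterator(batch_size=10):"
print(tokenizer.tokenize(sample))
print(new_tokenizer_larger.tokenize(sample))
'''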
|
|
|
# Active path: retrain the GPT-2 tokenizer on wiki_mrph.txt (morpheme-segmented Wikipedia
# text, one example per line) and save the result locally.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
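
# Illustration (sketch): the stock GPT-2 tokenizer has no Japanese-specific merges, so even a
# short Japanese string falls apart into many byte-level pieces; retraining below addresses that.
sample = "これ は テスト です"  # illustrative only
print(tokenizer.tokenize(sample))
print(len(tokenizer.tokenize(sample)))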
|
# Load the plain-text corpus as a datasets "text" dataset (one example per line).
dataset = load_dataset('text', data_files={'train': ["wiki_mrph.txt"]})
print(dataset)

length = len(dataset['train'])  # previously hard-coded as 29751517

def batch_iterator(batch_size=10):
    # Stream the corpus in small batches so the whole file never has to sit in memory at once.
    for i in range(0, length, batch_size):
        yield dataset['train'][i : i + batch_size]['text']

# Learn a new 32,768-entry vocabulary, reusing GPT-2's tokenization pipeline and settings.
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(),
                                                  vocab_size=32768)

new_tokenizer.save_pretrained('new_tokenizer')
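
# Quick check (sketch): reload the saved tokenizer and tokenize a sample line; the sample
# string below is illustrative, not taken from the corpus.
reloaded = AutoTokenizer.from_pretrained('new_tokenizer')
sample = "これ は 形態素 解析 済み の 文 です"
print(reloaded.tokenize(sample))
print(reloaded(sample)['input_ids'])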
|