# ja-test-001 / train_tokenizer.py
# --- Alternative 1 (disabled): train a standalone BPE tokenizer with the
# `tokenizers` library on the morpheme-segmented corpus wiki_mrph.txt and
# save it as juman-bpe-wiki.json. ---
'''
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=50000,
    min_frequency=1,
    special_tokens=["<unk>", "<s>", "</s>"],
    limit_alphabet=8000,
)

files = ["wiki_mrph.txt"]
tokenizer.train(files, trainer)
tokenizer.save("juman-bpe-wiki.json")
'''
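
# A minimal usage sketch (not part of the original script) for the tokenizer
# saved above, assuming juman-bpe-wiki.json exists:
#
#   from tokenizers import Tokenizer
#   tok = Tokenizer.from_file("juman-bpe-wiki.json")
#   enc = tok.encode("これ は テスト です")  # input is already whitespace-separated morphemes
#   print(enc.tokens, enc.ids)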
# --- Alternative 2 (disabled): retrain the GPT-2 byte-level BPE tokenizer with
# transformers' train_new_from_iterator and optionally push it to the Hub.
# Note: `dataset_name`, `download_config`, and `length` must be defined before
# this block can run (see the sketch after this block). ---
'''
from transformers import AutoTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
from datasets import load_dataset, DownloadConfig

# Keep the full GPT-2 byte-level alphabet in the new vocabulary.
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())
base_vocab = list(unicode_to_byte_map.keys())

tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = load_dataset(dataset_name, split="train", download_config=download_config)

def batch_iterator(batch_size=10):
    for i in range(0, length, batch_size):
        yield dataset[i : i + batch_size]["content"]

new_tokenizer_larger = tokenizer.train_new_from_iterator(batch_iterator(),
                                                         vocab_size=32768,
                                                         initial_alphabet=base_vocab)

# Saving a Custom Tokenizer on the Hub
model_ckpt = "test-001"
#org = "transformersbook"
org = ""
#new_tokenizer_larger.push_to_hub(model_ckpt, organization=org)
new_tokenizer_larger.push_to_hub(model_ckpt)
'''
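
# The disabled block above assumes `dataset_name`, `download_config`, and `length`
# are defined elsewhere. A hypothetical sketch of such definitions (example values
# only, not from the original script):
#
#   from datasets import DownloadConfig
#   download_config = DownloadConfig(resume_download=True)
#   dataset_name = "your/dataset-with-a-content-column"
#   length = len(dataset)  # once the dataset has been loaded above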
# --- Active path: retrain the GPT-2 tokenizer on the local morpheme-segmented
# corpus (wiki_mrph.txt) and save the result locally. ---
from datasets import load_dataset
from transformers import AutoTokenizer
from more_itertools import chunked  # only needed for the commented-out streaming variant below

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Streaming mC4-ja variant (disabled):
#dataset = load_dataset("mc4", "ja", streaming=True, split='train')
#ds_sub = dataset.take(100000)
#corpus = chunked((x['text'] for x in ds_sub), 1000)

dataset = load_dataset('text', data_files={'train': ["wiki_mrph.txt"]})
print(dataset)
#corpus = chunked((x for x in dataset), 1000)
#new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=32768)

#length = 100000
length = 29751517  # presumably the line count of wiki_mrph.txt; len(dataset['train']) would avoid hard-coding it

def batch_iterator(batch_size=10):
    for i in range(0, length, batch_size):
        yield dataset['train'][i : i + batch_size]['text']

new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(),
                                                  vocab_size=32768)
new_tokenizer.save_pretrained('new_tokenizer')
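
# Quick sanity check (assumed usage, not in the original script): reload the saved
# tokenizer and tokenize the first line of the corpus.
reloaded = AutoTokenizer.from_pretrained('new_tokenizer')
sample = dataset['train'][0]['text']
print(reloaded.tokenize(sample))
print(reloaded(sample)['input_ids'])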