# Alternative 1 (disabled): train a BPE tokenizer from scratch with the `tokenizers`
# library on the pre-segmented Wikipedia text. Note: the unk/special token strings
# below appear to have been stripped (likely angle-bracket tokens lost to markup
# cleaning) and must be filled in before this block is usable.
'''
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token=""))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=50000,
    min_frequency=1,
    special_tokens=["", "", ""],
    limit_alphabet=8000,
)

files = ["wiki_mrph.txt"]
tokenizer.train(files, trainer)
tokenizer.save("juman-bpe-wiki.json")
'''

# Alternative 2 (disabled): retrain the GPT-2 tokenizer with train_new_from_iterator
# and push the result to the Hugging Face Hub. `dataset_name`, `download_config`,
# and `length` must be defined before this block can run.
'''
from transformers import AutoTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
from datasets import load_dataset, DownloadConfig

# Seed the new vocabulary with GPT-2's byte-level base alphabet.
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())
base_vocab = list(unicode_to_byte_map.keys())

tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = load_dataset(dataset_name, split="train", download_config=download_config)

def batch_iterator(batch_size=10):
    for i in range(0, length, batch_size):
        yield dataset[i : i + batch_size]["content"]

new_tokenizer_larger = tokenizer.train_new_from_iterator(
    batch_iterator(), vocab_size=32768, initial_alphabet=base_vocab
)

# Saving a Custom Tokenizer on the Hub
model_ckpt = "test-001"
#org = "transformersbook"
org = ""
#new_tokenizer_larger.push_to_hub(model_ckpt, organization=org)
new_tokenizer_larger.push_to_hub(model_ckpt)
'''

# Active path: retrain the GPT-2 tokenizer on wiki_mrph.txt (presumably a
# morphologically pre-segmented Wikipedia dump) and save the result locally.
from datasets import load_dataset
from transformers import AutoTokenizer
from more_itertools import chunked  # only needed for the commented-out streaming variant

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Streaming variant on mC4-ja (disabled):
#dataset = load_dataset("mc4", "ja", streaming=True, split='train')
#ds_sub = dataset.take(100000)
#corpus = chunked((x['text'] for x in ds_sub), 1000)

dataset = load_dataset('text', data_files={'train': ["wiki_mrph.txt"]})
print(dataset)
#corpus = chunked((x for x in dataset), 1000)
#new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=32768)

# Total number of examples in dataset['train'] (one per line of wiki_mrph.txt).
#length = 100000
length = 29751517

def batch_iterator(batch_size=10):
    for i in range(0, length, batch_size):
        yield dataset['train'][i : i + batch_size]['text']

new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=32768)
new_tokenizer.save_pretrained('new_tokenizer')
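
# Usage sketch: reload the tokenizer saved above and encode a sample sentence to
# spot-check the learned vocabulary. This assumes the 'new_tokenizer' output
# directory from the save_pretrained call above; the sample sentence is
# illustrative only and mimics a whitespace-segmented line of wiki_mrph.txt.
#
#from transformers import AutoTokenizer
#
#reloaded_tokenizer = AutoTokenizer.from_pretrained('new_tokenizer')
#sample = "これ は 形態素 解析 済み の 文 です"
#print(reloaded_tokenizer.tokenize(sample))
#print(reloaded_tokenizer(sample)["input_ids"])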