from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer
from tokenizers import BertWordPieceTokenizer

config = AutoConfig.from_pretrained("./")

# Load dataset
dataset = load_dataset("flax-community/swahili-safi", split="train")

def batch_iterator(batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]

# Instantiate tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True,
)

# Customized training
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=config.vocab_size,
    min_frequency=2,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save files to disk
tokenizer.save("tokenizer.json")
tokenizer.save_model("./")

# Resave in HF format
tokenizer = AutoTokenizer.from_pretrained("./")
tokenizer.save_pretrained("./")
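
# Optional sanity check (an illustrative sketch, not part of the original
# pipeline): reload the exported tokenizer and confirm it round-trips a
# short Swahili sentence. The sample sentence is a hypothetical example.
reloaded = AutoTokenizer.from_pretrained("./")
sample = "Habari ya asubuhi"
encoded = reloaded(sample)
print(encoded["input_ids"])
print(reloaded.convert_ids_to_tokens(encoded["input_ids"]))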