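# Train a WordPiece tokenizer on the flax-community/swahili-safi dataset and
# save it in both the `tokenizers` format and the Hugging Face `transformers` format.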
from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer
from tokenizers import BertWordPieceTokenizer


# Load the model config from the current directory to reuse its vocab_size below
config = AutoConfig.from_pretrained("./")

# Load the Swahili training corpus
dataset = load_dataset("flax-community/swahili-safi", split="train")


def batch_iterator(batch_size=1000):
    """Yield the text column in batches so training can stream over the corpus."""
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]


# Instantiate a BERT-style WordPiece tokenizer (lowercasing enabled, accents preserved)
tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True,
)

# Train the tokenizer on the corpus, matching the model config's vocabulary size
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=config.vocab_size,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the full tokenizer (tokenizer.json) and the WordPiece vocabulary (vocab.txt)
tokenizer.save("tokenizer.json")
tokenizer.save_model("./")

# Reload with transformers and re-save so the full set of Hugging Face tokenizer files is written
tokenizer = AutoTokenizer.from_pretrained("./")
tokenizer.save_pretrained("./")
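
# Minimal sanity check (a sketch, not part of the original script): tokenize a
# sample sentence with the freshly saved tokenizer. The sentence below is a
# hypothetical placeholder.
print(tokenizer.tokenize("Habari za asubuhi"))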