imthanhlv committed
Commit
96baa40
1 Parent(s): fa90e8f

added tokenizer

Files changed (2)
  1. tokenizer.json +0 -0
  2. train_tokenizer.py +26 -0
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
train_tokenizer.py ADDED
@@ -0,0 +1,26 @@
+ from datasets import load_dataset
+ from tokenizers import ByteLevelBPETokenizer
+
+ model_dir = "."  # ${MODEL_DIR}
+
+ # Load the training corpus
+ dataset = load_dataset("imthanhlv/binhvq_dedup", split="train")
+
+ # Instantiate a byte-level BPE tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+
+ def batch_iterator(batch_size=1000):
+     for i in range(0, len(dataset), batch_size):
+         yield dataset[i : i + batch_size]["text"]
+
+ # Train with a 50,265-token vocabulary and RoBERTa-style special tokens
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
+     "<s>",
+     "<pad>",
+     "</s>",
+     "<unk>",
+     "<mask>",
+ ])
+
+ # Save files to disk
+ tokenizer.save(f"{model_dir}/tokenizer.json")
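
For reference, a minimal sketch of how the saved tokenizer.json could be loaded back and used for encoding, assuming the same tokenizers library; the sample sentence is a hypothetical illustration, not from this repo:

from tokenizers import Tokenizer

# Load the serialized tokenizer produced by train_tokenizer.py
tokenizer = Tokenizer.from_file("tokenizer.json")

# Encode a sample Vietnamese sentence (hypothetical example text)
encoding = tokenizer.encode("Xin chào thế giới")
print(encoding.tokens)
print(encoding.ids)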