w11wo committed on
Commit
e33b58c
1 Parent(s): 5d2e627
Files changed (3)
  1. .gitattributes +1 -0
  2. tokenizer.json +0 -0
  3. train_tokenizer.py +26 -0
.gitattributes CHANGED
@@ -14,3 +14,4 @@
  *.pb filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
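The added pattern routes TensorBoard event files (names containing `tfevents`) through Git LFS, matching the existing rules for binary model files above; it is the line that running `git lfs track "*tfevents*"` would append to .gitattributes.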
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
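This file is the serialized output of train_tokenizer.py below: a single JSON file holding the byte-level BPE vocabulary, merge rules, and pre-tokenization settings, which is why it is too large to render inline.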
 
train_tokenizer.py ADDED
@@ -0,0 +1,26 @@
+ from datasets import load_dataset
+ from tokenizers import ByteLevelBPETokenizer
+
+ # load dataset
+ dataset = load_dataset("mc4", "id", split="train")
+
+ # Instantiate tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+
+
+ def batch_iterator(batch_size=1000):
+     for i in range(0, len(dataset), batch_size):
+         yield dataset[i : i + batch_size]["text"]
+
+
+ # Customized training
+ tokenizer.train_from_iterator(
+     batch_iterator(),
+     vocab_size=50265,
+     min_frequency=2,
+     special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
+ )
+
+ # Save files to disk
+ tokenizer.save("./tokenizer.json")
+
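Once the run completes, the saved tokenizer.json can be loaded back and sanity-checked. A minimal sketch, assuming the output path above; the Indonesian sample sentence is made up for illustration:

from tokenizers import Tokenizer

# Load the serialized tokenizer produced by train_tokenizer.py
tokenizer = Tokenizer.from_file("./tokenizer.json")

# Encode a sample Indonesian sentence (hypothetical example text)
encoding = tokenizer.encode("Saya sedang belajar pemrosesan bahasa alami.")
print(encoding.tokens)  # byte-level BPE subword strings
print(encoding.ids)     # corresponding vocabulary ids

The same file can also be wrapped for use with transformers via PreTrainedTokenizerFast(tokenizer_file="./tokenizer.json").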