LH-Tech-AI committed on
Commit
2155c30
·
verified ·
1 Parent(s): ad29c0d

Create train_tokenizer.py

Browse files
Files changed (1) hide show
  1. train_tokenizer.py +20 -0
train_tokenizer.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from tokenizers import ByteLevelBPETokenizer
3
+
4
# Open the "train" split of the "sample-10BT" configuration in streaming mode.
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    "sample-10BT",
    split="train",
    streaming=True,
)
5
def get_training_corpus(records=None, max_samples=50000):
    """Yield the ``"text"`` field of at most ``max_samples`` records.

    Args:
        records: Iterable of mappings that each provide a ``"text"`` key.
            When omitted, the module-level ``dataset`` is used.
        max_samples: Upper bound on the number of texts yielded.

    Yields:
        The ``"text"`` value of each record, in iteration order.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import islice

    if records is None:
        records = dataset

    # islice ends the stream cleanly if the source holds fewer than
    # ``max_samples`` records. Calling next() directly inside a generator
    # would instead turn the exhausted iterator's StopIteration into a
    # RuntimeError (PEP 479).
    for record in islice(records, max_samples):
        yield record["text"]
9
+
10
# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Learn the vocabulary from the texts produced by get_training_corpus().
tokenizer.train_from_iterator(
    iterator=get_training_corpus(),
    vocab_size=500,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save the trained model into the current directory, using
# "custom_llama_tokenizer" as the file-name prefix.
tokenizer.save_model(directory=".", prefix="custom_llama_tokenizer")
print("Tokenizer training complete!")