dk-crazydiv committed on
Commit 398ec26
Parent: 02ca275
Files changed (2):
  1. tokenizer.json +0 -0
  2. train_tokenizer.py +8 -5
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
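Since the regenerated tokenizer.json cannot be rendered inline, a quick sanity check is to load it with the tokenizers library; a minimal sketch, assuming the file sits at the repository root and using an arbitrary Hindi sample string for illustration:

# Minimal sketch: inspect the regenerated tokenizer.json (repo-root path assumed).
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
print(tok.get_vocab_size())               # should match vocab_size=50265 from train_tokenizer.py
print(tok.encode("नमस्ते दुनिया").tokens)    # byte-level BPE pieces for a short Hindi sample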
 
train_tokenizer.py CHANGED
@@ -1,19 +1,22 @@
 #!/usr/bin/env python3
 from datasets import load_dataset
+from datasets import load_from_disk
 from tokenizers import ByteLevelBPETokenizer
-
+from tqdm import tqdm
 # load dataset
-dataset = load_dataset("oscar", "unshuffled_deduplicated_hi", split="train")
+# dataset = load_dataset("oscar", "unshuffled_deduplicated_hi", split="train")
+
+dataset = load_from_disk("/home/rtx/work/dk/hf/vo")
 
 # Instantiate tokenizer
-tokenizer = ByteLevelBPETokenizer()
+tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
 
-def batch_iterator(batch_size=1000):
+def batch_iterator(batch_size=100_000):
     for i in range(0, len(dataset), batch_size):
         yield dataset[i: i + batch_size]["text"]
 
 # Customized training
-tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
+tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=50, special_tokens=[
     "<s>",
     "<pad>",
     "</s>",