versae committed on
Commit
eb4e77c
1 Parent(s): d5cede4

New tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +0 -0
  2. tokens.py +2 -2
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
tokens.py CHANGED
@@ -3,11 +3,11 @@ from datasets import load_dataset
3
  from tokenizers import ByteLevelBPETokenizer
4
 
5
  # Load dataset
6
- dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
7
 
8
  # Instantiate tokenizer
9
  tokenizer = ByteLevelBPETokenizer()
10
- def batch_iterator(batch_size=1_000_000):
11
  for i in range(0, len(dataset), batch_size):
12
  yield dataset["text"][i: i + batch_size]
13
 
3
  from tokenizers import ByteLevelBPETokenizer
4
 
5
  # Load dataset
6
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train[:5000000]")
7
 
8
  # Instantiate tokenizer
9
  tokenizer = ByteLevelBPETokenizer()
10
+ def batch_iterator(batch_size=100_000):
11
  for i in range(0, len(dataset), batch_size):
12
  yield dataset["text"][i: i + batch_size]
13