New tokenizer

- tokenizer.json +0 -0
- tokens.py +2 -2
tokenizer.json
CHANGED
The diff for this file is too large to render; see the raw diff.
tokens.py
CHANGED
@@ -3,11 +3,11 @@ from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 
 # Load dataset
-dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
+dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train[:5000000]")
 
 # Instantiate tokenizer
 tokenizer = ByteLevelBPETokenizer()
-def batch_iterator(batch_size=
+def batch_iterator(batch_size=100_000):
     for i in range(0, len(dataset), batch_size):
         yield dataset["text"][i: i + batch_size]
 
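The hunk ends before the training call, so the code that actually regenerates tokenizer.json is not visible here: the commit only switches the corpus to the first 5,000,000 OSCAR examples (split="train[:5000000]") and streams them in batches of 100,000. A rough sketch of how such a script typically continues with the tokenizers API follows; the vocabulary size, special tokens, and output filename are assumptions for illustration, not values taken from this commit.

# Hypothetical continuation (not part of this diff): train the tokenizer on the
# streamed batches, then write out the tokenizer.json that this commit updates.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50_000,      # assumed value; not visible in the diff
    min_frequency=2,        # assumed value
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],  # assumed ByteLevelBPE-style set
)
tokenizer.save("tokenizer.json")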