train with 10K batch_size
train_tokenizer.py  +1 -1
```diff
@@ -24,7 +24,7 @@ def th_tokenize(text):
     return result


-def batch_iterator(batch_size=
+def batch_iterator(batch_size=10000):
     for i in range(0, len(raw_dataset), batch_size):
         yield [th_tokenize(text) for text in raw_dataset[i : i + batch_size]["text"]]

```
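For context, a minimal sketch of how this `batch_iterator` might feed tokenizer training. The in-memory corpus, the `th_tokenize` body, and the BPE/vocabulary settings are assumptions standing in for the rest of train_tokenizer.py, which this diff does not show:

```python
from datasets import Dataset
from tokenizers import Tokenizer, models, trainers

# Assumption: the real script loads a Thai text corpus; a tiny in-memory
# stand-in is used here so the sketch is self-contained.
raw_dataset = Dataset.from_dict({"text": ["ตัวอย่างข้อความ", "another line of text"]})

def th_tokenize(text):
    # Stand-in for the script's Thai segmentation helper (assumed to return
    # a whitespace-joined segmentation, e.g. via a Thai word segmenter).
    return text

def batch_iterator(batch_size=10000):
    # Slicing a datasets.Dataset returns a dict of columns, hence ["text"].
    for i in range(0, len(raw_dataset), batch_size):
        yield [th_tokenize(text) for text in raw_dataset[i : i + batch_size]["text"]]

# Assumed model and trainer configuration, not part of this commit.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
trainer = trainers.BpeTrainer(vocab_size=32000, special_tokens=["<unk>"])
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(raw_dataset))
```

Raising the default batch size to 10000 presumably cuts the number of Python-level iterations when handing text batches to the trainer, at the cost of holding more pre-tokenized strings in memory per batch.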