sakares committed on
Commit
89c14b0
1 Parent(s): a615d86

train with 10K batch_size

Browse files
Files changed (1) hide show
  1. train_tokenizer.py +1 -1
train_tokenizer.py CHANGED
@@ -24,7 +24,7 @@ def th_tokenize(text):
24
  return result
25
 
26
 
27
- def batch_iterator(batch_size=1000):
28
  for i in range(0, len(raw_dataset), batch_size):
29
  yield [th_tokenize(text) for text in raw_dataset[i : i + batch_size]["text"]]
30
 
 
24
  return result
25
 
26
 
27
+ def batch_iterator(batch_size=10000):
28
  for i in range(0, len(raw_dataset), batch_size):
29
  yield [th_tokenize(text) for text in raw_dataset[i : i + batch_size]["text"]]
30