nroggendorff committed on
Commit
14ddc53
1 Parent(s): c7feb81

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +5 -6
train.py CHANGED
@@ -124,17 +124,16 @@ def update_tokenizer(tokenizer, dataset, batch_size=1000):
124
 
125
  for i in range(0, len(dataset['text']), batch_size):
126
  batch = dataset['text'][i : i + batch_size]
127
-
128
- batch_tokens = tokenizer.encode_batch(batch)
129
-
130
- for encoded in batch_tokens:
131
- for token in encoded.tokens:
132
  if token not in existing_vocab:
133
  oov_tokens.add(token)
134
 
135
  tokenizer.add_tokens(list(oov_tokens))
136
 
137
-
138
  def train_model(model, tokenizer, dataset, push, isinst):
139
  args = TrainingArguments(
140
  output_dir="model",
 
124
 
125
  for i in range(0, len(dataset['text']), batch_size):
126
  batch = dataset['text'][i : i + batch_size]
127
+
128
+ for text in batch:
129
+ tokens = tokenizer.encode(text).tokens
130
+
131
+ for token in tokens:
132
  if token not in existing_vocab:
133
  oov_tokens.add(token)
134
 
135
  tokenizer.add_tokens(list(oov_tokens))
136
 
 
137
  def train_model(model, tokenizer, dataset, push, isinst):
138
  args = TrainingArguments(
139
  output_dir="model",