Spaces:
Paused
Paused
nroggendorff
committed on
Commit
•
14ddc53
1
Parent(s):
c7feb81
Update train.py
Browse files
train.py
CHANGED
@@ -124,17 +124,16 @@ def update_tokenizer(tokenizer, dataset, batch_size=1000):
|
|
124 |
|
125 |
for i in range(0, len(dataset['text']), batch_size):
|
126 |
batch = dataset['text'][i : i + batch_size]
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
for token in
|
132 |
if token not in existing_vocab:
|
133 |
oov_tokens.add(token)
|
134 |
|
135 |
tokenizer.add_tokens(list(oov_tokens))
|
136 |
|
137 |
-
|
138 |
def train_model(model, tokenizer, dataset, push, isinst):
|
139 |
args = TrainingArguments(
|
140 |
output_dir="model",
|
|
|
124 |
|
125 |
for i in range(0, len(dataset['text']), batch_size):
|
126 |
batch = dataset['text'][i : i + batch_size]
|
127 |
+
|
128 |
+
for text in batch:
|
129 |
+
tokens = tokenizer.encode(text).tokens
|
130 |
+
|
131 |
+
for token in tokens:
|
132 |
if token not in existing_vocab:
|
133 |
oov_tokens.add(token)
|
134 |
|
135 |
tokenizer.add_tokens(list(oov_tokens))
|
136 |
|
|
|
137 |
def train_model(model, tokenizer, dataset, push, isinst):
|
138 |
args = TrainingArguments(
|
139 |
output_dir="model",
|