Spaces:
Paused
Paused
nroggendorff
committed on
Commit
•
892e2f9
1
Parent(s):
ea002f9
Update train.py
Browse files
train.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import os
|
|
|
2 |
import torch
|
3 |
import trl
|
4 |
from transformers import (
|
@@ -30,7 +31,7 @@ PUSH_TO_HUB = True
|
|
30 |
|
31 |
def load_data():
|
32 |
if not INSTRUCT_FINETUNE_BOOL:
|
33 |
-
dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train",
|
34 |
start = INIT * SHARD_SIZE
|
35 |
dataset = Dataset.from_dict({'text': [example['text'] for example in islice(dataset, start, start + SHARD_SIZE)]})
|
36 |
else:
|
@@ -188,25 +189,52 @@ def train_model(model, tokenizer, dataset, push, isinst):
|
|
188 |
trained_tokenizer.save_pretrained("tokenizer")
|
189 |
|
190 |
def main(push_to_hub=True, is_inst_finetune=False):
|
|
|
191 |
dataset = load_data()
|
|
|
192 |
|
193 |
if not is_inst_finetune and INIT == 0:
|
|
|
194 |
training_corpus = get_training_corpus(dataset)
|
|
|
|
|
|
|
195 |
tokenizer = create_tokenizer(training_corpus)
|
|
|
196 |
else:
|
|
|
197 |
tokenizer = load_tokenizer()
|
|
|
|
|
|
|
198 |
update_tokenizer(tokenizer, dataset)
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
if is_inst_finetune:
|
|
|
203 |
model = load_model()
|
|
|
204 |
else:
|
|
|
|
|
|
|
|
|
205 |
model = create_model(tokenizer) if INIT == 0 else load_model()
|
|
|
206 |
|
|
|
207 |
model.resize_token_embeddings(len(tokenizer))
|
208 |
-
|
|
|
|
|
209 |
train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
|
210 |
|
211 |
if __name__ == "__main__":
|
212 |
-
main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
|
|
|
|
1 |
import os
|
2 |
+
from sys import exit
|
3 |
import torch
|
4 |
import trl
|
5 |
from transformers import (
|
|
|
31 |
|
32 |
def load_data():
|
33 |
if not INSTRUCT_FINETUNE_BOOL:
|
34 |
+
dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
|
35 |
start = INIT * SHARD_SIZE
|
36 |
dataset = Dataset.from_dict({'text': [example['text'] for example in islice(dataset, start, start + SHARD_SIZE)]})
|
37 |
else:
|
|
|
189 |
trained_tokenizer.save_pretrained("tokenizer")
|
190 |
|
191 |
def main(push_to_hub=True, is_inst_finetune=False):
|
192 |
+
print("Loading Data..")
|
193 |
dataset = load_data()
|
194 |
+
print("Loaded data.")
|
195 |
|
196 |
if not is_inst_finetune and INIT == 0:
|
197 |
+
print("Making Corpus..")
|
198 |
training_corpus = get_training_corpus(dataset)
|
199 |
+
print("Made Corpus.")
|
200 |
+
|
201 |
+
print("Making Tokenizer..")
|
202 |
tokenizer = create_tokenizer(training_corpus)
|
203 |
+
print("Made Tokenizer.")
|
204 |
else:
|
205 |
+
print("Loading Tokenizer..")
|
206 |
tokenizer = load_tokenizer()
|
207 |
+
print("Loaded Tokenizer.")
|
208 |
+
|
209 |
+
print("Adding Tokens..")
|
210 |
update_tokenizer(tokenizer, dataset)
|
211 |
+
print("Added Tokens.")
|
212 |
+
|
213 |
+
|
214 |
+
if INIT == 0:
|
215 |
+
print("Adding Special Tokens..")
|
216 |
+
configure_tokenizer(tokenizer)
|
217 |
+
print("Added Tokens.")
|
218 |
|
219 |
if is_inst_finetune:
|
220 |
+
print("Loading Model..")
|
221 |
model = load_model()
|
222 |
+
print("Loaded Model.")
|
223 |
else:
|
224 |
+
if INIT == 0:
|
225 |
+
print("Creating Model..")
|
226 |
+
else:
|
227 |
+
print("Loading Model..")
|
228 |
model = create_model(tokenizer) if INIT == 0 else load_model()
|
229 |
+
print("Done.")
|
230 |
|
231 |
+
print("Resizing Token Embeddings..")
|
232 |
model.resize_token_embeddings(len(tokenizer))
|
233 |
+
print("Done.")
|
234 |
+
|
235 |
+
print("Training Model..")
|
236 |
train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
|
237 |
|
238 |
if __name__ == "__main__":
|
239 |
+
main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
|
240 |
+
exit()
|