nroggendorff committed
Commit 892e2f9
1 Parent(s): ea002f9

Update train.py

Files changed (1)
  1. train.py +33 -5
train.py CHANGED
```diff
@@ -1,4 +1,5 @@
 import os
+from sys import exit
 import torch
 import trl
 from transformers import (
@@ -30,7 +31,7 @@ PUSH_TO_HUB = True
 
 def load_data():
     if not INSTRUCT_FINETUNE_BOOL:
-        dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", num_proc=BATCH_SIZE, streaming=True)
+        dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
         start = INIT * SHARD_SIZE
         dataset = Dataset.from_dict({'text': [example['text'] for example in islice(dataset, start, start + SHARD_SIZE)]})
     else:
@@ -188,25 +189,52 @@ def train_model(model, tokenizer, dataset, push, isinst):
     trained_tokenizer.save_pretrained("tokenizer")
 
 def main(push_to_hub=True, is_inst_finetune=False):
+    print("Loading Data..")
     dataset = load_data()
+    print("Loaded data.")
 
     if not is_inst_finetune and INIT == 0:
+        print("Making Corpus..")
         training_corpus = get_training_corpus(dataset)
+        print("Made Corpus.")
+
+        print("Making Tokenizer..")
         tokenizer = create_tokenizer(training_corpus)
+        print("Made Tokenizer.")
     else:
+        print("Loading Tokenizer..")
         tokenizer = load_tokenizer()
+        print("Loaded Tokenizer.")
+
+    print("Adding Tokens..")
     update_tokenizer(tokenizer, dataset)
-
-    configure_tokenizer(tokenizer)
+    print("Added Tokens.")
+
+
+    if INIT == 0:
+        print("Adding Special Tokens..")
+        configure_tokenizer(tokenizer)
+        print("Added Tokens.")
 
     if is_inst_finetune:
+        print("Loading Model..")
         model = load_model()
+        print("Loaded Model.")
     else:
+        if INIT == 0:
+            print("Creating Model..")
+        else:
+            print("Loading Model..")
         model = create_model(tokenizer) if INIT == 0 else load_model()
+        print("Done.")
 
+    print("Resizing Token Embeddings..")
     model.resize_token_embeddings(len(tokenizer))
-
+    print("Done.")
+
+    print("Training Model..")
     train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
 
 if __name__ == "__main__":
-    main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
+    main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
+    exit()
```
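For context, two of the changes above follow standard patterns from the `datasets` and `transformers` libraries. The dropped `num_proc=BATCH_SIZE` argument only matters when `datasets` prepares a map-style dataset; with `streaming=True` the call returns an `IterableDataset` that is read lazily, so the argument has no effect there, and some versions of `datasets` reject the combination outright. Below is a minimal sketch of the streaming shard pattern used in `load_data()`; the dataset name and the `INIT`/`SHARD_SIZE` values are placeholders, since their real definitions are not shown in this diff.

```python
from itertools import islice

from datasets import Dataset, load_dataset

# Placeholder values for illustration only; train.py defines INPUT_DATASET,
# INIT, and SHARD_SIZE elsewhere (outside this diff).
INPUT_DATASET = "HuggingFaceTB/smollm-corpus"  # assumed example dataset
SHARD_SIZE = 1000
INIT = 0

# streaming=True returns an IterableDataset that is read lazily over the
# network, so num_proc (parallel preparation of a map-style dataset) does
# not apply here.
stream = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)

# Materialize one shard of SHARD_SIZE examples, skipping the first
# INIT * SHARD_SIZE examples of the stream.
start = INIT * SHARD_SIZE
shard = Dataset.from_dict(
    {"text": [example["text"] for example in islice(stream, start, start + SHARD_SIZE)]}
)
print(shard)
```

Likewise, the new `if INIT == 0:` block adds special tokens before training, which changes the vocabulary size, and `model.resize_token_embeddings(len(tokenizer))` then grows the embedding matrices to match. A minimal sketch of that pattern, with a small public checkpoint standing in for the project's own model:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Small public checkpoint used purely for illustration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Adding tokens (as configure_tokenizer/update_tokenizer do in train.py)
# grows the vocabulary, so the input/output embeddings must be resized to
# len(tokenizer) before training.
tokenizer.add_special_tokens({"pad_token": "<pad>"})
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings().weight.shape[0], len(tokenizer))
```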