nroggendorff committed
Commit 93fda42
1 Parent(s): 5ad337b

Update train.py

Files changed (1)
  1. train.py +38 -27
train.py CHANGED
@@ -10,7 +10,7 @@ from tokenizers import ByteLevelBPETokenizer
 from torch.utils.data import DataLoader
 from torch.cuda.amp import autocast, GradScaler
 
-BATCH_SIZE = 32
+BATCH_SIZE = 8
 EPOCHS = 1
 LEARNING_RATE = 1e-4
 FACTOR = 768
@@ -20,7 +20,7 @@ INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
 INSTRUCT_DATASET = "nroggendorff/elephant"
 OUTPUT_REPO = "nroggendorff/smallama"
 INSTRUCT_FINETUNE_BOOL = False
-INIT = 0#/15
+INIT = 0
 SHARD_SIZE = int(5e+5)
 FP16 = True
 WARMUP_STEPS = 1000
@@ -32,9 +32,9 @@ NUM_WORKERS = 4
 def load_data():
     if not INSTRUCT_FINETUNE_BOOL:
         dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
-        dataset = dataset.take(int(8e+6)) # Keep streaming, no conversion to in-memory dataset
+        dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
     else:
-        dataset = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
+        dataset = load_dataset(INSTRUCT_DATASET, split="train")
     return dataset
 
 def create_tokenizer(training_corpus):
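
Note on the new load_data() path: with streaming=True, load_dataset returns an IterableDataset, which does not support len(), so len(dataset) // SHARD_SIZE cannot be evaluated on the stream itself. A minimal sketch of an equivalent shard selection that stays streaming, using only skip() and take() (load_data_streaming is an illustrative name, not part of train.py):

    # Sketch only: select the INIT-th window of SHARD_SIZE examples from a
    # streaming dataset without calling len(), which IterableDataset lacks.
    from datasets import load_dataset

    SHARD_SIZE = int(5e+5)
    INIT = 0

    def load_data_streaming():
        dataset = load_dataset("HuggingFaceTB/smollm-corpus", "cosmopedia-v2",
                               split="train", streaming=True)
        # skip the windows consumed by earlier runs (INIT = 0, 1, ...),
        # then keep exactly one shard's worth of examples
        return dataset.skip(INIT * SHARD_SIZE).take(SHARD_SIZE)
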
@@ -54,6 +54,10 @@ def create_tokenizer(training_corpus):
 def load_tokenizer():
     return AutoTokenizer.from_pretrained(OUTPUT_REPO)
 
+def get_training_corpus(dataset):
+    for example in dataset:
+        yield example['text']
+
 def format_prompts(examples, tokenizer, isinst):
     texts = []
     for text in examples['text']:
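
The new get_training_corpus() generator yields plain text for tokenizer training; the main() added later in this diff passes it to create_tokenizer(), whose body is not shown here. As a rough illustration only, a generator like this is typically consumed by the ByteLevelBPETokenizer imported at the top of train.py via train_from_iterator; the vocab size and special tokens below are assumed values, not taken from the script:

    from tokenizers import ByteLevelBPETokenizer

    def get_training_corpus(dataset):
        for example in dataset:
            yield example['text']

    # toy stand-in for the streamed corpus, just to make the sketch runnable
    sample_dataset = [{"text": "hello world"}, {"text": "byte-level BPE demo"}]

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(
        get_training_corpus(sample_dataset),      # any iterable of strings
        vocab_size=1000,                          # assumed, not from train.py
        special_tokens=["<s>", "</s>", "<pad>"],  # assumed, not from train.py
    )
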
@@ -129,41 +133,48 @@ def train_model(model, tokenizer, dataset, push, isinst):
         save_total_limit=2,
     )
 
-    dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
-
     optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=WEIGHT_DECAY)
     scheduler = get_cosine_schedule_with_warmup(
         optimizer,
         num_warmup_steps=args.warmup_steps,
-        num_training_steps=(SHARD_SIZE // args.per_device_train_batch_size) * args.num_train_epochs
+        num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
     )
-
-    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
 
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        num_workers=NUM_WORKERS
+    )
+
     for batch in dataloader:
         batch = format_prompts(batch, tokenizer, isinst)
-        trainer = trl.SFTTrainer(
-            model=model,
-            tokenizer=tokenizer,
-            args=args,
-            train_dataset=batch,
-            dataset_text_field='text',
-            max_seq_length=MAX_SEQ_LENGTH,
-            optimizers=(optimizer, scheduler)
-        )
-        trainer.train()
-
-    trained_model = trainer.model
-    trained_tokenizer = trainer.tokenizer
 
     if push:
         repo_id = OUTPUT_REPO + "-it" if INSTRUCT_FINETUNE_BOOL else OUTPUT_REPO
-        msg = "Training completed."
-        trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
-        trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
+        msg = f"Training loss: {train.training_loss:.4f}"
+        model.push_to_hub(repo_id, commit_message=msg, force=True)
+        tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
     else:
-        trained_model.save_pretrained("model")
-        trained_tokenizer.save_pretrained("tokenizer")
+        model.save_pretrained("model")
+        tokenizer.save_pretrained("tokenizer")
+
+def main(push_to_hub=True, is_inst_finetune=False):
+    dataset = load_data()
+    if not is_inst_finetune and INIT == 0:
+        training_corpus = get_training_corpus(dataset)
+        tokenizer = create_tokenizer(training_corpus)
+    else:
+        tokenizer = load_tokenizer()
+
+    configure_tokenizer(tokenizer)
+
+    if is_inst_finetune:
+        model = load_model()
+        model.resize_token_embeddings(len(tokenizer))
+    else:
+        model = create_model(tokenizer) if INIT == 0 else load_model()
+
+    train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
 
 if __name__ == "__main__":
     main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
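
A note on the reworked loop: after this commit the body shown in the hunk only formats each batch; the per-batch trl.SFTTrainer has been removed and no optimizer step appears in the diff, while the new push message reads train.training_loss even though no train object is defined in this hunk. For orientation only, a manual mixed-precision step built from the autocast/GradScaler imports already at the top of the file could look like the sketch below; everything here except those imports (the training_step name, tokenizing the formatted batch, using input_ids as labels) is an assumption, not code from train.py:

    # Sketch only: one fp16 optimization step per batch of texts,
    # using the torch.cuda.amp utilities that train.py already imports.
    from torch.cuda.amp import autocast, GradScaler

    scaler = GradScaler(enabled=True)

    def training_step(model, optimizer, scheduler, tokenizer, batch_texts, max_seq_length):
        enc = tokenizer(batch_texts, truncation=True, max_length=max_seq_length,
                        padding=True, return_tensors="pt").to(model.device)
        with autocast(enabled=True):
            out = model(**enc, labels=enc["input_ids"])  # causal-LM loss
        scaler.scale(out.loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        optimizer.zero_grad()
        return out.loss.item()
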
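
On the scheduler horizon: the cosine schedule now runs for (len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs steps. Assuming the shard holds roughly SHARD_SIZE examples and per_device_train_batch_size matches BATCH_SIZE, the old and new expressions give about the same count; a quick back-of-the-envelope check under those assumptions:

    # Rough check of the schedule length, under the assumptions stated above.
    SHARD_SIZE = int(5e+5)
    BATCH_SIZE = 8
    EPOCHS = 1
    WARMUP_STEPS = 1000

    num_training_steps = (SHARD_SIZE // BATCH_SIZE) * EPOCHS
    print(num_training_steps)                  # 62500
    print(WARMUP_STEPS / num_training_steps)   # warmup covers 1.6% of the schedule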