nroggendorff committed on
Commit
fc4a559
·
verified ·
1 Parent(s): 2f3f337

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +36 -23
train.py CHANGED
@@ -23,8 +23,14 @@ CLIPPING = 1.0
23
  PUSH_TO_HUB = True
24
 
25
  def load_data():
26
- dataset = load_dataset(INPUT_DATASET, split="train").select(range(int(2.5e+6)))
27
- return dataset
 
 
 
 
 
 
28
 
29
  def create_tokenizer(training_corpus):
30
  tokenizer = ByteLevelBPETokenizer()
@@ -42,19 +48,23 @@ def get_training_corpus(dataset):
42
  for i in range(0, len(dataset), 1000):
43
  yield dataset[i : i + 1000]["text"]
44
 
45
- def format_prompts(examples, tokenizer):
46
  texts = []
47
  for text in examples['text']:
48
- conversation = []
49
- parts = text.split('<|end|>')
50
- for i in range(0, len(parts) - 1, 2):
51
- prompt = parts[i].replace("<|user|>", "")
52
- response = parts[i + 1].replace("<|bot|>", "")
53
- conversation.append({"role": "user", "content": prompt})
54
- conversation.append({"role": "assistant", "content": response})
55
- formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
56
- texts.append(formatted_conversation)
 
 
 
57
  return {"text": texts}
 
58
 
59
  def create_model(tokenizer):
60
  config = LlamaConfig(
@@ -90,10 +100,10 @@ def configure_tokenizer(tokenizer):
90
  tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
91
  tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
92
 
93
- chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{{ eos_token }}"
94
  tokenizer.chat_template = chat_template
95
 
96
- def train_model(model, tokenizer, dataset, push):
97
  args = TrainingArguments(
98
  output_dir="model",
99
  num_train_epochs=EPOCHS,
@@ -104,17 +114,17 @@ def train_model(model, tokenizer, dataset, push):
104
  weight_decay=DECAY,
105
  gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
106
  fp16=FP16,
107
- max_grad_norm=CLIPPING
 
108
  )
109
 
110
  optimizer = AdamW(model.parameters(), lr=args.learning_rate)
111
- scheduler = get_cosine_schedule_with_warmup(
112
- optimizer,
113
  num_warmup_steps=args.warmup_steps,
114
  num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
115
  )
116
-
117
- dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer), batched=True)
118
  trainer = trl.SFTTrainer(
119
  model=model,
120
  tokenizer=tokenizer,
@@ -133,19 +143,22 @@ def train_model(model, tokenizer, dataset, push):
133
  if push:
134
  repo_id = OUTPUT_REPO
135
  msg = str(train.training_loss)
136
- trained_model.push_to_hub(repo_id, commit_message=msg)
137
- trained_tokenizer.push_to_hub(repo_id, commit_message=msg)
138
  else:
139
  trained_model.save_pretrained("model")
140
  trained_tokenizer.save_pretrained("tokenizer")
141
 
142
  def main(push_to_hub=True):
143
  dataset = load_data()
144
- training_corpus = get_training_corpus(dataset)
 
 
145
  tokenizer = create_tokenizer(training_corpus)
146
  configure_tokenizer(tokenizer)
147
  model = create_model(tokenizer)
148
- train_model(model, tokenizer, dataset, push_to_hub)
 
149
 
150
  if __name__ == "__main__":
151
  main(PUSH_TO_HUB)
 
23
  PUSH_TO_HUB = True
24
 
25
  def load_data():
26
+ pretrain = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
27
+ pretrain = Dataset.from_generator(lambda: pretrain.take(200000))
28
+ instruct = load_dataset(INSTRUCT_DATASET, split="train").select(range(200000))
29
+ dataset_dict = DatasetDict({
30
+ 'pretrain': pretrain,
31
+ 'instruct': instruct
32
+ })
33
+ return dataset_dict
34
 
35
  def create_tokenizer(training_corpus):
36
  tokenizer = ByteLevelBPETokenizer()
 
48
  for i in range(0, len(dataset), 1000):
49
  yield dataset[i : i + 1000]["text"]
50
 
51
def format_prompts(examples, tokenizer, isinst):
    """Format a batch of raw texts for training.

    When ``isinst`` is true, each text is parsed as alternating
    ``<|user|>...<|end|><|bot|>...<|end|>`` turns and rendered through the
    tokenizer's chat template; otherwise texts pass through unchanged.
    Returns a dict with the formatted ``'text'`` column.
    """
    formatted = []
    for raw in examples['text']:
        if not isinst:
            # Pretraining mode: keep the raw text as-is.
            formatted.append(raw)
            continue
        turns = raw.split('<|end|>')
        messages = []
        # Consume segments pairwise (user, bot); a trailing odd segment is dropped.
        for idx in range(0, len(turns) - 1, 2):
            user_part = turns[idx].replace("<|user|>", "")
            bot_part = turns[idx + 1].replace("<|bot|>", "")
            messages.append({"role": "user", "content": user_part})
            messages.append({"role": "assistant", "content": bot_part})
        rendered = tokenizer.apply_chat_template(messages, tokenize=False)
        formatted.append(rendered)
    return {"text": formatted}
67
+
68
 
69
  def create_model(tokenizer):
70
  config = LlamaConfig(
 
100
  tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
101
  tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
102
 
103
+ chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
104
  tokenizer.chat_template = chat_template
105
 
106
+ def train_model(model, tokenizer, dataset, push, isinst):
107
  args = TrainingArguments(
108
  output_dir="model",
109
  num_train_epochs=EPOCHS,
 
114
  weight_decay=DECAY,
115
  gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
116
  fp16=FP16,
117
+ max_grad_norm=CLIPPING,
118
+ logging_steps=100
119
  )
120
 
121
  optimizer = AdamW(model.parameters(), lr=args.learning_rate)
122
+ scheduler = get_linear_schedule_with_warmup(
123
+ optimizer,
124
  num_warmup_steps=args.warmup_steps,
125
  num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
126
  )
127
+ dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
 
128
  trainer = trl.SFTTrainer(
129
  model=model,
130
  tokenizer=tokenizer,
 
143
  if push:
144
  repo_id = OUTPUT_REPO
145
  msg = str(train.training_loss)
146
+ trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
147
+ trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
148
  else:
149
  trained_model.save_pretrained("model")
150
  trained_tokenizer.save_pretrained("tokenizer")
151
 
152
def main(push_to_hub=True):
    """Run the two-stage pipeline: pretrain on raw text, then instruction-tune."""
    splits = load_data()
    pretrain_split = splits['pretrain']
    instruct_split = splits['instruct']
    # The tokenizer is trained on the pretraining corpus only.
    corpus = get_training_corpus(pretrain_split)
    tokenizer = create_tokenizer(corpus)
    configure_tokenizer(tokenizer)
    model = create_model(tokenizer)
    # Stage 1: plain-text pretraining — no chat formatting, never pushed.
    train_model(model, tokenizer, pretrain_split, False, False)
    # Stage 2: instruction tuning — chat formatting on, push if requested.
    train_model(model, tokenizer, instruct_split, push_to_hub, True)
162
 
163
  if __name__ == "__main__":
164
  main(PUSH_TO_HUB)