Update train.py
train.py
CHANGED
@@ -23,8 +23,14 @@ CLIPPING = 1.0
 PUSH_TO_HUB = True
 
 def load_data():
-
-
+    pretrain = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
+    pretrain = Dataset.from_generator(lambda: pretrain.take(200000))
+    instruct = load_dataset(INSTRUCT_DATASET, split="train").select(range(200000))
+    dataset_dict = DatasetDict({
+        'pretrain': pretrain,
+        'instruct': instruct
+    })
+    return dataset_dict
 
 def create_tokenizer(training_corpus):
     tokenizer = ByteLevelBPETokenizer()
@@ -42,19 +48,23 @@ def get_training_corpus(dataset):
     for i in range(0, len(dataset), 1000):
         yield dataset[i : i + 1000]["text"]
 
-def format_prompts(examples, tokenizer):
+def format_prompts(examples, tokenizer, isinst):
     texts = []
     for text in examples['text']:
-
-
-
-
-
-
-
-
-
+        if isinst:
+            conversation = []
+            parts = text.split('<|end|>')
+            for i in range(0, len(parts) - 1, 2):
+                prompt = parts[i].replace("<|user|>", "")
+                response = parts[i + 1].replace("<|bot|>", "")
+                conversation.append({"role": "user", "content": prompt})
+                conversation.append({"role": "assistant", "content": response})
+            formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
+            texts.append(formatted_conversation)
+        else:
+            texts.append(text)
     return {"text": texts}
+
 
 def create_model(tokenizer):
     config = LlamaConfig(
@@ -90,10 +100,10 @@ def configure_tokenizer(tokenizer):
     tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
     tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
 
-    chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}
+    chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
     tokenizer.chat_template = chat_template
 
-def train_model(model, tokenizer, dataset, push):
+def train_model(model, tokenizer, dataset, push, isinst):
     args = TrainingArguments(
         output_dir="model",
         num_train_epochs=EPOCHS,
@@ -104,17 +114,17 @@ def train_model(model, tokenizer, dataset, push):
         weight_decay=DECAY,
         gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
         fp16=FP16,
-        max_grad_norm=CLIPPING
+        max_grad_norm=CLIPPING,
+        logging_steps=100
     )
 
     optimizer = AdamW(model.parameters(), lr=args.learning_rate)
-    scheduler =
-        optimizer,
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer,
         num_warmup_steps=args.warmup_steps,
         num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
     )
-
-    dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer), batched=True)
+    dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
     trainer = trl.SFTTrainer(
         model=model,
         tokenizer=tokenizer,
@@ -133,19 +143,22 @@ def train_model(model, tokenizer, dataset, push):
     if push:
         repo_id = OUTPUT_REPO
         msg = str(train.training_loss)
-        trained_model.push_to_hub(repo_id, commit_message=msg)
-        trained_tokenizer.push_to_hub(repo_id, commit_message=msg)
+        trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
+        trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
     else:
         trained_model.save_pretrained("model")
         trained_tokenizer.save_pretrained("tokenizer")
 
 def main(push_to_hub=True):
     dataset = load_data()
-
+    pretrain = dataset['pretrain']
+    instruct = dataset['instruct']
+    training_corpus = get_training_corpus(pretrain)
     tokenizer = create_tokenizer(training_corpus)
     configure_tokenizer(tokenizer)
     model = create_model(tokenizer)
-    train_model(model, tokenizer,
+    train_model(model, tokenizer, pretrain, False, False)
+    train_model(model, tokenizer, instruct, push_to_hub, True)
 
 if __name__ == "__main__":
     main(PUSH_TO_HUB)
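For reference, the substantive change to the chat template is that every assistant turn now ends with eos_token. The sketch below is purely illustrative: render_turns is a hypothetical helper that mirrors the Jinja template in plain Python, and the "<s>"/"</s>" token values are stand-ins rather than the tokens actually configured in train.py.

def render_turns(messages, bos_token="<s>", eos_token="</s>"):
    # Mirror of the updated chat template: roles must alternate starting with
    # user, each turn is wrapped in <|user|>/<|bot|> ... <|end|>, and every
    # assistant turn is closed with eos_token.
    out = bos_token
    for i, message in enumerate(messages):
        if (message["role"] == "user") != (i % 2 == 0):
            raise ValueError("Conversation roles must alternate user/assistant/...")
        if message["role"] == "user":
            out += "<|user|>\n" + message["content"] + "<|end|>\n"
        elif message["role"] == "assistant":
            out += "<|bot|>\n" + message["content"] + "<|end|>\n" + eos_token
        else:
            raise ValueError("Only user and assistant roles are supported!")
    return out

print(repr(render_turns([
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
])))
# '<s><|user|>\nHi<|end|>\n<|bot|>\nHello!<|end|>\n</s>'

The repaired scheduler call assumes get_linear_schedule_with_warmup is imported from transformers. A minimal, self-contained sketch of that optimizer/scheduler wiring, with a toy model and step counts standing in for the values computed in train_model:

import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(8, 8)                   # stand-in for the LLaMA model from create_model()
optimizer = AdamW(model.parameters(), lr=1e-4)  # train.py uses lr=args.learning_rate
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=10,                        # args.warmup_steps in train.py
    num_training_steps=1000,                    # (len(dataset) // batch_size) * epochs in train.py
)
for _ in range(3):                              # the trainer normally drives these steps
    optimizer.step()
    scheduler.step()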