dsmueller committed on
Commit
e51648a
1 Parent(s): d8a44b5

Update training arguments in app.py

Files changed (2)
  1. app.ipynb +67 -124
  2. app.py +44 -20
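In short, the commit pulls the TrainingArguments that were previously built inline inside the transformers.Trainer(...) call out into two named configurations, args_custom (driven by the model_params dict) and args_medium (settings copied from a Medium article), and the Trainer now receives one of them via args=. A minimal sketch of the resulting pattern, assuming the model_params keys shown in the diff below; note that the Transformers keyword for the epoch count is num_train_epochs, whereas the committed code spells it num_epochs, and fp16/logging_steps from the commit are omitted here (fp16 requires a CUDA device):

import transformers

# Assumed subset of the model_params dict from app.py (values taken from the diff below).
model_params = {
    'batch_size': 2,
    'gradient_accumulation': 1,
    'warmup_ratio': 0.1,
    'epochs': 3,
    'lr': 3e-05,
    'optimizer': 'adamw_torch',
    'scheduler': 'linear',
    'weight_decay': 0,
    'max_grad_norm': 1,
    'evaluation_strategy': 'epoch',
    'save_total_limit': 1,
}

# Named configuration built from model_params (mirrors args_custom in the commit).
args_custom = transformers.TrainingArguments(
    output_dir='model_outputs',
    logging_dir='model_outputs',
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],  # TrainingArguments name for the epoch count
    learning_rate=model_params['lr'],
    optim=model_params['optimizer'],
    lr_scheduler_type=model_params['scheduler'],
    weight_decay=model_params['weight_decay'],
    max_grad_norm=model_params['max_grad_norm'],
    evaluation_strategy=model_params['evaluation_strategy'],
    save_total_limit=model_params['save_total_limit'],
)

# The Trainer then takes the prebuilt object; model, datasets, and compute_metrics
# are defined earlier in app.py and are not reproduced here.
# trainer = transformers.Trainer(
#     model=model,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
#     args=args_custom,
# )
# trainer.train()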
app.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -22,7 +22,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -37,17 +37,9 @@
  },
  {
  "cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Model Max Length: 1000000000000000019884624838656\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'\n",
  "model_name = 'mistralai/Mistral-7B-v0.1'\n",
@@ -63,20 +55,9 @@
  },
  {
  "cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Max token length train: 1121\n",
- "Max token length validation: 38\n",
- "Block size: 2242\n",
- "{'project_name': './llms/ams_data_train-100_4ba55532-e0b2-478b-9f5b-beb082e1b557', 'model_name': 'mistralai/Mistral-7B-v0.1', 'repo_id': 'ai-aerospace/ams-data-train-100-11b94ea4-2b2b-4db3-9e69-acb5a5d9f3e8', 'train_data': 'train_data', 'data_directory': './fine_tune_data/', 'block_size': 2242, 'model_max_length': 1121, 'logging_steps': -1, 'evaluation_strategy': 'epoch', 'save_total_limit': 1, 'save_strategy': 'epoch', 'mixed_precision': 'fp16', 'lr': 3e-05, 'epochs': 3, 'batch_size': 2, 'warmup_ratio': 0.1, 'gradient_accumulation': 1, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0, 'max_grad_norm': 1, 'seed': 42, 'quantization': 'int4', 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "# Write dataset files into data directory\n",
  "data_directory = './fine_tune_data/'\n",
@@ -147,7 +128,56 @@
  },
  {
  "cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "args_custom=transformers.TrainingArguments(\n",
+ " per_device_train_batch_size=model_params['batch_size'],\n",
+ " per_device_eval_batch_size=model_params['batch_size'],\n",
+ " gradient_accumulation_steps=model_params['gradient_accumulation'],\n",
+ " warmup_ratio=model_params['warmup_ratio'],\n",
+ " num_epochs=model_params['epochs'],\n",
+ " learning_rate=model_params['lr'],\n",
+ " fp16=True,\n",
+ " logging_steps=model_params['logging_steps'],\n",
+ " save_total_limit=model_params['save_total_limit'],\n",
+ " evaluation_strategy=model_params['evaluation_strategy'],\n",
+ " metric_for_best_model=\"f1\",\n",
+ " output_dir='model_outputs',\n",
+ " logging_dir='model_outputs',\n",
+ " optim=model_params['optimizer'],\n",
+ " max_grad_norm=model_params['max_grad_norm'],\n",
+ " weight_decay=model_params['weight_decay'],\n",
+ " lr_scheduler_type=model_params['scheduler']\n",
+ ")\n",
+ "\n",
+ "# Args from medium article\n",
+ "args_medium=transformers.TrainingArguments(\n",
+ " per_device_train_batch_size=8,\n",
+ " per_device_eval_batch_size=32,\n",
+ " gradient_accumulation_steps=4,\n",
+ " warmup_steps=100,\n",
+ " max_steps=12276,\n",
+ " learning_rate=2e-4,\n",
+ " fp16=True,\n",
+ " eval_steps= 1000,\n",
+ " logging_steps=1000,\n",
+ " save_steps=1000,\n",
+ " evaluation_strategy=\"steps\",\n",
+ " do_eval=True,\n",
+ " load_best_model_at_end=True,\n",
+ " metric_for_best_model=\"f1\",\n",
+ " output_dir='model_outputs',\n",
+ " logging_dir='model_outputs',\n",
+ " remove_unused_columns =False, \n",
+ " report_to='wandb' # enable logging to W&B\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -221,90 +251,22 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "trainer = SFTTrainer(\n",
- " model,\n",
- " train_dataset=dataset,\n",
- " dataset_text_field=\"text\",\n",
- " peft_config=peft_config,\n",
- " max_seq_length=model_params['model_max_length']\n",
- ")\n",
+ "# trainer = SFTTrainer(\n",
+ "# model,\n",
+ "# train_dataset=dataset,\n",
+ "# dataset_text_field=\"text\",\n",
+ "# peft_config=peft_config,\n",
+ "# max_seq_length=model_params['model_max_length']\n",
+ "# )\n",
  "\n",
- "trainer.train()"
+ "# trainer.train()"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "4fbe714ca43d4e53aec27f4ce4fb4706",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading builder script: 0%| | 0.00/6.77k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "826f51589454434b891a94b0d5ef8a73",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading builder script: 0%| | 0.00/7.36k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "81418551f332492293ee9795f98a62f7",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading builder script: 0%| | 0.00/4.20k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "367f897f76f845d782ebc3f9be4eec4d",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading builder script: 0%| | 0.00/7.55k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "ename": "NameError",
- "evalue": "name 'lora_model' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[8], line 18\u001b[0m\n\u001b[1;32m 13\u001b[0m results\u001b[38;5;241m.\u001b[39mupdate(precision_metric\u001b[38;5;241m.\u001b[39mcompute(predictions\u001b[38;5;241m=\u001b[39mpredictions, references \u001b[38;5;241m=\u001b[39m labels, average\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmacro\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m 17\u001b[0m trainer \u001b[38;5;241m=\u001b[39m transformers\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[0;32m---> 18\u001b[0m model\u001b[38;5;241m=\u001b[39m\u001b[43mlora_model\u001b[49m,\n\u001b[1;32m 19\u001b[0m train_dataset\u001b[38;5;241m=\u001b[39mtrain_dataset,\n\u001b[1;32m 20\u001b[0m eval_dataset\u001b[38;5;241m=\u001b[39mval_dataset,\n\u001b[1;32m 21\u001b[0m compute_metrics\u001b[38;5;241m=\u001b[39mcompute_metrics,\n\u001b[1;32m 22\u001b[0m args\u001b[38;5;241m=\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mTrainingArguments(\n\u001b[1;32m 23\u001b[0m per_device_train_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m8\u001b[39m,\n\u001b[1;32m 24\u001b[0m per_device_eval_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m32\u001b[39m,\n\u001b[1;32m 25\u001b[0m gradient_accumulation_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m,\n\u001b[1;32m 26\u001b[0m warmup_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m,\n\u001b[1;32m 27\u001b[0m max_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m12276\u001b[39m,\n\u001b[1;32m 28\u001b[0m learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2e-4\u001b[39m,\n\u001b[1;32m 29\u001b[0m fp16\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 30\u001b[0m eval_steps\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1000\u001b[39m,\n\u001b[1;32m 31\u001b[0m logging_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1000\u001b[39m,\n\u001b[1;32m 32\u001b[0m save_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1000\u001b[39m,\n\u001b[1;32m 33\u001b[0m evaluation_strategy\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msteps\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 34\u001b[0m do_eval\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 35\u001b[0m load_best_model_at_end\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 36\u001b[0m metric_for_best_model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mf1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 37\u001b[0m output_dir\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel_outputs\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 38\u001b[0m logging_dir\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel_outputs\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 39\u001b[0m remove_unused_columns \u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \n\u001b[1;32m 40\u001b[0m report_to\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwandb\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# enable logging to W&B\u001b[39;00m\n\u001b[1;32m 41\u001b[0m ),\n\u001b[1;32m 42\u001b[0m )\n\u001b[1;32m 43\u001b[0m trainer\u001b[38;5;241m.\u001b[39mtrain()\n",
- "\u001b[0;31mNameError\u001b[0m: name 'lora_model' is not defined"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "f1_metric = evaluate.load(\"f1\")\n",
  "recall_metric = evaluate.load(\"recall\")\n",
@@ -328,26 +290,7 @@
  " train_dataset=model_params['train_data'],\n",
  " eval_dataset=model_params['validation_data'],\n",
  " compute_metrics=compute_metrics,\n",
- " args=transformers.TrainingArguments(\n",
- " per_device_train_batch_size=model_params['batch_size'],\n",
- " per_device_eval_batch_size=model_params['batch_size'],\n",
- " gradient_accumulation_steps=model_params['gradient_accumulation'],\n",
- " warmup_steps=100,\n",
- " max_steps=12276,\n",
- " learning_rate=model_params['lr'],\n",
- " fp16=True,\n",
- " eval_steps= 1000,\n",
- " logging_steps=1000,\n",
- " save_steps=1000,\n",
- " evaluation_strategy=model_params['evaluation_strategy'],\n",
- " do_eval=True,\n",
- " load_best_model_at_end=True,\n",
- " metric_for_best_model=\"f1\",\n",
- " output_dir='model_outputs',\n",
- " logging_dir='model_outputs',\n",
- " remove_unused_columns =False, \n",
- " report_to='wandb' # enable logging to W&B\n",
- " ),\n",
+ " args=args_custom\n",
  ")\n",
  "trainer.train()"
  ]
app.py CHANGED
@@ -105,6 +105,49 @@ for key, value in model_params.items():
 
  print(model_params)
 
+ args_custom=transformers.TrainingArguments(
+ per_device_train_batch_size=model_params['batch_size'],
+ per_device_eval_batch_size=model_params['batch_size'],
+ gradient_accumulation_steps=model_params['gradient_accumulation'],
+ warmup_ratio=model_params['warmup_ratio'],
+ num_epochs=model_params['epochs'],
+ learning_rate=model_params['lr'],
+ fp16=True,
+ logging_steps=model_params['logging_steps'],
+ save_total_limit=model_params['save_total_limit'],
+ evaluation_strategy=model_params['evaluation_strategy'],
+ metric_for_best_model="f1",
+ output_dir='model_outputs',
+ logging_dir='model_outputs',
+ optim=model_params['optimizer'],
+ max_grad_norm=model_params['max_grad_norm'],
+ weight_decay=model_params['weight_decay'],
+ lr_scheduler_type=model_params['scheduler']
+ )
+
+ ### Args from medium article
+ args_medium=transformers.TrainingArguments(
+ per_device_train_batch_size=8,
+ per_device_eval_batch_size=32,
+ gradient_accumulation_steps=4,
+ warmup_steps=100,
+ max_steps=12276,
+ learning_rate=2e-4,
+ fp16=True,
+ eval_steps= 1000,
+ logging_steps=1000,
+ save_steps=1000,
+ evaluation_strategy="steps",
+ do_eval=True,
+ load_best_model_at_end=True,
+ metric_for_best_model="f1",
+ output_dir='model_outputs',
+ logging_dir='model_outputs',
+ remove_unused_columns =False,
+ report_to='wandb' # enable logging to W&B
+ )
+ ###
+
  ### Load model and peft config, calculate trainable parameters
  model = AutoModelForCausalLM.from_pretrained(
  model_name,
@@ -141,25 +184,6 @@ trainer = transformers.Trainer(
  train_dataset=model_params['train_data'],
  eval_dataset=model_params['validation_data'],
  compute_metrics=compute_metrics,
- args=transformers.TrainingArguments(
- per_device_train_batch_size=model_params['batch_size'],
- per_device_eval_batch_size=model_params['batch_size'],
- gradient_accumulation_steps=model_params['gradient_accumulation'],
- warmup_steps=100,
- max_steps=12276,
- learning_rate=model_params['lr'],
- fp16=True,
- eval_steps= 1000,
- logging_steps=1000,
- save_steps=1000,
- evaluation_strategy=model_params['evaluation_strategy'],
- do_eval=True,
- load_best_model_at_end=True,
- metric_for_best_model="f1",
- output_dir='model_outputs',
- logging_dir='model_outputs',
- remove_unused_columns =False,
- report_to='wandb' # enable logging to W&B
- ),
+ args=args_custom
  )
  trainer.train()
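Because both configurations are now named objects, switching a run from the custom settings to the Medium-article settings is a one-argument change at the Trainer call. A hedged illustration of that design choice (the use_medium_args toggle is hypothetical and not part of the commit; model, model_params, compute_metrics, args_custom, and args_medium are assumed to come from app.py as shown above):

use_medium_args = False  # flip to train with the Medium-article settings instead

trainer = transformers.Trainer(
    model=model,
    train_dataset=model_params['train_data'],
    eval_dataset=model_params['validation_data'],
    compute_metrics=compute_metrics,
    args=args_medium if use_medium_args else args_custom,
)
trainer.train()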