mrsteyk committed on
Commit
e437553
·
1 Parent(s): 9c6d908

Upload finetune.ipynb

Browse files

The notebook I used for local fine-tuning. Note that grouping was not done for this model (this is 1.0).

Files changed (1) hide show
  1. finetune.ipynb +581 -0
finetune.ipynb ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import datasets\n",
10
+ "import transformers\n",
11
+ "import torch\n",
12
+ "\n",
13
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "MODEL = \"EleutherAI/pythia-125m-deduped\"\n",
23
+ "\n",
24
+ "config = AutoConfig.from_pretrained(MODEL)\n",
25
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)\n",
26
+ "model = AutoModelForCausalLM.from_pretrained(MODEL)"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 3,
32
+ "metadata": {},
33
+ "outputs": [
34
+ {
35
+ "name": "stdout",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "Added 1 tokens!\n"
39
+ ]
40
+ }
41
+ ],
42
+ "source": [
43
+ "# @title Extend model\n",
44
+ "\n",
45
+ "num_added_tokens = tokenizer.add_special_tokens({\"sep_token\": \"<|STK_SP|>\"})\n",
46
+ "print(f\"Added {num_added_tokens} tokens!\")\n",
47
+ "model.resize_token_embeddings(len(tokenizer))\n",
48
+ "\n",
49
+ "# TODO: confirm that reusing eos_token as the pad token is the right choice here\n",
50
+ "tokenizer.pad_token = tokenizer.eos_token\n",
51
+ "\n",
52
+ "assert tokenizer.sep_token == \"<|STK_SP|>\""
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 4,
58
+ "metadata": {},
59
+ "outputs": [
60
+ {
61
+ "name": "stderr",
62
+ "output_type": "stream",
63
+ "text": [
64
+ "Using custom data configuration default-b39c74bc29b6f917\n",
65
+ "Found cached dataset json (C:/Users/lego-/.cache/huggingface/datasets/json/default-b39c74bc29b6f917/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
66
+ ]
67
+ },
68
+ {
69
+ "data": {
70
+ "application/vnd.jupyter.widget-view+json": {
71
+ "model_id": "a5ad5093bc064d4096b9646f195e4723",
72
+ "version_major": 2,
73
+ "version_minor": 0
74
+ },
75
+ "text/plain": [
76
+ " 0%| | 0/2 [00:00<?, ?it/s]"
77
+ ]
78
+ },
79
+ "metadata": {},
80
+ "output_type": "display_data"
81
+ }
82
+ ],
83
+ "source": [
84
+ "# @title Load in the dataset\n",
85
+ "\n",
86
+ "from datasets import load_dataset\n",
87
+ "\n",
88
+ "data_files = {\n",
89
+ " \"train\": \"./dataset-r1/train.jsonl\",\n",
90
+ " \"validation\": \"./dataset-r1/valid.jsonl\",\n",
91
+ "}\n",
92
+ "\n",
93
+ "raw_datasets = load_dataset(\n",
94
+ " \"json\",\n",
95
+ " data_files=data_files,\n",
96
+ ")"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 5,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "name": "stderr",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "Loading cached processed dataset at C:\\Users\\lego-\\.cache\\huggingface\\datasets\\json\\default-b39c74bc29b6f917\\0.0.0\\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\\cache-d06df8923a2befa8.arrow\n",
109
+ "Loading cached processed dataset at C:\\Users\\lego-\\.cache\\huggingface\\datasets\\json\\default-b39c74bc29b6f917\\0.0.0\\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\\cache-847113bf21349cf9.arrow\n"
110
+ ]
111
+ },
112
+ {
113
+ "name": "stdout",
114
+ "output_type": "stream",
115
+ "text": [
116
+ "Total processed datasets sizes are 2755 150\n"
117
+ ]
118
+ }
119
+ ],
120
+ "source": [
121
+ "# @title Tokenize the dataset\n",
122
+ "tokenized_datasets = raw_datasets.map(\n",
123
+ " lambda e: tokenizer(e[\"input\"] + e[\"output\"] + tokenizer.eos_token),\n",
124
+ " #batched=True,\n",
125
+ " #num_proc=4,\n",
126
+ " remove_columns=[\"input\", \"output\", \"coder\", \"system\", \"god\", \"user\", \"ai\", \"topic\"]\n",
127
+ ")\n",
128
+ "\n",
129
+ "for i in range(len(tokenized_datasets[\"train\"])):\n",
130
+ " if len(tokenized_datasets[\"train\"][i][\"input_ids\"]) > config.max_position_embeddings:\n",
131
+ " print(f\"Error in {i} of train\")\n",
132
+ "for i in range(len(tokenized_datasets[\"validation\"])):\n",
133
+ " if len(tokenized_datasets[\"validation\"][i][\"input_ids\"]) > config.max_position_embeddings:\n",
134
+ " print(f\"Error in {i} of validation\")\n",
135
+ "\n",
136
+ "# [tokenized_datasets[\"train\"][1], tokenized_datasets[\"validation\"][1]]\n",
137
+ "print(\"Total processed datasets sizes are \", len(tokenized_datasets[\"train\"]), len(tokenized_datasets[\"validation\"]))"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 6,
143
+ "metadata": {},
144
+ "outputs": [
145
+ {
146
+ "data": {
147
+ "application/vnd.jupyter.widget-view+json": {
148
+ "model_id": "0cad348a2c094680ac2b0ab5e7dc2c8c",
149
+ "version_major": 2,
150
+ "version_minor": 0
151
+ },
152
+ "text/plain": [
153
+ "Grouping texts in chunks of 2048: 0%| | 0/3 [00:00<?, ?ba/s]"
154
+ ]
155
+ },
156
+ "metadata": {},
157
+ "output_type": "display_data"
158
+ },
159
+ {
160
+ "data": {
161
+ "application/vnd.jupyter.widget-view+json": {
162
+ "model_id": "eef956243d5542fcbf41bfdaa04ad5ea",
163
+ "version_major": 2,
164
+ "version_minor": 0
165
+ },
166
+ "text/plain": [
167
+ "Grouping texts in chunks of 2048: 0%| | 0/1 [00:00<?, ?ba/s]"
168
+ ]
169
+ },
170
+ "metadata": {},
171
+ "output_type": "display_data"
172
+ },
173
+ {
174
+ "name": "stdout",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "Total LM datasets sizes are 628 31\n"
178
+ ]
179
+ }
180
+ ],
181
+ "source": [
182
+ "# Group the tokenized texts into chunks of block_size tokens\n",
183
+ "\n",
184
+ "from itertools import chain\n",
185
+ "\n",
186
+ "block_size = 2048\n",
187
+ "def group_texts(examples):\n",
188
+ " # Concatenate all texts.\n",
189
+ " #print(list(chain(*examples['input_ids'])))\n",
190
+ " concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
191
+ " total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
192
+ " # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
193
+ " # customize this part to your needs.\n",
194
+ " if total_length >= block_size:\n",
195
+ " total_length = (total_length // block_size) * block_size\n",
196
+ " # Split by chunks of max_len.\n",
197
+ " result = {\n",
198
+ " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
199
+ " for k, t in concatenated_examples.items()\n",
200
+ " }\n",
201
+ " result[\"labels\"] = result[\"input_ids\"].copy()\n",
202
+ " return result\n",
203
+ "\n",
204
+ "lm_datasets = tokenized_datasets.map(\n",
205
+ " group_texts,\n",
206
+ " batched=True,\n",
207
+ " # num_proc=data_args.preprocessing_num_workers,\n",
208
+ " load_from_cache_file=False,\n",
209
+ " desc=f\"Grouping texts in chunks of {block_size}\",\n",
210
+ ")\n",
211
+ "\n",
212
+ "print(\"Total LM datasets sizes are \", len(lm_datasets[\"train\"]), len(lm_datasets[\"validation\"]))"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 7,
218
+ "metadata": {},
219
+ "outputs": [
220
+ {
221
+ "name": "stdout",
222
+ "output_type": "stream",
223
+ "text": [
224
+ "Using magick windows DLL!\n",
225
+ "CUDA SETUP: Loading binary d:\\projects\\python\\distilchatgpt2\\venv\\lib\\site-packages\\bitsandbytes\\libbitsandbytes_cudaall.dll...\n"
226
+ ]
227
+ },
228
+ {
229
+ "name": "stderr",
230
+ "output_type": "stream",
231
+ "text": [
232
+ "Using cuda_amp half precision backend\n"
233
+ ]
234
+ }
235
+ ],
236
+ "source": [
237
+ "from transformers import Trainer, TrainingArguments, default_data_collator, DataCollatorWithPadding\n",
238
+ "from transformers.trainer_pt_utils import get_parameter_names\n",
239
+ "import evaluate\n",
240
+ "\n",
241
+ "import bitsandbytes as bnb\n",
242
+ "from bitsandbytes.optim import GlobalOptimManager\n",
243
+ "\n",
244
+ "def preprocess_logits_for_metrics(logits, labels):\n",
245
+ " if isinstance(logits, tuple):\n",
246
+ " # Depending on the model and config, logits may contain extra tensors,\n",
247
+ " # like past_key_values, but logits always come first\n",
248
+ " logits = logits[0]\n",
249
+ " return logits.argmax(dim=-1)\n",
250
+ "\n",
251
+ "metric = evaluate.load(\"accuracy\")\n",
252
+ "\n",
253
+ "def compute_metrics(eval_preds):\n",
254
+ " preds, labels = eval_preds\n",
255
+ " # preds have the same shape as the labels, after the argmax(-1) has been calculated\n",
256
+ " # by preprocess_logits_for_metrics but we need to shift the labels\n",
257
+ " labels = labels[:, 1:].reshape(-1)\n",
258
+ " preds = preds[:, :-1].reshape(-1)\n",
259
+ " return metric.compute(predictions=preds, references=labels)\n",
260
+ "\n",
261
+ "model.config.use_cache = False\n",
262
+ "\n",
263
+ "#data_collator_pad = DataCollatorWithPadding(tokenizer)\n",
264
+ "def data_collator(data_):\n",
265
+ " data = default_data_collator(data_)\n",
266
+ " #print(data)\n",
267
+ " return {'input_ids': torch.stack([i for i in data['input_ids']]),\n",
268
+ " 'attention_mask': torch.stack([i for i in data['attention_mask']]),\n",
269
+ " 'labels': torch.stack([i for i in data['input_ids']])}\n",
270
+ "\n",
271
+ "training_args = TrainingArguments(\n",
272
+ " \"./openchatgpt-neox-r1.1/\",\n",
273
+ " do_train=True, \n",
274
+ " do_eval=True,\n",
275
+ " \n",
276
+ " push_to_hub=False,\n",
277
+ "\n",
278
+ " # Pulled from examples\n",
279
+ " evaluation_strategy=\"epoch\",\n",
280
+ " #learning_rate=2e-5,\n",
281
+ " #weight_decay=0.01,\n",
282
+ "\n",
283
+ " save_steps=300,\n",
284
+ "\n",
285
+ " per_device_train_batch_size=1,\n",
286
+ " per_device_eval_batch_size=1,\n",
287
+ "\n",
288
+ " gradient_accumulation_steps=2,\n",
289
+ " gradient_checkpointing=True,\n",
290
+ "\n",
291
+ " fp16=True,\n",
292
+ ")\n",
293
+ "\n",
294
+ "optim = bnb.optim.Adam8bit\n",
295
+ "def set_optim_to_run_embedding_in_fp32(model):\n",
296
+ " for module in model.modules():\n",
297
+ " if isinstance(module, torch.nn.Embedding):\n",
298
+ " GlobalOptimManager.get_instance().register_module_override(module, 'weight', {'optim_bits': 32})\n",
299
+ "set_optim_to_run_embedding_in_fp32(model)\n",
300
+ "# model.cuda()\n",
301
+ "\n",
302
+ "decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])\n",
303
+ "decay_parameters = [name for name in decay_parameters if \"bias\" not in name]\n",
304
+ "optimizer_grouped_parameters = [\n",
305
+ " {\n",
306
+ " \"params\": [p for n, p in model.named_parameters() if n in decay_parameters],\n",
307
+ " \"weight_decay\": training_args.weight_decay,\n",
308
+ " },\n",
309
+ " {\n",
310
+ " \"params\": [p for n, p in model.named_parameters() if n not in decay_parameters],\n",
311
+ " \"weight_decay\": 0.0,\n",
312
+ " },\n",
313
+ "]\n",
314
+ "\n",
315
+ "adam_bnb_optim = optim(\n",
316
+ " optimizer_grouped_parameters,\n",
317
+ " betas=(training_args.adam_beta1, training_args.adam_beta2),\n",
318
+ " eps=training_args.adam_epsilon,\n",
319
+ " lr=training_args.learning_rate,\n",
320
+ ")\n",
321
+ "\n",
322
+ "trainer = Trainer(\n",
323
+ " model=model,\n",
324
+ " #train_dataset=tokenized_datasets[\"train\"],\n",
325
+ " #eval_dataset=tokenized_datasets[\"validation\"],\n",
326
+ " train_dataset=lm_datasets[\"train\"],\n",
327
+ " eval_dataset=lm_datasets[\"validation\"],\n",
328
+ " tokenizer=tokenizer,\n",
329
+ "\n",
330
+ " data_collator=data_collator,\n",
331
+ " compute_metrics=compute_metrics,\n",
332
+ " preprocess_logits_for_metrics=preprocess_logits_for_metrics,\n",
333
+ "\n",
334
+ " # data_collator=lambda data: {'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in data]),\n",
335
+ " # 'attention_mask': torch.stack([torch.tensor(f['attention_mask']) for f in data]),\n",
336
+ " # 'labels': torch.stack([torch.tensor(f['input_ids']) for f in data])},\n",
337
+ "\n",
338
+ " args=training_args,\n",
339
+ "\n",
340
+ " optimizers=(adam_bnb_optim, None),\n",
341
+ ")"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 8,
347
+ "metadata": {},
348
+ "outputs": [
349
+ {
350
+ "name": "stdout",
351
+ "output_type": "stream",
352
+ "text": [
353
+ "No last checkpoint detected!\n"
354
+ ]
355
+ }
356
+ ],
357
+ "source": [
358
+ "# @title Get last model checkpoint if any...\n",
359
+ "\n",
360
+ "from transformers.trainer_utils import get_last_checkpoint\n",
361
+ "\n",
362
+ "last_checkpoint = get_last_checkpoint(\"./openchatgpt-neox-r1.1/\")\n",
363
+ "if last_checkpoint is None:\n",
364
+ " print(\"No last checkpoint detected!\")"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "code",
369
+ "execution_count": 9,
370
+ "metadata": {},
371
+ "outputs": [
372
+ {
373
+ "name": "stderr",
374
+ "output_type": "stream",
375
+ "text": [
376
+ "***** Running training *****\n",
377
+ " Num examples = 628\n",
378
+ " Num Epochs = 3\n",
379
+ " Instantaneous batch size per device = 1\n",
380
+ " Total train batch size (w. parallel, distributed & accumulation) = 2\n",
381
+ " Gradient Accumulation steps = 2\n",
382
+ " Total optimization steps = 942\n",
383
+ " Number of trainable parameters = 162283008\n"
384
+ ]
385
+ },
386
+ {
387
+ "data": {
388
+ "text/html": [
389
+ "\n",
390
+ " <div>\n",
391
+ " \n",
392
+ " <progress value='942' max='942' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
393
+ " [942/942 1:31:15, Epoch 3/3]\n",
394
+ " </div>\n",
395
+ " <table border=\"1\" class=\"dataframe\">\n",
396
+ " <thead>\n",
397
+ " <tr style=\"text-align: left;\">\n",
398
+ " <th>Epoch</th>\n",
399
+ " <th>Training Loss</th>\n",
400
+ " <th>Validation Loss</th>\n",
401
+ " <th>Accuracy</th>\n",
402
+ " </tr>\n",
403
+ " </thead>\n",
404
+ " <tbody>\n",
405
+ " <tr>\n",
406
+ " <td>1</td>\n",
407
+ " <td>No log</td>\n",
408
+ " <td>0.881487</td>\n",
409
+ " <td>0.787100</td>\n",
410
+ " </tr>\n",
411
+ " <tr>\n",
412
+ " <td>2</td>\n",
413
+ " <td>0.811800</td>\n",
414
+ " <td>0.871694</td>\n",
415
+ " <td>0.791922</td>\n",
416
+ " </tr>\n",
417
+ " <tr>\n",
418
+ " <td>3</td>\n",
419
+ " <td>0.811800</td>\n",
420
+ " <td>0.896573</td>\n",
421
+ " <td>0.792001</td>\n",
422
+ " </tr>\n",
423
+ " </tbody>\n",
424
+ "</table><p>"
425
+ ],
426
+ "text/plain": [
427
+ "<IPython.core.display.HTML object>"
428
+ ]
429
+ },
430
+ "metadata": {},
431
+ "output_type": "display_data"
432
+ },
433
+ {
434
+ "name": "stderr",
435
+ "output_type": "stream",
436
+ "text": [
437
+ "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-300\n",
438
+ "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-300\\config.json\n",
439
+ "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-300\\pytorch_model.bin\n",
440
+ "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-300\\tokenizer_config.json\n",
441
+ "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-300\\special_tokens_map.json\n",
442
+ "***** Running Evaluation *****\n",
443
+ " Num examples = 31\n",
444
+ " Batch size = 1\n",
445
+ "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-600\n",
446
+ "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-600\\config.json\n",
447
+ "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-600\\pytorch_model.bin\n",
448
+ "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-600\\tokenizer_config.json\n",
449
+ "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-600\\special_tokens_map.json\n",
450
+ "***** Running Evaluation *****\n",
451
+ " Num examples = 31\n",
452
+ " Batch size = 1\n",
453
+ "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-900\n",
454
+ "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-900\\config.json\n",
455
+ "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-900\\pytorch_model.bin\n",
456
+ "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-900\\tokenizer_config.json\n",
457
+ "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-900\\special_tokens_map.json\n",
458
+ "***** Running Evaluation *****\n",
459
+ " Num examples = 31\n",
460
+ " Batch size = 1\n",
461
+ "\n",
462
+ "\n",
463
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
464
+ "\n",
465
+ "\n"
466
+ ]
467
+ },
468
+ {
469
+ "data": {
470
+ "text/plain": [
471
+ "TrainOutput(global_step=942, training_loss=0.6499279856428726, metrics={'train_runtime': 5481.9853, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.172, 'total_flos': 2863022229946368.0, 'train_loss': 0.6499279856428726, 'epoch': 3.0})"
472
+ ]
473
+ },
474
+ "execution_count": 9,
475
+ "metadata": {},
476
+ "output_type": "execute_result"
477
+ }
478
+ ],
479
+ "source": [
480
+ "trainer.train(resume_from_checkpoint=last_checkpoint)"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 10,
486
+ "metadata": {},
487
+ "outputs": [
488
+ {
489
+ "name": "stderr",
490
+ "output_type": "stream",
491
+ "text": [
492
+ "***** Running Evaluation *****\n",
493
+ " Num examples = 31\n",
494
+ " Batch size = 1\n"
495
+ ]
496
+ },
497
+ {
498
+ "data": {
499
+ "text/html": [
500
+ "\n",
501
+ " <div>\n",
502
+ " \n",
503
+ " <progress value='31' max='31' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
504
+ " [31/31 00:25]\n",
505
+ " </div>\n",
506
+ " "
507
+ ],
508
+ "text/plain": [
509
+ "<IPython.core.display.HTML object>"
510
+ ]
511
+ },
512
+ "metadata": {},
513
+ "output_type": "display_data"
514
+ },
515
+ {
516
+ "name": "stdout",
517
+ "output_type": "stream",
518
+ "text": [
519
+ "Perplexity: 2.45\n"
520
+ ]
521
+ }
522
+ ],
523
+ "source": [
524
+ "import math\n",
525
+ "eval_results = trainer.evaluate()\n",
526
+ "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
527
+ ]
528
+ },
529
+ {
530
+ "cell_type": "code",
531
+ "execution_count": 11,
532
+ "metadata": {},
533
+ "outputs": [
534
+ {
535
+ "name": "stderr",
536
+ "output_type": "stream",
537
+ "text": [
538
+ "Dropping the following result as it does not have all the necessary fields:\n",
539
+ "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7920008824873537}]}\n",
540
+ "Saving model checkpoint to ./openchatgpt-neox-r1.1/\n",
541
+ "Configuration saved in ./openchatgpt-neox-r1.1/config.json\n",
542
+ "Model weights saved in ./openchatgpt-neox-r1.1/pytorch_model.bin\n",
543
+ "tokenizer config file saved in ./openchatgpt-neox-r1.1/tokenizer_config.json\n",
544
+ "Special tokens file saved in ./openchatgpt-neox-r1.1/special_tokens_map.json\n"
545
+ ]
546
+ }
547
+ ],
548
+ "source": [
549
+ "trainer.save_state()\n",
550
+ "trainer.create_model_card(tasks=\"text-generation\", finetuned_from=MODEL, dataset=\"openchatgpt safe-r1\")\n",
551
+ "trainer.save_model()"
552
+ ]
553
+ }
554
+ ],
555
+ "metadata": {
556
+ "kernelspec": {
557
+ "display_name": "Python 3 (ipykernel)",
558
+ "language": "python",
559
+ "name": "python3"
560
+ },
561
+ "language_info": {
562
+ "codemirror_mode": {
563
+ "name": "ipython",
564
+ "version": 3
565
+ },
566
+ "file_extension": ".py",
567
+ "mimetype": "text/x-python",
568
+ "name": "python",
569
+ "nbconvert_exporter": "python",
570
+ "pygments_lexer": "ipython3",
571
+ "version": "3.9.1"
572
+ },
573
+ "vscode": {
574
+ "interpreter": {
575
+ "hash": "545eac55c68d45fc1a0aaedcc380eacb641aa49675db0309d358f8f72d496c6d"
576
+ }
577
+ }
578
+ },
579
+ "nbformat": 4,
580
+ "nbformat_minor": 2
581
+ }