{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import datasets\n", "import transformers\n", "import torch\n", "\n", "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "MODEL = \"EleutherAI/pythia-125m-deduped\"\n", "\n", "config = AutoConfig.from_pretrained(MODEL)\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)\n", "model = AutoModelForCausalLM.from_pretrained(MODEL)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Added 1 tokens!\n" ] } ], "source": [ "# @title Extend model\n", "\n", "num_added_tokens = tokenizer.add_special_tokens({\"sep_token\": \"<|STK_SP|>\"})\n", "print(f\"Added {num_added_tokens} tokens!\")\n", "model.resize_token_embeddings(len(tokenizer))\n", "\n", "# TODO: ???\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", "assert tokenizer.sep_token == \"<|STK_SP|>\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration default-b39c74bc29b6f917\n", "Found cached dataset json (C:/Users/lego-/.cache/huggingface/datasets/json/default-b39c74bc29b6f917/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a5ad5093bc064d4096b9646f195e4723", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/2 [00:00 config.max_position_embeddings:\n", " print(f\"Error in {i} of train\")\n", "for i in range(len(tokenized_datasets[\"validation\"])):\n", " if len(tokenized_datasets[\"validation\"][i][\"input_ids\"]) > config.max_position_embeddings:\n", " print(f\"Error in {i} of validation\")\n", "\n", "# [tokenized_datasets[\"train\"][1], tokenized_datasets[\"validation\"][1]]\n", "print(\"Total processed datasets sizes are \", len(tokenized_datasets[\"train\"]), len(tokenized_datasets[\"validation\"]))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0cad348a2c094680ac2b0ab5e7dc2c8c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Grouping texts in chunks of 2048: 0%| | 0/3 [00:00= block_size:\n", " total_length = (total_length // block_size) * block_size\n", " # Split by chunks of max_len.\n", " result = {\n", " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", " for k, t in concatenated_examples.items()\n", " }\n", " result[\"labels\"] = result[\"input_ids\"].copy()\n", " return result\n", "\n", "lm_datasets = tokenized_datasets.map(\n", " group_texts,\n", " batched=True,\n", " # num_proc=data_args.preprocessing_num_workers,\n", " load_from_cache_file=False,\n", " desc=f\"Grouping texts in chunks of {block_size}\",\n", ")\n", "\n", "print(\"Total LM datasets sizes are \", len(lm_datasets[\"train\"]), len(lm_datasets[\"validation\"]))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using magick windows DLL!\n", "CUDA SETUP: Loading binary d:\\projects\\python\\distilchatgpt2\\venv\\lib\\site-packages\\bitsandbytes\\libbitsandbytes_cudaall.dll...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Using cuda_amp half precision backend\n" ] } ], "source": [ 
"from transformers import Trainer, TrainingArguments, default_data_collator, DataCollatorWithPadding\n", "from transformers.trainer_pt_utils import get_parameter_names\n", "import evaluate\n", "\n", "import bitsandbytes as bnb\n", "from bitsandbytes.optim import GlobalOptimManager\n", "\n", "def preprocess_logits_for_metrics(logits, labels):\n", " if isinstance(logits, tuple):\n", " # Depending on the model and config, logits may contain extra tensors,\n", " # like past_key_values, but logits always come first\n", " logits = logits[0]\n", " return logits.argmax(dim=-1)\n", "\n", "metric = evaluate.load(\"accuracy\")\n", "\n", "def compute_metrics(eval_preds):\n", " preds, labels = eval_preds\n", " # preds have the same shape as the labels, after the argmax(-1) has been calculated\n", " # by preprocess_logits_for_metrics but we need to shift the labels\n", " labels = labels[:, 1:].reshape(-1)\n", " preds = preds[:, :-1].reshape(-1)\n", " return metric.compute(predictions=preds, references=labels)\n", "\n", "model.config.use_cache = False\n", "\n", "#data_collator_pad = DataCollatorWithPadding(tokenizer)\n", "def data_collator(data_):\n", " data = default_data_collator(data_)\n", " #print(data)\n", " return {'input_ids': torch.stack([i for i in data['input_ids']]),\n", " 'attention_mask': torch.stack([i for i in data['attention_mask']]),\n", " 'labels': torch.stack([i for i in data['input_ids']])}\n", "\n", "training_args = TrainingArguments(\n", " \"./openchatgpt-neox-r1.1/\",\n", " do_train=True, \n", " do_eval=True,\n", " \n", " push_to_hub=False,\n", "\n", " # Pulled from examples\n", " evaluation_strategy=\"epoch\",\n", " #learning_rate=2e-5,\n", " #weight_decay=0.01,\n", "\n", " save_steps=300,\n", "\n", " per_device_train_batch_size=1,\n", " per_device_eval_batch_size=1,\n", "\n", " gradient_accumulation_steps=2,\n", " gradient_checkpointing=True,\n", "\n", " fp16=True,\n", ")\n", "\n", "optim = bnb.optim.Adam8bit\n", "def set_optim_to_run_embedding_in_fp32(model):\n", " for module in model.modules():\n", " if isinstance(module, torch.nn.Embedding):\n", " GlobalOptimManager.get_instance().register_module_override(module, 'weight', {'optim_bits': 32})\n", "set_optim_to_run_embedding_in_fp32(model)\n", "# model.cuda()\n", "\n", "decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])\n", "decay_parameters = [name for name in decay_parameters if \"bias\" not in name]\n", "optimizer_grouped_parameters = [\n", " {\n", " \"params\": [p for n, p in model.named_parameters() if n in decay_parameters],\n", " \"weight_decay\": training_args.weight_decay,\n", " },\n", " {\n", " \"params\": [p for n, p in model.named_parameters() if n not in decay_parameters],\n", " \"weight_decay\": 0.0,\n", " },\n", "]\n", "\n", "adam_bnb_optim = optim(\n", " optimizer_grouped_parameters,\n", " betas=(training_args.adam_beta1, training_args.adam_beta2),\n", " eps=training_args.adam_epsilon,\n", " lr=training_args.learning_rate,\n", ")\n", "\n", "trainer = Trainer(\n", " model=model,\n", " #train_dataset=tokenized_datasets[\"train\"],\n", " #eval_dataset=tokenized_datasets[\"validation\"],\n", " train_dataset=lm_datasets[\"train\"],\n", " eval_dataset=lm_datasets[\"validation\"],\n", " tokenizer=tokenizer,\n", "\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics,\n", " preprocess_logits_for_metrics=preprocess_logits_for_metrics,\n", "\n", " # data_collator=lambda data: {'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in data]),\n", " # 'attention_mask': 
torch.stack([torch.tensor(f['attention_mask']) for f in data]),\n", " # 'labels': torch.stack([torch.tensor(f['input_ids']) for f in data])},\n", "\n", " args=training_args,\n", "\n", " optimizers=(adam_bnb_optim, None),\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "No last checkpoint detected!\n" ] } ], "source": [ "# @title Get last model checkpoint if any...\n", "\n", "from transformers.trainer_utils import get_last_checkpoint\n", "\n", "last_checkpoint = get_last_checkpoint(\"./openchatgpt-neox-r1.1/\")\n", "if last_checkpoint is None:\n", " print(\"No last checkpoint detected!\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "***** Running training *****\n", " Num examples = 628\n", " Num Epochs = 3\n", " Instantaneous batch size per device = 1\n", " Total train batch size (w. parallel, distributed & accumulation) = 2\n", " Gradient Accumulation steps = 2\n", " Total optimization steps = 942\n", " Number of trainable parameters = 162283008\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [942/942 1:31:15, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracy
1No log0.8814870.787100
20.8118000.8716940.791922
30.8118000.8965730.792001

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-300\n", "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-300\\config.json\n", "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-300\\pytorch_model.bin\n", "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-300\\tokenizer_config.json\n", "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-300\\special_tokens_map.json\n", "***** Running Evaluation *****\n", " Num examples = 31\n", " Batch size = 1\n", "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-600\n", "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-600\\config.json\n", "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-600\\pytorch_model.bin\n", "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-600\\tokenizer_config.json\n", "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-600\\special_tokens_map.json\n", "***** Running Evaluation *****\n", " Num examples = 31\n", " Batch size = 1\n", "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-900\n", "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-900\\config.json\n", "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-900\\pytorch_model.bin\n", "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-900\\tokenizer_config.json\n", "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-900\\special_tokens_map.json\n", "***** Running Evaluation *****\n", " Num examples = 31\n", " Batch size = 1\n", "\n", "\n", "Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n" ] }, { "data": { "text/plain": [ "TrainOutput(global_step=942, training_loss=0.6499279856428726, metrics={'train_runtime': 5481.9853, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.172, 'total_flos': 2863022229946368.0, 'train_loss': 0.6499279856428726, 'epoch': 3.0})" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train(resume_from_checkpoint=last_checkpoint)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "***** Running Evaluation *****\n", " Num examples = 31\n", " Batch size = 1\n" ] }, { "data": { "text/html": [ "\n", "
<div>\n", "  <progress value='31' max='31' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", "  [31/31 00:25]\n", "</div>
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Perplexity: 2.45\n" ] } ], "source": [ "import math\n", "eval_results = trainer.evaluate()\n", "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Dropping the following result as it does not have all the necessary fields:\n", "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7920008824873537}]}\n", "Saving model checkpoint to ./openchatgpt-neox-r1.1/\n", "Configuration saved in ./openchatgpt-neox-r1.1/config.json\n", "Model weights saved in ./openchatgpt-neox-r1.1/pytorch_model.bin\n", "tokenizer config file saved in ./openchatgpt-neox-r1.1/tokenizer_config.json\n", "Special tokens file saved in ./openchatgpt-neox-r1.1/special_tokens_map.json\n" ] } ], "source": [ "trainer.save_state()\n", "trainer.create_model_card(tasks=\"text-generation\", finetuned_from=MODEL, dataset=\"openchatgpt safe-r1\")\n", "trainer.save_model()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.1" }, "vscode": { "interpreter": { "hash": "545eac55c68d45fc1a0aaedcc380eacb641aa49675db0309d358f8f72d496c6d" } } }, "nbformat": 4, "nbformat_minor": 2 }