{ "cells": [ { "cell_type": "markdown", "id": "9c3e4532", "metadata": { "papermill": { "duration": 1.084185, "end_time": "2023-10-21T05:49:17.395684", "exception": false, "start_time": "2023-10-21T05:49:16.311499", "status": "completed" }, "tags": [] }, "source": [ "# Train models using HuggingFace libraries\n", "\n", "This notebook takes parameters from a params.json file which is automatically\n", "created by Substratus K8s operator.\n", "\n", "The following parameters influence what happens in this notebook:\n", "- `dataset_urls`: A comma separated list of URLs. The URLs should point to\n", " json files that contain your training dataset. If unset a json or jsonl\n", " file should be present under the `/content/data/` directory.\n", "- `prompt_template`: The prompt template to use for training\n", "- `push_to_hub`: if this variable is set a repo id, then the trained\n", " model will get pushed to HuggingFace hub. For example,\n", " set it to \"substratusai/my-model\" to publish to substratusai HF org." ] }, { "cell_type": "code", "execution_count": 1, "id": "86ccd646", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:49:19.339339Z", "iopub.status.busy": "2023-10-21T05:49:19.338625Z", "iopub.status.idle": "2023-10-21T05:49:19.351013Z", "shell.execute_reply": "2023-10-21T05:49:19.350424Z" }, "papermill": { "duration": 0.924056, "end_time": "2023-10-21T05:49:19.352494", "exception": false, "start_time": "2023-10-21T05:49:18.428438", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-40/resolve/main/WithRetrieval-SchemaSplit-Train-40.json',\n", " 'logging_steps': 50,\n", " 'modules_to_save': 'embed_tokens, lm_head',\n", " 'num_train_epochs': 3,\n", " 'per_device_eval_batch_size': 1,\n", " 'per_device_train_batch_size': 1,\n", " 'prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. 
The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n{output}\\n',\n", " 'push_to_hub': 'substratusai/wgql-WithRetrieval-SchemaSplit-Train-40',\n", " 'save_steps': 50,\n", " 'target_modules': 'q_proj, up_proj, o_proj, k_proj, down_proj, gate_proj, v_proj',\n", " 'warmup_steps': 100}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import json\n", "from pathlib import Path\n", "\n", "params = {}\n", "params_path = Path(\"/content/params.json\")\n", "if params_path.is_file():\n", " with params_path.open(\"r\", encoding=\"UTF-8\") as params_file:\n", " params = json.load(params_file)\n", "\n", "\n", "params" ] }, { "cell_type": "code", "execution_count": 2, "id": "9fafd16b-d8c9-47bf-9116-c27b1d43a019", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:49:21.248001Z", "iopub.status.busy": "2023-10-21T05:49:21.247682Z", "iopub.status.idle": "2023-10-21T05:49:23.661856Z", "shell.execute_reply": "2023-10-21T05:49:23.661065Z" }, "papermill": { "duration": 3.288968, "end_time": "2023-10-21T05:49:23.663559", "exception": false, "start_time": "2023-10-21T05:49:20.374591", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-40/resolve/main/WithRetrieval-SchemaSplit-Train-40.json']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b6969f92a1334ecd9b1f632e5868c724", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00\n" ] } ], "source": [ "default_prompt = \"\"\"\n", "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", "### Instruction:\n", "{prompt}\n", "### Response:\n", "{completion}\n", "\"\"\"\n", "\n", "prompt = params.get(\"prompt_template\", default_prompt)\n", "\n", "eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)\n", "if prompt[-len(eos_token):] != eos_token:\n", " prompt = prompt + eos_token\n", "\n", "print(prompt)\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "0abf96e1-3bc1-4ae7-80ac-c2e585e9c7c1", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:54:00.755035Z", "iopub.status.busy": "2023-10-21T05:54:00.754343Z", "iopub.status.idle": "2023-10-21T05:54:01.608931Z", "shell.execute_reply": "2023-10-21T05:54:01.608154Z" }, "papermill": { "duration": 10.709526, "end_time": "2023-10-21T05:54:01.610675", "exception": false, "start_time": "2023-10-21T05:53:50.901149", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sat Oct 21 05:54:00 2023 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", "| | | MIG M. 
|\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\r\n", "| N/A 76C P0 36W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\r\n", "| N/A 75C P0 35W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 2 NVIDIA L4 Off | 00000000:00:06.0 Off | 0 |\r\n", "| N/A 75C P0 35W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 3 NVIDIA L4 Off | 00000000:00:07.0 Off | 0 |\r\n", "| N/A 75C P0 33W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", "+-----------------------------------------------------------------------------+\r\n", "| Processes: |\r\n", "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } ], "source": [ "! nvidia-smi" ] }, { "attachments": {}, "cell_type": "markdown", "id": "4d1e1795-c783-4ddf-999e-f1de19258928", "metadata": { "papermill": { "duration": 1.044535, "end_time": "2023-10-21T05:54:03.603440", "exception": false, "start_time": "2023-10-21T05:54:02.558905", "status": "completed" }, "tags": [] }, "source": [ "Prompt before fine tuning" ] }, { "cell_type": "code", "execution_count": 7, "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:54:05.579378Z", "iopub.status.busy": "2023-10-21T05:54:05.578687Z", "iopub.status.idle": "2023-10-21T05:54:05.601523Z", "shell.execute_reply": "2023-10-21T05:54:05.600770Z" }, "papermill": { "duration": 0.997872, "end_time": "2023-10-21T05:54:05.603060", "exception": false, "start_time": "2023-10-21T05:54:04.605188", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t32000: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from typing import Dict\n", "# source: https://github.com/artidoro/qlora\n", "DEFAULT_PAD_TOKEN = params.get(\"pad_token\", \"[PAD]\")\n", "\n", "def smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict: Dict,\n", " tokenizer: transformers.PreTrainedTokenizer,\n", " model: transformers.PreTrainedModel,\n", "):\n", " 
\"\"\"Resize tokenizer and embedding.\n", "\n", " Note: This is the unoptimized version that may make your embedding size not be divisible by 64.\n", " \"\"\"\n", " num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)\n", " model.resize_token_embeddings(len(tokenizer))\n", " if num_new_tokens > 0:\n", " input_embeddings_data = model.get_input_embeddings().weight.data\n", " output_embeddings_data = model.get_output_embeddings().weight.data\n", "\n", " input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", " output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", "\n", " input_embeddings_data[-num_new_tokens:] = input_embeddings_avg\n", " output_embeddings_data[-num_new_tokens:] = output_embeddings_avg\n", "\n", "if tokenizer._pad_token is None:\n", " smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),\n", " tokenizer=tokenizer,\n", " model=model,\n", " )\n", "\n", "if isinstance(tokenizer, transformers.LlamaTokenizer):\n", " # LLaMA tokenizer may not have correct special tokens set.\n", " # Check and add them if missing to prevent them from being parsed into different tokens.\n", " # Note that these are present in the vocabulary.\n", " # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.\n", " print('Adding special tokens.')\n", " tokenizer.add_special_tokens({\n", " \"eos_token\": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),\n", " \"bos_token\": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),\n", " \"unk_token\": tokenizer.convert_ids_to_tokens(\n", " model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id\n", " ),\n", " })\n", "\n", "tokenizer" ] }, { "cell_type": "code", "execution_count": 8, "id": "e78b510d", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:54:07.604577Z", "iopub.status.busy": "2023-10-21T05:54:07.603812Z", "iopub.status.idle": "2023-10-21T05:54:10.896691Z", "shell.execute_reply": "2023-10-21T05:54:10.896027Z" }, "papermill": { "duration": 4.319511, "end_time": "2023-10-21T05:54:10.898941", "exception": false, "start_time": "2023-10-21T05:54:06.579430", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c8b882b3f67b457a921a73aa350a8aee", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1493 [00:00, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", " target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", "\n", "modules_to_save = params.get(\"modules_to_save\")\n", "if modules_to_save:\n", " modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", "\n", "lora_config2 = LoraConfig(\n", " r=16,\n", " lora_alpha=16,\n", " 
    lora_dropout=0.05,\n",
"    bias=\"none\",\n",
"    task_type=\"CAUSAL_LM\",\n",
"    target_modules=target_modules,\n",
"    modules_to_save=modules_to_save,\n",
")\n",
"print(lora_config2)\n",
"\n",
"model = prepare_model_for_kbit_training(model)\n",
"\n",
"# add LoRA adapter\n",
"model = get_peft_model(model, lora_config2)\n",
"model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": 10, "id": "70a3e36c-62cf-45aa-8f37-0db0e40857dc", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:55:51.796898Z", "iopub.status.busy": "2023-10-21T05:55:51.795890Z", "iopub.status.idle": "2023-10-21T05:55:51.815502Z", "shell.execute_reply": "2023-10-21T05:55:51.814850Z" }, "papermill": { "duration": 1.051739, "end_time": "2023-10-21T05:55:51.817016", "exception": false, "start_time": "2023-10-21T05:55:50.765277", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "TrainingArguments(\n",
"_n_gpu=4,\n",
"adafactor=False,\n",
"adam_beta1=0.9,\n",
"adam_beta2=0.999,\n",
"adam_epsilon=1e-08,\n",
"auto_find_batch_size=False,\n",
"bf16=False,\n",
"bf16_full_eval=False,\n",
"data_seed=None,\n",
"dataloader_drop_last=False,\n",
"dataloader_num_workers=0,\n",
"dataloader_pin_memory=True,\n",
"ddp_backend=None,\n",
"ddp_broadcast_buffers=None,\n",
"ddp_bucket_cap_mb=None,\n",
"ddp_find_unused_parameters=None,\n",
"ddp_timeout=1800,\n",
"debug=[],\n",
"deepspeed=None,\n",
"disable_tqdm=False,\n",
"dispatch_batches=None,\n",
"do_eval=False,\n",
"do_predict=False,\n",
"do_train=False,\n",
"eval_accumulation_steps=None,\n",
"eval_delay=0,\n",
"eval_steps=None,\n",
"evaluation_strategy=no,\n",
"fp16=True,\n",
"fp16_backend=auto,\n",
"fp16_full_eval=False,\n",
"fp16_opt_level=O1,\n",
"fsdp=[],\n",
"fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n",
"fsdp_min_num_params=0,\n",
"fsdp_transformer_layer_cls_to_wrap=None,\n",
"full_determinism=False,\n",
"gradient_accumulation_steps=4,\n",
"gradient_checkpointing=False,\n",
"greater_is_better=None,\n",
"group_by_length=False,\n",
"half_precision_backend=auto,\n",
"hub_always_push=False,\n",
"hub_model_id=None,\n",
"hub_private_repo=False,\n",
"hub_strategy=every_save,\n",
"hub_token=<HUB_TOKEN>,\n",
"ignore_data_skip=False,\n",
"include_inputs_for_metrics=False,\n",
"include_tokens_per_second=False,\n",
"jit_mode_eval=False,\n",
"label_names=None,\n",
"label_smoothing_factor=0.0,\n",
"learning_rate=3e-05,\n",
"length_column_name=length,\n",
"load_best_model_at_end=False,\n",
"local_rank=0,\n",
"log_level=passive,\n",
"log_level_replica=warning,\n",
"log_on_each_node=True,\n",
"logging_dir=/content/artifacts/checkpoints/runs/Oct21_05-55-51_wgqlg-withretrieval-schemasplit-train-40-modeller-zmvfq,\n",
"logging_first_step=False,\n",
"logging_nan_inf_filter=True,\n",
"logging_steps=50,\n",
"logging_strategy=steps,\n",
"lr_scheduler_type=cosine,\n",
"max_grad_norm=1.0,\n",
"max_steps=-1,\n",
"metric_for_best_model=None,\n",
"mp_parameters=,\n",
"no_cuda=False,\n",
"num_train_epochs=3.0,\n",
"optim=paged_adamw_32bit,\n",
"optim_args=None,\n",
"output_dir=/content/artifacts/checkpoints,\n",
"overwrite_output_dir=False,\n",
"past_index=-1,\n",
"per_device_eval_batch_size=1,\n",
"per_device_train_batch_size=1,\n",
"prediction_loss_only=False,\n",
"push_to_hub=False,\n",
"push_to_hub_model_id=None,\n",
"push_to_hub_organization=None,\n",
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
"ray_scope=last,\n",
"remove_unused_columns=True,\n",
"report_to=[],\n",
"resume_from_checkpoint=None,\n",
"run_name=/content/artifacts/checkpoints,\n", "save_on_each_node=False,\n", "save_safetensors=False,\n", "save_steps=50,\n", "save_strategy=steps,\n", "save_total_limit=None,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_cpu=False,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.02,\n", "warmup_steps=100,\n", "weight_decay=0.0,\n", ")" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils import parse_training_args\n", "\n", "training_args = parse_training_args(params)\n", "training_args" ] }, { "cell_type": "code", "execution_count": 11, "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:55:53.899792Z", "iopub.status.busy": "2023-10-21T05:55:53.899027Z", "iopub.status.idle": "2023-10-21T05:55:53.902455Z", "shell.execute_reply": "2023-10-21T05:55:53.901834Z" }, "papermill": { "duration": 1.100827, "end_time": "2023-10-21T05:55:53.903903", "exception": false, "start_time": "2023-10-21T05:55:52.803076", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# data = data[\"train\"].train_test_split(test_size=0.1)\n", "# data\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "5bc91439-6108-445c-8f85-e6558c9f0677", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:55:56.641848Z", "iopub.status.busy": "2023-10-21T05:55:56.641104Z", "iopub.status.idle": "2023-10-21T05:55:56.906471Z", "shell.execute_reply": "2023-10-21T05:55:56.905658Z" }, "papermill": { "duration": 1.310772, "end_time": "2023-10-21T05:55:56.908127", "exception": false, "start_time": "2023-10-21T05:55:55.597355", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! mkdir -p {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 13, "id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T05:55:58.965151Z", "iopub.status.busy": "2023-10-21T05:55:58.964343Z", "iopub.status.idle": "2023-10-21T07:47:28.532976Z", "shell.execute_reply": "2023-10-21T07:47:28.532251Z" }, "papermill": { "duration": 6690.621815, "end_time": "2023-10-21T07:47:28.534533", "exception": false, "start_time": "2023-10-21T05:55:57.912718", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [1119/1119 1:51:20, Epoch 2/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
501.057700
1000.489600
1500.260400
2000.147400
2500.081700
3000.057000
3500.041700
4000.037400
4500.033400
5000.029200
5500.027600
6000.027100
6500.025000
7000.024600
7500.024500
8000.020400
8500.020500
9000.020800
9500.020800
10000.020900
10500.021100
11000.020300

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=1119, training_loss=0.11247422747893245, metrics={'train_runtime': 6689.036, 'train_samples_per_second': 0.67, 'train_steps_per_second': 0.167, 'total_flos': 1.4445806420565197e+17, 'train_loss': 0.11247422747893245, 'epoch': 3.0})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer = transformers.Trainer(\n", " model=model,\n", " train_dataset=data[\"train\"],\n", "# eval_dataset=data[\"test\"],\n", " args=training_args,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", ")\n", "model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", "\n", "checkpoint_path = Path(\"/content/artifacts/checkpoints\")\n", "\n", "# Only set resume_from_checkpoint True when directory exists and contains files\n", "resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n", "if resume_from_checkpoint:\n", " print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n", "trainer.train(resume_from_checkpoint=resume_from_checkpoint)" ] }, { "cell_type": "code", "execution_count": 14, "id": "172e47a7-400e-4f82-a5e3-38135ecf532f", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T07:47:30.428814Z", "iopub.status.busy": "2023-10-21T07:47:30.428055Z", "iopub.status.idle": "2023-10-21T07:47:46.873882Z", "shell.execute_reply": "2023-10-21T07:47:46.873193Z" }, "papermill": { "duration": 17.445662, "end_time": "2023-10-21T07:47:46.875405", "exception": false, "start_time": "2023-10-21T07:47:29.429743", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "PeftModelForCausalLM(\n", " (base_model): LoraModel(\n", " (model): LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): ModulesToSaveWrapper(\n", " (original_module): Embedding(32001, 4096)\n", " (modules_to_save): ModuleDict(\n", " (default): Embedding(32001, 4096)\n", " )\n", " )\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (k_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (v_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): 
ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (o_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (up_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (down_proj): Linear(\n", " in_features=11008, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=11008, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): ModulesToSaveWrapper(\n", " (original_module): Linear(in_features=4096, out_features=32001, bias=False)\n", " (modules_to_save): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=32001, bias=False)\n", " )\n", " )\n", " )\n", " )\n", ")" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path_lora)\n", "model" ] }, { "cell_type": "code", "execution_count": 15, "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T07:47:48.699946Z", "iopub.status.busy": "2023-10-21T07:47:48.699212Z", "iopub.status.idle": "2023-10-21T07:47:48.949489Z", "shell.execute_reply": "2023-10-21T07:47:48.948666Z" }, "papermill": { "duration": 1.175557, "end_time": "2023-10-21T07:47:48.950963", "exception": false, "start_time": "2023-10-21T07:47:47.775406", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 1.2G\r\n", " 512 -rw-r--r-- 1 root 3003 88 Oct 21 07:47 README.md\r\n", "1.0K -rw-r--r-- 1 root 3003 550 Oct 21 07:47 adapter_config.json\r\n", "1.2G -rw-r--r-- 1 root 3003 1.2G Oct 21 07:47 adapter_model.bin\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: 
The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! ls -lash {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 16, "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T07:47:50.799568Z", "iopub.status.busy": "2023-10-21T07:47:50.798899Z", "iopub.status.idle": "2023-10-21T07:48:42.484011Z", "shell.execute_reply": "2023-10-21T07:48:42.483286Z" }, "papermill": { "duration": 53.672087, "end_time": "2023-10-21T07:48:43.522023", "exception": false, "start_time": "2023-10-21T07:47:49.849936", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(32001, 4096)\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): Linear(in_features=4096, out_features=32001, bias=False)\n", ")" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = model.merge_and_unload().half()\n", "model" ] }, { "cell_type": "code", "execution_count": 17, "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T07:48:45.317087Z", "iopub.status.busy": "2023-10-21T07:48:45.316765Z", "iopub.status.idle": "2023-10-21T07:48:45.559747Z", "shell.execute_reply": "2023-10-21T07:48:45.558874Z" }, "papermill": { "duration": 1.11534, "end_time": "2023-10-21T07:48:45.561396", "exception": false, "start_time": "2023-10-21T07:48:44.446056", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 0\r\n", "drwxr-xr-x 1 root 3003 0 Oct 21 05:55 checkpoints\r\n", "drwxr-xr-x 1 root 3003 0 Oct 21 05:55 lora\r\n", "drwxr-xr-x 1 root 3003 0 Oct 21 05:49 src\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
ls -l {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 18, "id": "260e9d79-6eb8-4516-bf8f-825a25606391", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T07:48:47.429854Z", "iopub.status.busy": "2023-10-21T07:48:47.429062Z", "iopub.status.idle": "2023-10-21T07:51:23.634942Z", "shell.execute_reply": "2023-10-21T07:51:23.634264Z" }, "papermill": { "duration": 158.141645, "end_time": "2023-10-21T07:51:24.665966", "exception": false, "start_time": "2023-10-21T07:48:46.524321", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "('/content/artifacts/tokenizer_config.json',\n", " '/content/artifacts/special_tokens_map.json',\n", " '/content/artifacts/tokenizer.model',\n", " '/content/artifacts/added_tokens.json',\n", " '/content/artifacts/tokenizer.json')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path)\n", "tokenizer.save_pretrained(trained_model_path)" ] }, { "cell_type": "code", "execution_count": 19, "id": "6d90a920-fb22-4291-8466-411ff41e31be", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T07:51:26.557278Z", "iopub.status.busy": "2023-10-21T07:51:26.556503Z", "iopub.status.idle": "2023-10-21T07:51:26.796901Z", "shell.execute_reply": "2023-10-21T07:51:26.796120Z" }, "papermill": { "duration": 1.217017, "end_time": "2023-10-21T07:51:26.798456", "exception": false, "start_time": "2023-10-21T07:51:25.581439", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 13G\r\n", " 512 -rw-r--r-- 1 root 3003 21 Oct 21 07:51 added_tokens.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 21 05:55 checkpoints\r\n", "1.0K -rw-r--r-- 1 root 3003 648 Oct 21 07:48 config.json\r\n", " 512 -rw-r--r-- 1 root 3003 183 Oct 21 07:48 generation_config.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 21 05:55 lora\r\n", "9.3G -rw-r--r-- 1 root 3003 9.3G Oct 21 07:49 pytorch_model-00001-of-00002.bin\r\n", "3.3G -rw-r--r-- 1 root 3003 3.3G Oct 21 07:50 pytorch_model-00002-of-00002.bin\r\n", " 24K -rw-r--r-- 1 root 3003 24K Oct 21 07:51 pytorch_model.bin.index.json\r\n", "1.0K -rw-r--r-- 1 root 3003 552 Oct 21 07:51 special_tokens_map.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 21 05:49 src\r\n", "1.8M -rw-r--r-- 1 root 3003 1.8M Oct 21 07:51 tokenizer.json\r\n", "489K -rw-r--r-- 1 root 3003 489K Oct 21 07:51 tokenizer.model\r\n", "1.5K -rw-r--r-- 1 root 3003 1.1K Oct 21 07:51 tokenizer_config.json\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
ls -lash {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 20, "id": "202a694a", "metadata": { "execution": { "iopub.execute_input": "2023-10-21T07:51:28.659968Z", "iopub.status.busy": "2023-10-21T07:51:28.659180Z" }, "papermill": { "duration": null, "end_time": null, "exception": false, "start_time": "2023-10-21T07:51:27.744359", "status": "running" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "671a3174ba724efd8ea3d2b141b75b98", "version_major": 2, "version_minor": 0 }, "text/plain": [ "pytorch_model-00002-of-00002.bin: 0%| | 0.00/3.50G [00:00
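{ "cell_type": "markdown", "metadata": {}, "source": [ "The upload above is driven by the `push_to_hub` parameter described at the top of this notebook. A minimal sketch of that final step, assuming the repo ID is read straight from `params` and that `model` and `tokenizer` are the merged artifacts saved above:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "repo_id = params.get(\"push_to_hub\")\n",
"if repo_id:\n",
"    # Upload the merged weights and tokenizer files to the Hugging Face Hub repo.\n",
"    model.push_to_hub(repo_id)\n",
"    tokenizer.push_to_hub(repo_id)" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 5 }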