{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "20b1e7bd", "metadata": {}, "outputs": [], "source": [ "import torch" ] }, { "cell_type": "code", "execution_count": 2, "id": "4e92fff5", "metadata": {}, "outputs": [], "source": [ "import transformers\n", "\n", "tokenizer = transformers.AutoTokenizer.from_pretrained('cerebras/Cerebras-GPT-2.7B')\n", "tokenizer.pad_token_id = 0" ] }, { "cell_type": "code", "execution_count": 13, "id": "77637440", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Found cached dataset json (/root/.cache/huggingface/datasets/json/default-8d265dbd6f34ccd3/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1f5bceec2f7540f9b46c29f8074c4760", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1 [00:00 259\u001b[0m response\u001b[39m.\u001b[39;49mraise_for_status()\n\u001b[1;32m 260\u001b[0m \u001b[39mexcept\u001b[39;00m HTTPError \u001b[39mas\u001b[39;00m e:\n", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/requests/models.py:1021\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[39mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1021\u001b[0m \u001b[39mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m)\n", "\u001b[0;31mHTTPError\u001b[0m: 404 Client Error: Not Found for url: https://huggingface.co/lora-cerebras-gpt2.7b-alpaca/resolve/main/adapter_config.json", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mRepositoryNotFoundError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/utils/config.py:99\u001b[0m, in \u001b[0;36mPeftConfigMixin.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, 
**kwargs)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 99\u001b[0m config_file \u001b[39m=\u001b[39m hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)\n\u001b[1;32m 100\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:120\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m kwargs \u001b[39m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[39m=\u001b[39mfn\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m, has_token\u001b[39m=\u001b[39mhas_token, kwargs\u001b[39m=\u001b[39mkwargs)\n\u001b[0;32m--> 120\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/file_download.py:1160\u001b[0m, in \u001b[0;36mhf_hub_download\u001b[0;34m(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, local_dir_use_symlinks, user_agent, force_download, force_filename, proxies, etag_timeout, resume_download, token, local_files_only, legacy_cache_layout)\u001b[0m\n\u001b[1;32m 1159\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1160\u001b[0m metadata \u001b[39m=\u001b[39m get_hf_file_metadata(\n\u001b[1;32m 1161\u001b[0m url\u001b[39m=\u001b[39;49murl,\n\u001b[1;32m 1162\u001b[0m token\u001b[39m=\u001b[39;49mtoken,\n\u001b[1;32m 1163\u001b[0m proxies\u001b[39m=\u001b[39;49mproxies,\n\u001b[1;32m 1164\u001b[0m timeout\u001b[39m=\u001b[39;49metag_timeout,\n\u001b[1;32m 1165\u001b[0m )\n\u001b[1;32m 1166\u001b[0m \u001b[39mexcept\u001b[39;00m EntryNotFoundError \u001b[39mas\u001b[39;00m http_error:\n\u001b[1;32m 1167\u001b[0m \u001b[39m# Cache the non-existence of the file and raise\u001b[39;00m\n", "File 
\u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:120\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m kwargs \u001b[39m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[39m=\u001b[39mfn\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m, has_token\u001b[39m=\u001b[39mhas_token, kwargs\u001b[39m=\u001b[39mkwargs)\n\u001b[0;32m--> 120\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/file_download.py:1501\u001b[0m, in \u001b[0;36mget_hf_file_metadata\u001b[0;34m(url, token, proxies, timeout)\u001b[0m\n\u001b[1;32m 1492\u001b[0m r \u001b[39m=\u001b[39m _request_wrapper(\n\u001b[1;32m 1493\u001b[0m method\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mHEAD\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 1494\u001b[0m url\u001b[39m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1499\u001b[0m timeout\u001b[39m=\u001b[39mtimeout,\n\u001b[1;32m 1500\u001b[0m )\n\u001b[0;32m-> 1501\u001b[0m hf_raise_for_status(r)\n\u001b[1;32m 1503\u001b[0m \u001b[39m# Return\u001b[39;00m\n", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:291\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 283\u001b[0m message \u001b[39m=\u001b[39m (\n\u001b[1;32m 284\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mresponse\u001b[39m.\u001b[39mstatus_code\u001b[39m}\u001b[39;00m\u001b[39m Client Error.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 285\u001b[0m \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 289\u001b[0m 
\u001b[39m\"\u001b[39m\u001b[39m make sure you are authenticated.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 290\u001b[0m )\n\u001b[0;32m--> 291\u001b[0m \u001b[39mraise\u001b[39;00m RepositoryNotFoundError(message, response) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 293\u001b[0m \u001b[39melif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39m==\u001b[39m \u001b[39m400\u001b[39m:\n", "\u001b[0;31mRepositoryNotFoundError\u001b[0m: 404 Client Error. (Request ID: Root=1-6424c7f5-7796bb54152221004f83dc73)\n\nRepository Not Found for url: https://huggingface.co/lora-cerebras-gpt2.7b-alpaca/resolve/main/adapter_config.json.\nPlease make sure you specified the correct `repo_id` and `repo_type`.\nIf you are trying to access a private or gated repo, make sure you are authenticated.", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[20], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpeft\u001b[39;00m\n\u001b[1;32m 3\u001b[0m output_dir \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mlora-cerebras-gpt2.7b-alpaca\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 5\u001b[0m model \u001b[39m=\u001b[39m peft\u001b[39m.\u001b[39;49mPeftModel\u001b[39m.\u001b[39;49mfrom_pretrained(\n\u001b[1;32m 6\u001b[0m model,\n\u001b[1;32m 7\u001b[0m \u001b[39m# 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',\u001b[39;49;00m\n\u001b[1;32m 8\u001b[0m output_dir,\n\u001b[1;32m 9\u001b[0m torch_dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat16\n\u001b[1;32m 10\u001b[0m )\n", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/peft_model.py:135\u001b[0m, in \u001b[0;36mPeftModel.from_pretrained\u001b[0;34m(cls, model, model_id, **kwargs)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mmapping\u001b[39;00m 
\u001b[39mimport\u001b[39;00m MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING\n\u001b[1;32m 134\u001b[0m \u001b[39m# load the config\u001b[39;00m\n\u001b[0;32m--> 135\u001b[0m config \u001b[39m=\u001b[39m PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig\u001b[39m.\u001b[39;49mfrom_pretrained(model_id)\u001b[39m.\u001b[39mpeft_type]\u001b[39m.\u001b[39mfrom_pretrained(model_id)\n\u001b[1;32m 137\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mgetattr\u001b[39m(model, \u001b[39m\"\u001b[39m\u001b[39mhf_device_map\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m) \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 138\u001b[0m remove_hook_from_submodules(model)\n", "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/utils/config.py:101\u001b[0m, in \u001b[0;36mPeftConfigMixin.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 99\u001b[0m config_file \u001b[39m=\u001b[39m hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)\n\u001b[1;32m 100\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mCan\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt find config.json at \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mpretrained_model_name_or_path\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 103\u001b[0m loaded_attributes \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mfrom_json_file(config_file)\n\u001b[1;32m 105\u001b[0m config \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n", "\u001b[0;31mValueError\u001b[0m: Can't find config.json at 'lora-cerebras-gpt2.7b-alpaca'" ] } ], "source": [ "import peft\n", "\n", "\n", "\n", "model = peft.PeftModel.from_pretrained(\n", " 
model,\n", "    # 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',\n", "    output_dir,\n", "    torch_dtype=torch.float16\n", ")" ] }, { "cell_type": "code", "execution_count": 28, "id": "8ec93ed2", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "import os\n", "import wandb \n", "\n", "output_dir = 'lora-cerebras-gpt2.7b-alpaca'\n", "\n", "use_wandb = True\n", "wandb_run_name = f\"{output_dir}-{wandb.util.generate_id()}\"\n", "\n", "# set the wandb project where this run will be logged\n", "os.environ[\"WANDB_PROJECT\"]=output_dir\n", "\n", "# save your trained model checkpoint to wandb\n", "os.environ[\"WANDB_LOG_MODEL\"]=\"true\"\n", "\n", "# turn off watch to log faster\n", "os.environ[\"WANDB_WATCH\"]=\"false\"\n", "\n", "training_args = transformers.TrainingArguments(\n", "    per_device_train_batch_size=16, \n", "    gradient_accumulation_steps=8, \n", "    num_train_epochs=3, \n", "    learning_rate=1e-4, \n", "    fp16=True,\n", "    optim=\"adamw_torch\",\n", "    logging_steps=10, \n", "    evaluation_strategy=\"steps\",\n", "    save_strategy=\"steps\",\n", "    eval_steps=200,\n", "    save_steps=200,\n", "    output_dir=output_dir, \n", "    save_total_limit=3,\n", "\n", "    report_to=\"wandb\" if use_wandb else None,\n", "    run_name=wandb_run_name if use_wandb else None,\n", ")" ] }, { "cell_type": "code", "execution_count": 32, "id": "2686ecf2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [972/972 27:33, Epoch 2/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining LossValidation Loss

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Waiting for W&B process to finish... (success)." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

Run history:


eval/loss█▄▂▁
eval/runtime▅█▄▁
eval/samples_per_second▄▁▅█
eval/steps_per_second▄▁▅█
train/epoch▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███
train/global_step▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███
train/learning_rate████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▂▂▂▂▁▁
train/loss█▃▃▂▂▂▂▂▂▂▂▁▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos▁█
train/train_loss█▁
train/train_runtime█▁
train/train_samples_per_second▁█
train/train_steps_per_second▁█

Run summary:


eval/loss1.69353
eval/runtime213.477
eval/samples_per_second48.666
eval/steps_per_second6.085
train/epoch3.0
train/global_step972
train/learning_rate0.0
train/loss1.7007
train/total_flos4.1553623137959936e+17
train/train_loss0.29741
train/train_runtime1642.1473
train/train_samples_per_second75.912
train/train_steps_per_second0.592

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run lora-cerebras-gpt2.7b-alpaca-jecyepye at: https://wandb.ai/lxelxe/lora-cerebras-gpt2.7b-alpaca/runs/3up74y7g
Synced 6 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Find logs at: ./wandb/run-20230329_232219-3up74y7g/logs" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trainer = transformers.Trainer(\n", " model=model, \n", " train_dataset=train_data,\n", " eval_dataset=val_data,\n", " args=training_args, \n", " data_collator=transformers.DataCollatorForSeq2Seq(\n", " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", " ),\n", ")\n", "\n", "model.config.use_cache = False\n", "result = trainer.train('lora-cerebras-gpt2.7b-alpaca/checkpoint-800')\n", "model.save_pretrained(output_dir)\n", "\n", "wandb.finish()" ] }, { "cell_type": "code", "execution_count": 33, "id": "27e9ad70", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.float16\n" ] }, { "data": { "text/plain": [ "PeftModelForCausalLM(\n", " (base_model): LoraModel(\n", " (model): GPT2LMHeadModel(\n", " (transformer): GPT2Model(\n", " (wte): Embedding(50257, 2560)\n", " (wpe): Embedding(2048, 2560)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " (h): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " 
(dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (1): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (2): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (3): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, 
kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (4): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (5): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", 
" )\n", " )\n", " (6): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (7): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (8): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, 
bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (9): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (10): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (11): GPT2Block(\n", " 
(ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (12): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (13): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): 
Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (14): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (15): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (16): GPT2Block(\n", " (ln_1): LayerNorm((2560,), 
eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (17): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (18): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): 
Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (19): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (20): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (21): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, 
elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (22): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (23): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, 
inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (24): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (25): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (26): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): 
GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (27): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (28): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): 
Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (29): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (30): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (31): GPT2Block(\n", " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): 
MergedLinear(\n", " in_features=2560, out_features=7680, bias=True\n", " (lora_dropout): Dropout(p=0.05, inplace=False)\n", " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n", " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n", " )\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): GELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (lm_head): CastOutputToFloat(\n", " (0): Linear(in_features=2560, out_features=50257, bias=False)\n", " )\n", " )\n", " )\n", ")" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config\n", "print(model.dtype)\n", "\n", "model.half()" ] }, { "cell_type": "code", "execution_count": 35, "id": "9cca3b03", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n", "/root/miniconda3/envs/llama/lib/python3.10/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Human: Can I run inference on my local machine?\n", "Assistant: Yes, you can. You should be able to use the same model and data as your local machine for inference. 
The only difference is that you will need to download the necessary packages from the cloud or install them locally.\n" ] } ], "source": [ "text = \"Human: Can I run inference on my local machine?\\nAssistant:\"\n", "\n", "# Tokenize the prompt and reuse the attention mask the tokenizer built\n", "inputs = tokenizer(text, return_tensors=\"pt\")\n", "input_ids = inputs[\"input_ids\"].to(model.device)\n", "attention_mask = inputs[\"attention_mask\"].to(model.device)\n", "\n", "generation_config = transformers.GenerationConfig(\n", " max_new_tokens=100,\n", " temperature=0.2,\n", " top_p=0.75,\n", " top_k=50,\n", " repetition_penalty=1.2,\n", " do_sample=True,\n", " early_stopping=True,\n", "# num_beams=5,\n", " \n", " # tokenizer.pad_token_id was set to 0 in an earlier cell; model.config.pad_token_id\n", " # is unset (generate previously warned and fell back to eos_token_id)\n", " pad_token_id=tokenizer.pad_token_id,\n", " eos_token_id=model.config.eos_token_id,\n", ")\n", "\n", "with torch.no_grad():\n", " output = model.generate(\n", " input_ids=input_ids,\n", " attention_mask=attention_mask,\n", " generation_config=generation_config\n", " )[0].cpu()  # decode on CPU; .cuda() here was a no-op at best and fails without a GPU\n", "\n", "result = tokenizer.decode(output, skip_special_tokens=True).strip()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": null, "id": "be542e91", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }