{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3enGMumyYBL-", "outputId": "5376160e-7cd0-4a3c-c8b7-18bd4de159f7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "W3qYi9JvYHR3", "outputId": "112cc897-c015-49e0-e176-d4cb9ade8f60" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "drive sample_data\n" ] } ], "source": [ "ls" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yxda7dkuYkCK", "outputId": "fb77aec8-cf92-45ba-8b09-752fd0a0e327" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 917 ms, sys: 149 ms, total: 1.07 s\n", "Wall time: 2min 19s\n" ] } ], "source": [ "%%time\n", "\n", "from IPython.display import clear_output\n", "\n", "! pip install sentence_transformers==2.2.2\n", "\n", "! pip install -qq -U langchain\n", "! pip install -qq -U tiktoken\n", "! pip install -qq -U pypdf\n", "! pip install -qq -U faiss-gpu\n", "! pip install -qq -U InstructorEmbedding\n", "\n", "! pip install -qq -U transformers\n", "! pip install -qq -U accelerate\n", "! pip install -qq -U bitsandbytes\n", "! pip install -U langchain-community\n", "clear_output()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LEb5k2ZDYoBQ", "outputId": "65dcde15-75d5-4054-ca43-96b2c8f8aa27" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 8.36 s, sys: 1.18 s, total: 9.54 s\n", "Wall time: 17.2 s\n" ] } ], "source": [ "%%time\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "import os\n", "import glob\n", "import textwrap\n", "import time\n", "\n", "import langchain\n", "\n", "### loaders\n", "from langchain.document_loaders import PyPDFLoader, DirectoryLoader\n", "\n", "### splits\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "\n", "### prompts\n", "from langchain import PromptTemplate, LLMChain\n", "\n", "### vector stores\n", "from langchain.vectorstores import FAISS\n", "\n", "### models\n", "from langchain.llms import HuggingFacePipeline\n", "from langchain.embeddings import HuggingFaceInstructEmbeddings\n", "\n", "### retrievers\n", "from langchain.chains import RetrievalQA\n", "\n", "import torch\n", "import transformers\n", "from transformers import (\n", " AutoTokenizer, AutoModelForCausalLM,\n", " BitsAndBytesConfig,\n", " pipeline\n", ")\n", "\n", "clear_output()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9vUjhUcsYsgF", "outputId": "fb33bacf-8aa9-4990-bb9e-2bafa147f6b9" }, "outputs": [ { "data": { "text/plain": [ "['/content/drive/MyDrive/data/llm_paper/2305.13048.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2305.14314.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2305.19268.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2306.07042.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2306.07629.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2306.09782.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2306.11222.pdf',\n", " 
'/content/drive/MyDrive/data/llm_paper/2306.11695.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2306.12929.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2306.14048.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2307.02973.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2307.03172.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2307.09288.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2307.10169.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2307.13304.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2308.10792.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2308.16898v1.pdf',\n", " '/content/drive/MyDrive/data/llm_paper/2309.06180.pdf']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(glob.glob('/content/drive/MyDrive/data/llm_paper/*'))" ] },
{ "cell_type": "markdown", "metadata": { "id": "0CVLePPrY09t" }, "source": [ "# CFG\n", "\n", "- CFG class enables easy and organized experimentation" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "Tsq1H_QhaOHz" }, "outputs": [], "source": [ "class CFG:\n", "    # LLMs\n", "    model_name = 'llama2-13b-chat' # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B\n", "    temperature = 0\n", "    top_p = 0.95\n", "    repetition_penalty = 1.15\n", "\n", "    # splitting\n", "    split_chunk_size = 800\n", "    split_overlap = 0\n", "\n", "    # embeddings\n", "    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'\n", "\n", "    # similar passages\n", "    k = 6\n", "\n", "    # paths\n", "    PDFs_path = '/content/drive/MyDrive/data/llm_paper/'\n", "    Embeddings_path = '/content/drive/MyDrive/data/faiss-hp-sentence-transformers'\n", "    Output_folder = '/content/drive/MyDrive/data/output/'" ] },
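{ "cell_type": "markdown", "metadata": {}, "source": [ "The models below are loaded with 4-bit NF4 quantization (`BitsAndBytesConfig`) so that a 13B model fits in Colab GPU memory. A back-of-the-envelope sketch of why (illustrative arithmetic only, not part of the original pipeline; it ignores activations and the small overhead of quantization constants):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### rough memory estimate for 13B model weights (illustrative only)\n", "params = 13e9\n", "print(f'fp16 weights : ~{params * 2 / 1e9:.0f} GB')    # 2 bytes per parameter\n", "print(f'4-bit weights: ~{params * 0.5 / 1e9:.1f} GB')  # 0.5 bytes per parameter" ] },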
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "f1-4odedacXD" }, "outputs": [], "source": [ "def get_model(model = CFG.model_name):\n", "\n", "    print('\\nDownloading model: ', model, '\\n\\n')\n", "\n", "    if model == 'wizardlm':\n", "        model_repo = 'TheBloke/wizardLM-7B-HF'\n", "\n", "        tokenizer = AutoTokenizer.from_pretrained(model_repo)\n", "\n", "        bnb_config = BitsAndBytesConfig(\n", "            load_in_4bit = True,\n", "            bnb_4bit_quant_type = \"nf4\",\n", "            bnb_4bit_compute_dtype = torch.float16,\n", "            bnb_4bit_use_double_quant = True,\n", "        )\n", "\n", "        model = AutoModelForCausalLM.from_pretrained(\n", "            model_repo,\n", "            quantization_config = bnb_config,\n", "            device_map = 'auto',\n", "            low_cpu_mem_usage = True\n", "        )\n", "\n", "        max_len = 1024\n", "\n", "    elif model == 'llama2-7b-chat':\n", "        model_repo = 'daryl149/llama-2-7b-chat-hf'\n", "\n", "        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)\n", "\n", "        bnb_config = BitsAndBytesConfig(\n", "            load_in_4bit = True,\n", "            bnb_4bit_quant_type = \"nf4\",\n", "            bnb_4bit_compute_dtype = torch.float16,\n", "            bnb_4bit_use_double_quant = True,\n", "        )\n", "\n", "        model = AutoModelForCausalLM.from_pretrained(\n", "            model_repo,\n", "            quantization_config = bnb_config,\n", "            device_map = 'auto',\n", "            low_cpu_mem_usage = True,\n", "            trust_remote_code = True\n", "        )\n", "\n", "        max_len = 2048\n", "\n", "    elif model == 'llama2-13b-chat':\n", "        model_repo = 'daryl149/llama-2-13b-chat-hf'\n", "\n", "        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)\n", "\n", "        bnb_config = BitsAndBytesConfig(\n", "            load_in_4bit = True,\n", "            bnb_4bit_quant_type = \"nf4\",\n", "            bnb_4bit_compute_dtype = torch.float16,\n", "            bnb_4bit_use_double_quant = True,\n", "        )\n", "\n", "        model = AutoModelForCausalLM.from_pretrained(\n", "            model_repo,\n", "            quantization_config = bnb_config,\n", "            device_map = 'auto',\n", "            low_cpu_mem_usage = True,\n", "            trust_remote_code = True\n", "        )\n", "\n", "        max_len = 2048 # could be raised; llama-2 supports a 4096-token context\n", "\n", "    elif model == 'mistral-7B':\n", "        model_repo = 'mistralai/Mistral-7B-v0.1'\n", "\n", "        tokenizer = AutoTokenizer.from_pretrained(model_repo)\n", "\n", "        bnb_config = BitsAndBytesConfig(\n", "            load_in_4bit = True,\n", "            bnb_4bit_quant_type = \"nf4\",\n", "            bnb_4bit_compute_dtype = torch.float16,\n", "            bnb_4bit_use_double_quant = True,\n", "        )\n", "\n", "        model = AutoModelForCausalLM.from_pretrained(\n", "            model_repo,\n", "            quantization_config = bnb_config,\n", "            device_map = 'auto',\n", "            low_cpu_mem_usage = True,\n", "        )\n", "\n", "        max_len = 1024\n", "\n", "    else:\n", "        raise ValueError(f'Unknown model: {model} (tokenizer and backbone not implemented)')\n", "\n", "    return tokenizer, model, max_len" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 508 }, "id": "PFqCDHDDaiXz", "outputId": "f317c606-4524-427c-a739-a016bb271489" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 45.3 s, sys: 53.9 s, total: 1min 39s\n", "Wall time: 6min 34s\n" ] } ], "source": [ "%%time\n", "\n", "tokenizer, model, max_len = get_model(model = CFG.model_name)\n", "\n", "clear_output()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "R9fWwcw4bHTY", "outputId": "b2968f49-d4c4-4be3-a35f-de9237726cf1" }, "outputs": [ { "data": { "text/plain": [ "LlamaForCausalLM(\n", "  (model): LlamaModel(\n", "    (embed_tokens): Embedding(32000, 5120, padding_idx=0)\n", "    (layers): ModuleList(\n", "      (0-39): 40 x LlamaDecoderLayer(\n", "        (self_attn): LlamaSdpaAttention(\n", "          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)\n", "          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)\n", "          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)\n", "          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)\n", "          (rotary_emb): LlamaRotaryEmbedding()\n", "        )\n", "        (mlp): LlamaMLP(\n", "          (gate_proj): 
Linear4bit(in_features=5120, out_features=13824, bias=False)\n", " (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)\n", " (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)\n", " (act_fn): SiLU()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): Linear(in_features=5120, out_features=32000, bias=False)\n", ")" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.eval()" ] }, { "cell_type": "markdown", "metadata": { "id": "DiIZE7BVbLEh" }, "source": [ "# 🤗 pipeline\n", "\n", "- Hugging Face pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vMzm07yjbMGE", "outputId": "7e4a12a5-ab8a-46c4-aca4-6327de6e1168" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `HuggingFacePipeline` was deprecated in LangChain 0.0.37 and will be removed in 0.3. An updated version of the class exists in the langchain-huggingface package and should be used instead. To use it run `pip install -U langchain-huggingface` and import as `from langchain_huggingface import HuggingFacePipeline`.\n", " warn_deprecated(\n" ] } ], "source": [ "### hugging face pipeline\n", "pipe = pipeline(\n", " task = \"text-generation\",\n", " model = model,\n", " tokenizer = tokenizer,\n", " pad_token_id = tokenizer.eos_token_id,\n", "# do_sample = True,\n", " max_length = max_len,\n", " temperature = CFG.temperature,\n", " top_p = CFG.top_p,\n", " repetition_penalty = CFG.repetition_penalty\n", ")\n", "\n", "### langchain pipeline\n", "llm = HuggingFacePipeline(pipeline = pipe)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 198 }, "id": "0aO7ICQLbR_U", "outputId": "e761cbb9-dbdc-4032-8135-82b3207c8bcf" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 35.1 s, sys: 289 ms, total: 35.4 s\n", "Wall time: 37.6 s\n" ] }, { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "'Give me 5 things about Transformer.\\n\\nHere are five key things to know about the Transformer architecture:\\n\\n1. Self-Attention Mechanism: Transformer uses a self-attention mechanism that allows it to model complex relationships between different parts of the input sequence. This is in contrast to traditional recurrent neural networks (RNNs), which only consider the previous elements in the sequence when making predictions.\\n2. Multi-Head Attention: The self-attention mechanism in Transformer is implemented using multiple \"heads,\" each of which computes a separate attention weight vector. This allows the model to capture different types of relationships between different parts of the input sequence.\\n3. 
Positional Encoding: Transformer models use positional encoding to preserve the order of the input sequence, since the self-attention mechanism does not inherently maintain this information. Positional encoding adds a unique fixed vector to each input sequence element based on its position in the sequence.\\n4. Encoder-Decoder Structure: Transformer models typically have an encoder and decoder structure. The encoder takes in a sequence of tokens or words and outputs a continuous representation of the input sequence. The decoder then generates the output sequence, one token at a time, based on the continuous representation produced by the encoder.\\n5. Parallelization: Transformer models can be parallelized more easily than RNNs, since the self-attention mechanism allows for computing multiple attention weights in parallel. This makes Transformer models faster and more scalable for large datasets.'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "### test the model, not using the LLM papers yet\n", "query = \"Give me 5 things about Transformer.\"\n", "llm.invoke(query)" ] },
{ "cell_type": "markdown", "metadata": { "id": "gkD-mrKJbbAa" }, "source": [ "# 🦜🔗 Langchain\n", "\n", "- Multiple document retriever with LangChain\n", "\n", "# Loader\n", "\n", "- [Directory loader](https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory) for multiple files\n", "- This step is not necessary if you are just loading the vector database\n", "- This step is necessary if you are creating the embeddings. In this case you need to:\n", "  - load the PDF files\n", "  - split them into chunks\n", "  - create embeddings\n", "  - save the embeddings in a vector store\n", "  - After that you can just load the saved embeddings to do similarity search with the user query, and then use the LLM to answer the question\n", "\n", "You can comment out this section if you use the embeddings I already created." ] },
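{ "cell_type": "markdown", "metadata": {}, "source": [ "Before bulk-loading the whole folder, a minimal optional sketch like the one below (not part of the original flow; `sample_pdf` is just the first file that matches) shows what `PyPDFLoader` returns: one `Document` per page, with the source path and page number in its `metadata`." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### optional sanity check: load a single PDF and inspect one page\n", "sample_pdf = sorted(glob.glob(CFG.PDFs_path + '*.pdf'))[0]  # first paper in the folder\n", "sample_pages = PyPDFLoader(sample_pdf).load()\n", "\n", "print(len(sample_pages), 'pages')\n", "print(sample_pages[0].metadata)             # {'source': ..., 'page': 0}\n", "print(sample_pages[0].page_content[:300])   # first 300 characters of page 1" ] },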
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jo4Y09d6bXaw", "outputId": "2b55c968-6fba-47a0-97d5-da09fbd690c4" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 18/18 [01:18<00:00, 4.38s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 14s, sys: 289 ms, total: 1min 15s\n", "Wall time: 1min 18s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "%%time\n", "\n", "loader = DirectoryLoader(\n", " CFG.PDFs_path,\n", " glob=\"./*.pdf\",\n", " loader_cls=PyPDFLoader,\n", " show_progress=True,\n", " use_multithreading=True\n", ")\n", "\n", "documents = loader.load()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3GCwH-t6bnfN", "outputId": "f51226f7-4b23-4036-b533-a2ec2fae5df0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "We have 610 pages in total\n" ] } ], "source": [ "print(f'We have {len(documents)} pages in total')" ] }, { "cell_type": "markdown", "metadata": { "id": "2shE7pZodGhD" }, "source": [ "# Splitter\n", "\n", "- Splitting the text into chunks so its passages are easily searchable for similarity\n", "- This step is also only necessary if you are creating the embeddings\n", "- [RecursiveCharacterTextSplitter](https://python.langchain.com/en/latest/reference/modules/document_loaders.html?highlight=RecursiveCharacterTextSplitter#langchain.document_loaders.MWDumpLoader)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_a2e0DTWdEqV", "outputId": "da654d48-1b8c-4cac-ca8c-a872346b6f9c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "We have created 2940 chunks from 610 pages\n" ] } ], "source": [ "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size = CFG.split_chunk_size,\n", " chunk_overlap = CFG.split_overlap\n", ")\n", "\n", "texts = text_splitter.split_documents(documents)\n", "\n", "print(f'We have created {len(texts)} chunks from {len(documents)} pages')" ] }, { "cell_type": "markdown", "metadata": { "id": "ZErJecjcdLx5" }, "source": [ "# Create Embeddings\n", "\n", "\n", "- Embedd and store the texts in a Vector database (FAISS)\n", "- [LangChain Vector Stores docs](https://python.langchain.com/docs/modules/data_connection/vectorstores/)\n", "- [FAISS - langchain](https://python.langchain.com/docs/integrations/vectorstores/faiss)\n", "- [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks - paper Aug/2019](https://arxiv.org/pdf/1908.10084.pdf)\n", "- [This is a nice 4 minutes video about vector stores](https://www.youtube.com/watch?v=dN0lsF2cvm4)\n", "\n", "___\n", "\n", "- If you use Chroma vector store it will take ~35 min to create embeddings\n", "- If you use FAISS vector store on GPU it will take just ~3 min\n", "\n", "___\n", "\n", "We need to create the embeddings only once, and then we can just load the vector store and query the database using similarity search.\n", "\n", "Loading the embeddings takes only a few seconds.\n", "\n", "I uploaded the embeddings to a Kaggle Dataset so we just load it from [here](https://www.kaggle.com/datasets/hinepo/faiss-hp-sentence-transformers)." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8tdg5F0jdJX0", "outputId": "8fe17390-10f7-42d5-9e9c-2fa5c5a6f74c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "load INSTRUCTOR_Transformer\n", "max_seq_length 512\n", "CPU times: user 12.4 s, sys: 48.9 ms, total: 12.4 s\n", "Wall time: 13.7 s\n" ] } ], "source": [ "%%time\n", "\n", "### we create the embeddings only if they do not exist yet\n", "if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):\n", "\n", " ### download embeddings model\n", " embeddings = HuggingFaceInstructEmbeddings(\n", " model_name = CFG.embeddings_model_repo,\n", " model_kwargs = {\"device\": \"cuda\"}\n", " )\n", "\n", " ### create embeddings and DB\n", " vectordb = FAISS.from_documents(\n", " documents = texts,\n", " embedding = embeddings\n", " )\n", "\n", " ### persist vector database\n", " vectordb.save_local(f\"{CFG.Output_folder}/faiss_index_hp\") # save in output folder\n", "# vectordb.save_local(f\"{CFG.Embeddings_path}/faiss_index_hp\") # save in input folder" ] }, { "cell_type": "markdown", "metadata": { "id": "p0xmEmx0dT3G" }, "source": [ "# Load vector database\n", "\n", "- After saving the vector database, we just load it from the Kaggle Dataset I mentioned\n", "- Obviously, the embeddings function to load the embeddings must be the same as the one used to create the embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xugnkM-bdQjv", "outputId": "20c0c140-1b59-466b-e507-69a11d9054d1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 131 ms, sys: 59.8 ms, total: 191 ms\n", "Wall time: 416 ms\n" ] } ], "source": [ "%%time\n", "\n", "### download embeddings model\n", "embeddings = HuggingFaceInstructEmbeddings(\n", " model_name = CFG.embeddings_model_repo,\n", " model_kwargs = {\"device\": \"cuda\"}\n", ")\n", "\n", "### load vector DB embeddings\n", "vectordb = FAISS.load_local(\n", " #CFG.Embeddings_path, # from input folder\n", " CFG.Output_folder + '/faiss_index_hp', # from output folder\n", " embeddings,\n", " allow_dangerous_deserialization=True\n", ")\n", "\n", "clear_output()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BkEWtEKnegGK", "outputId": "fd471ea3-32f7-4bc5-ca81-444009bdf4e0" }, "outputs": [ { "data": { "text/plain": [ "[Document(page_content='there is still a long way to go before we can fully clarify the mystery of transformer training.\\nB.2 Discussions and Limitations\\nPrevious Attempts. During our experiments, we find several noteworthy observations. In H2O,\\nemploying the accumulated attention score to evict KVembeddings can lead to a potential bias\\nfavoring the least recent tokens. This bias arises because most previous tokens have a higher number\\nof attention scores, resulting in a higher accumulated attention score and, consequently, a greater\\nlikelihood of being retained. To address this concern, we conducted an additional experiment utilizing\\n21', metadata={'source': '/content/drive/MyDrive/data/llm_paper/2306.14048.pdf', 'page': 20}),\n", " Document(page_content='pher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. OPT: Open pre-trained transformer\\nlanguage models. 
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BkEWtEKnegGK", "outputId": "fd471ea3-32f7-4bc5-ca81-444009bdf4e0" }, "outputs": [ { "data": { "text/plain": [ "[Document(page_content='there is still a long way to go before we can fully clarify the mystery of transformer training.\\nB.2 Discussions and Limitations\\nPrevious Attempts. During our experiments, we find several noteworthy observations. In H2O,\\nemploying the accumulated attention score to evict KVembeddings can lead to a potential bias\\nfavoring the least recent tokens. This bias arises because most previous tokens have a higher number\\nof attention scores, resulting in a higher accumulated attention score and, consequently, a greater\\nlikelihood of being retained. To address this concern, we conducted an additional experiment utilizing\\n21', metadata={'source': '/content/drive/MyDrive/data/llm_paper/2306.14048.pdf', 'page': 20}),\n", " Document(page_content='pher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. OPT: Open pre-trained transformer\\nlanguage models. arXiv preprint arXiv:2205.01068 , 2022.\\n13', metadata={'source': '/content/drive/MyDrive/data/llm_paper/2306.07629.pdf', 'page': 12}),\n", " Document(page_content='aboutphysicalcommonsenseinnaturallanguage. In Thirty-Fourth AAAI Conference on Artificial\\nIntelligence , 2020.\\nYelysei Bondarenko, Markus Nagel, and Tijmen Blankevoort. Understanding and overcoming the\\nchallenges of efficient transformer quantization. CoRR, abs/2109.12948, 2021. URL https:\\n//arxiv.org/abs/2109.12948 .\\nAakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam\\nRoberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, Parker Schuh,\\nKensen Shi, Sasha Tsvyashchenko, Joshua Maynez, Abhishek Rao, Parker Barnes, Yi Tay, Noam\\nShazeer, Vinodkumar Prabhakaran, Emily Reif, Nan Du, Ben Hutchinson, Reiner Pope, James\\nBradbury, Jacob Austin, Michael Isard, Guy Gur-Ari, Pengcheng Yin, Toju Duke, Anselm Lev-', metadata={'source': '/content/drive/MyDrive/data/llm_paper/2305.19268.pdf', 'page': 12}),\n", " Document(page_content='Transformer in NLP. Transformers [ 67] as a popular option have been frequently adopted by\\nplenty of natural language processing (NLP) applications with prevailing successes [ 68,69,70,71,72,\\n46,73,13,74,75]. Roughly, modern transformer-based networks can be categorized into two groups:\\n(1) Encoder-Decoder or Encoder-only ( i.e., BERT-style models [ 76]). This type of transformers\\ncommonly leverages the Masked Language Modeling task which encourages models to capture\\nthe intrinsic relationship between words and their context. Notable examples include BERT [ 76],\\nRoBBERTa [ 69] and T5 [ 77]. (2) Decoder-only ( i.e., GPT-style models [ 78]). Usually, this group of\\ntransformers adopts the Casual Language Modeling task, which is optimized to generate the next', metadata={'source': '/content/drive/MyDrive/data/llm_paper/2306.14048.pdf', 'page': 20})]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "### test if vector DB was loaded correctly\n", "vectordb.similarity_search('What is Transformer?')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "ZQByYGCoel8V" }, "outputs": [], "source": [ "prompt_template = \"\"\"\n", "You are a helpful AI assistant to answer questions about the context provided.\n", "Don't try to make up an answer, if you don't know just say that you don't know.\n", "Answer in the same language the question was asked.\n", "Use only the following pieces of context to answer the question at the end.\n", "\n", "{context}\n", "\n", "Question: {question}\n", "Answer:\"\"\"\n", "\n", "\n", "PROMPT = PromptTemplate(\n", "    template = prompt_template,\n", "    input_variables = [\"context\", \"question\"]\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "T16rZAiQe91j" }, "outputs": [], "source": [ "llm_chain = LLMChain(prompt=PROMPT, llm=llm)" ] },
{ "cell_type": "markdown", "metadata": { "id": "sLLO-uvyfCr0" }, "source": [ "# Retriever chain\n", "\n", "- Retriever to fetch the most relevant passages from the vector store\n", "- Chain to answer questions\n", "- [RetrievalQA: Chain for question-answering](https://python.langchain.com/docs/modules/data_connection/retrievers/)" ] },
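{ "cell_type": "markdown", "metadata": {}, "source": [ "Before wiring the chain, a minimal sketch of what the LLM will actually receive (the dummy strings below are placeholders, not real retrieved text): `RetrievalQA` with `chain_type=\"stuff\"` simply stuffs the k retrieved chunks into the `{context}` slot of the prompt defined above." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### optional: preview the final prompt with placeholder context\n", "print(PROMPT.format(\n", "    context='(the k retrieved passages would be pasted here)',\n", "    question='What is quantization?'\n", "))" ] },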
llm,\n", " chain_type = \"stuff\", # map_reduce, map_rerank, stuff, refine\n", " retriever = retriever,\n", " chain_type_kwargs = {\"prompt\": PROMPT},\n", " return_source_documents = True,\n", " verbose = False\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "RwbnWtiyfdP7" }, "source": [ "# Post-process outputs\n", "\n", "- Format llm response\n", "- Cite sources (PDFs)\n", "- Change `width` parameter to format the output" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "R7TLF_7Tfc4m" }, "outputs": [], "source": [ "def wrap_text_preserve_newlines(text, width=700):\n", " # Split the input text into lines based on newline characters\n", " lines = text.split('\\n')\n", "\n", " # Wrap each line individually\n", " wrapped_lines = [textwrap.fill(line, width=width) for line in lines]\n", "\n", " # Join the wrapped lines back together using newline characters\n", " wrapped_text = '\\n'.join(wrapped_lines)\n", "\n", " return wrapped_text\n", "\n", "\n", "def process_llm_response(llm_response):\n", " ans = wrap_text_preserve_newlines(llm_response['result'])\n", "\n", " # Get question and answer in llm_response['result']\n", " question = \"Question : \" + ans.split('\\nQuestion: ')[1].split('\\nAnswer: ')[0]\n", " answer = \"Answer : \" + ans.split('\\nAnswer: ')[1]\n", "\n", "\n", " # Print question and answer\n", " result = '\\n\\nQuestion: ' + question + '\\nAnswer: ' + answer\n", "\n", " # Print the sources used\n", " sources_used = ' \\n'.join(\n", " [\n", " source.metadata['source'].split('/')[-1][:-4]\n", " + ' - page: '\n", " + str(source.metadata['page'])\n", " for source in llm_response['source_documents']\n", " ]\n", " )\n", " result = question + '\\n' + answer + '\\n\\nSources: \\n' + sources_used\n", " ans = ans + '\\n\\nSources: \\n' + sources_used\n", " return result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3sXTLFiuffG9" }, "outputs": [], "source": [ "def llm_ans(query):\n", " start = time.time()\n", "\n", " llm_response = qa_chain.invoke(query)\n", " result = process_llm_response(llm_response)\n", "\n", " end = time.time()\n", "\n", " time_elapsed = int(round(end - start, 0))\n", " time_elapsed_str = f'\\n\\nTime elapsed: {time_elapsed} s'\n", " return result + time_elapsed_str" ] }, { "cell_type": "markdown", "metadata": { "id": "OHFhPwSQfixq" }, "source": [ "# Ask questions\n", "\n", "- Question Answering from multiple documents\n", "- Invoke QA Chain\n", "- Talk to your data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oPVqMA3bfgiZ", "outputId": "2063b8e2-4778-43f0-cf33-30b2b6a8f2aa" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Question : What is the Quantization?\n", "Answer : Quantization is the process of discretizing an input from a representation that holds more information to a representation with less information.\n", "\n", "Sources: \n", "2305.19268 - page: 0 \n", "2305.14314 - page: 2 \n", "2305.14314 - page: 5 \n", "2307.13304 - page: 7 \n", "2307.13304 - page: 16 \n", "2305.19268 - page: 31\n", "\n", "Time elapsed: 6 s\n" ] } ], "source": [ "query = \"What is the Quantization?\"\n", "result = llm_ans(query)\n", "print(result)" ] }, { "cell_type": "markdown", "metadata": { "id": "miVoSrgegLq4" }, "source": [ "# Gradio Chat UI\n", "\n", "- **At the moment this part only works on Google Colab. 
Gradio and Kaggle started having compatibility issues recently.**\n", "- If you plan to use the interface, it is preferable to do so in Google Colab\n", "- I'll leave this section commented out for now\n", "- Chat UI prints below\n", "\n", "___\n", "\n", "- Create a chat UI with [Gradio](https://www.gradio.app/guides/quickstart)\n", "- [ChatInterface docs](https://www.gradio.app/docs/chatinterface)\n", "- The notebook should be running if you want to use the chat interface" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sW4G0XfFgMSS" }, "outputs": [], "source": [ "import locale\n", "locale.getpreferredencoding = lambda: \"UTF-8\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nCMNY4s3gNkn" }, "outputs": [], "source": [ "! pip install gradio" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 626 }, "id": "A31oQQLYgRGY", "outputId": "8a33d6c1-b985-42e8-bb88-a307c8067781" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", "\n", "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", "Running on public URL: https://2545918982e730623e.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "