diff --git "a/notebooks/07-RAG_Improve_Chunking.ipynb" "b/notebooks/07-RAG_Improve_Chunking.ipynb" --- "a/notebooks/07-RAG_Improve_Chunking.ipynb" +++ "b/notebooks/07-RAG_Improve_Chunking.ipynb" @@ -1,800 +1,1121 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "collapsed_sections": [ - "6Wx-IPSMbSwC" - ], - "authorship_tag": "ABX9TyNlJV4zbpjtN6glOumdzocl", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "language_info": { - "name": "python" + { + "cell_type": "markdown", + "metadata": { + "id": "-zE1h0uQV7uT" + }, + "source": [ + "# Install Packages and Setup Variables" + ] }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "7a469b6821ed458d99a1ed57e72b3d68": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8c556c8c8ce941c6b433780fd4a6ae54", - "IPY_MODEL_626b1ba98c374987913a7a4384f19fa1", - "IPY_MODEL_a4fad4d11a8941f8b90abb3099e9a090" - ], - "layout": "IPY_MODEL_c3a4b958e4814294801495226697bce2" - } + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "8c556c8c8ce941c6b433780fd4a6ae54": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2e939db189424ab7b5f9095932f2c99f", - "placeholder": "​", - "style": "IPY_MODEL_fd6a36e947ec451a938d266117dab12e", - "value": "Parsing nodes: 100%" - } + "id": "QPJzr-I9XQ7l", + "outputId": "1b699f15-bd3f-473d-dd37-74257e6d263e" + }, + "outputs": [], + "source": [ + "!pip install -q llama-index==0.9.21 openai==1.6.0 tiktoken==0.5.2 chromadb==0.4.21 kaleido==0.2.1 python-multipart==0.0.6 cohere==4.39" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "riuXwpSPcvWC" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "jIEeZzqLbz0J" + }, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bkgi2OrYzF7q" + }, + "source": [ + "# Load a Model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "9oGT6crooSSj" + }, + "outputs": [], + "source": [ + "from llama_index.llms import OpenAI\n", + "\n", + "llm = OpenAI(temperature=0.9, model=\"gpt-3.5-turbo\", max_tokens=512)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0BwVuJXlzHVL" + }, + "source": [ + "# Create a VectoreStore" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "SQP87lHczHKc" + }, + "outputs": [], + "source": [ + "import chromadb\n", + "\n", + "# create client and a new collection\n", + "# chromadb.EphemeralClient saves data in-memory.\n", + "chroma_client = chromadb.PersistentClient(path=\"./mini-llama-articles\")\n", + "chroma_collection = 
chroma_client.create_collection(\"mini-llama-articles\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "zAaGcYMJzHAN" + }, + "outputs": [], + "source": [ + "from llama_index.vector_stores import ChromaVectorStore\n", + "\n", + "# Define a vector store object backed by the created Chroma collection.\n", + "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I9JbAzFcjkpn" + }, + "source": [ + "# Load the Dataset (CSV)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ceveDuYdWCYk" + }, + "source": [ + "## Download" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eZwf6pv7WFmD" + }, + "source": [ + "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a list of CSV rows." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "626b1ba98c374987913a7a4384f19fa1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e4413564a300469d86c3abc567f24701", - "max": 14, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_64167ae99cd24c729435aefc1ea13519", - "value": 14 - } + "id": "wl_pbPvMlv1h", + "outputId": "38f73ac6-b824-4a5b-9385-e7b1afbd2cc8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", 
+ " Dload Upload Total Spent Left Speed\n", + "100 169k 100 169k 0 0 868k 0 --:--:-- --:--:-- --:--:-- 869k\n" + ] + } + ], + "source": [ + "!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv\n", + "# !curl -o ./mini-llama-articles.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VWBLtDbUWJfA" + }, + "source": [ + "## Read File" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "a4fad4d11a8941f8b90abb3099e9a090": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2634e510d3c844d88891a98661beb6a9", - "placeholder": "​", - "style": "IPY_MODEL_6b3d2afb949f4de691ceac601bd96d0e", - "value": " 14/14 [00:00<00:00, 34.02it/s]" - } - }, - "c3a4b958e4814294801495226697bce2": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": 
null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2e939db189424ab7b5f9095932f2c99f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fd6a36e947ec451a938d266117dab12e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e4413564a300469d86c3abc567f24701": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "64167ae99cd24c729435aefc1ea13519": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - 
"2634e510d3c844d88891a98661beb6a9": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6b3d2afb949f4de691ceac601bd96d0e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8cc800fbe6bc4f4da5dd6b93d4a5143a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_812d5d9b04f74592b850b3eb32f88c04", - "IPY_MODEL_ed22c91e813c4351ab1d3eb7e174796c", - "IPY_MODEL_de2088a425104f05b52b7a3236c7baa9" - ], - "layout": "IPY_MODEL_6f9f666836084de7894aa2e65c8dbe07" - } - }, - "812d5d9b04f74592b850b3eb32f88c04": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_63a3dcff335349deacf4abb9b68d76ab", - "placeholder": "​", - "style": "IPY_MODEL_99eb83f4b8904e20b45573bab84aa5f4", - "value": "Generating embeddings: 100%" - } - }, - "ed22c91e813c4351ab1d3eb7e174796c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2c8aef5e8ec848c0a23c72581e5f4b1e", - "max": 108, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7d54abb8f3784a789fd042c2ed2dd685", - "value": 108 - } - }, - "de2088a425104f05b52b7a3236c7baa9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a1a88448b188407b8e4aa2af86fb9345", - "placeholder": "​", - "style": "IPY_MODEL_6a4cc229f5774cb0b4d3def7eee8b56e", - "value": " 108/108 [00:04<00:00, 22.53it/s]" - } - }, - "6f9f666836084de7894aa2e65c8dbe07": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + "id": "0Q9sxuW0g3Gd", + "outputId": "6bd4f786-f888-4d3b-d324-95230ef5f544" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import csv\n", + 
"\n", + "rows = []\n", + "\n", + "# Load the file as a JSON\n", + "with open(\"./mini-llama-articles.csv\", mode=\"r\", encoding=\"utf-8\") as file:\n", + " csv_reader = csv.reader(file)\n", + "\n", + " for idx, row in enumerate( csv_reader ):\n", + " if idx == 0: continue; # Skip header row\n", + " rows.append( row )\n", + "\n", + "# The number of characters in the dataset.\n", + "len( rows )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S17g2RYOjmf2" + }, + "source": [ + "# Convert to Document obj" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "YizvmXPejkJE" + }, + "outputs": [], + "source": [ + "from llama_index import Document\n", + "\n", + "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n", + "documents = [Document(text=row[1], metadata={\"title\": row[0], \"url\": row[2], \"source_name\": row[3]}) for row in rows]\n", + "\n", + "# By default, the node/chunks ids are set to random uuids. 
To ensure same id's per run, we manually set them.\n", + "for idx, doc in enumerate(documents):\n", + " doc.id_ = f\"doc_{idx}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qjuLbmFuWsyl" + }, + "source": [ + "# Transforming" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "9z3t70DGWsjO" + }, + "outputs": [], + "source": [ + "from llama_index.text_splitter import TokenTextSplitter\n", + "from llama_index.schema import BaseNode\n", + "import hashlib\n", + "\n", + "def deterministic_id_func(i: int, doc: BaseNode) -> str:\n", + " \"\"\"Deterministic ID function for the text splitter.\n", + " This will be used to generate a unique repeatable identifier for each node.\"\"\"\n", + " unique_identifier = doc.id_ + str(i)\n", + " hasher = hashlib.sha256()\n", + " hasher.update(unique_identifier.encode('utf-8')) \n", + " return hasher.hexdigest()\n", + "\n", + "text_splitter = TokenTextSplitter(separator=\" \", chunk_size=512, chunk_overlap=128, id_func=deterministic_id_func)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 385, + "referenced_widgets": [ + "7a469b6821ed458d99a1ed57e72b3d68", + "8c556c8c8ce941c6b433780fd4a6ae54", + "626b1ba98c374987913a7a4384f19fa1", + "a4fad4d11a8941f8b90abb3099e9a090", + "c3a4b958e4814294801495226697bce2", + "2e939db189424ab7b5f9095932f2c99f", + "fd6a36e947ec451a938d266117dab12e", + "e4413564a300469d86c3abc567f24701", + "64167ae99cd24c729435aefc1ea13519", + "2634e510d3c844d88891a98661beb6a9", + "6b3d2afb949f4de691ceac601bd96d0e", + "8cc800fbe6bc4f4da5dd6b93d4a5143a", + "812d5d9b04f74592b850b3eb32f88c04", + "ed22c91e813c4351ab1d3eb7e174796c", + "de2088a425104f05b52b7a3236c7baa9", + "6f9f666836084de7894aa2e65c8dbe07", + "63a3dcff335349deacf4abb9b68d76ab", + "99eb83f4b8904e20b45573bab84aa5f4", + "2c8aef5e8ec848c0a23c72581e5f4b1e", + "7d54abb8f3784a789fd042c2ed2dd685", + 
"a1a88448b188407b8e4aa2af86fb9345", + "6a4cc229f5774cb0b4d3def7eee8b56e" + ] }, - "63a3dcff335349deacf4abb9b68d76ab": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + "id": "P9LDJ7o-Wsc-", + "outputId": "2e27e965-fd4c-4754-94f5-3a6e33a72dea" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/omar/Documents/ai_repos/ai-tutor-rag-system/env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Parsing nodes: 100%|██████████| 14/14 [00:00<00:00, 14.26it/s]\n", + "100%|██████████| 108/108 [00:39<00:00, 2.70it/s]\n", + "100%|██████████| 108/108 [00:54<00:00, 1.99it/s]\n", + "100%|██████████| 108/108 [00:28<00:00, 3.82it/s]\n", + "Generating embeddings: 100%|██████████| 108/108 [00:02<00:00, 45.21it/s]\n" + ] + } + ], + "source": [ + "from llama_index.extractors import (\n", + " SummaryExtractor,\n", + " QuestionsAnsweredExtractor,\n", + " KeywordExtractor,\n", + ")\n", + "from llama_index.embeddings import OpenAIEmbedding\n", + "from llama_index.ingestion import IngestionPipeline\n", + "\n", + "pipeline = IngestionPipeline(\n", + " transformations=[\n", + " text_splitter,\n", + " QuestionsAnsweredExtractor(questions=3, llm=llm),\n", + " SummaryExtractor(summaries=[\"prev\", \"self\"], llm=llm),\n", + " KeywordExtractor(keywords=10, llm=llm),\n", + " OpenAIEmbedding(),\n", + " ],\n", + " vector_store=vector_store\n", + ")\n", + "\n", + "nodes = pipeline.run(documents=documents, show_progress=True);" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "99eb83f4b8904e20b45573bab84aa5f4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } + "id": "mPGa85hM2P3P", + "outputId": "c106c463-2459-4b11-bbae-5bd5e2246011" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "108" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"len( nodes )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OWaT6rL7ksp8" + }, + "source": [ + "# Load Indexes" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "mXi56KTXk2sp" + }, + "outputs": [], + "source": [ + "# Create your index\n", + "db = chromadb.PersistentClient(path=\"./mini-llama-articles\")\n", + "chroma_collection = db.get_or_create_collection(\"mini-llama-articles\")\n", + "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "jKXURvLtkuTS" + }, + "outputs": [], + "source": [ + "# Create your index\n", + "from llama_index import VectorStoreIndex\n", + "\n", + "index = VectorStoreIndex.from_vector_store(vector_store)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8JPD8yAinVSq" + }, + "source": [ + "# Query Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "b0gue7cyctt1" + }, + "outputs": [], + "source": [ + "# Define a query engine that is responsible for retrieving related pieces of text,\n", + "# and using a LLM to formulate the final answer.\n", + "query_engine = index.as_query_engine()\n", + "\n", + "res = query_engine.query(\"How many parameters LLaMA2 model has?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 }, - "2c8aef5e8ec848c0a23c72581e5f4b1e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": 
null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + "id": "VKK3jMprctre", + "outputId": "3503d4e1-3d1d-4ec2-c593-4eb7306cc370" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'The Llama 2 model is available in four different sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.response" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "7d54abb8f3784a789fd042c2ed2dd685": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } + "id": "465dH4yQc7Ct", + "outputId": "38bbc97b-1a07-427b-d3d4-0a5215b85358" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Node ID\t f707756065d1f788b41fb97fcef81979e1fd241dbfa4034a24bec8e57b648482\n", + "Title\t Meta's Llama 2: Revolutionizing Open 
Source Language Models for Commercial Use\n", + "Text\t I. Llama 2: Revolutionizing Commercial Use Unlike its predecessor Llama 1, which was limited to research use, Llama 2 represents a major advancement as an open-source commercial model. Businesses can now integrate Llama 2 into products to create AI-powered applications. Availability on Azure and AWS facilitates fine-tuning and adoption. However, restrictions apply to prevent exploitation. Companies with over 700 million active daily users cannot use Llama 2. Additionally, its output cannot be used to improve other language models. II. Llama 2 Model Flavors Llama 2 is available in four different model sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters. While 7B, 13B, and 70B have already been released, the 34B model is still awaited. The pretrained variant, trained on a whopping 2 trillion tokens, boasts a context window of 4096 tokens, twice the size of its predecessor Llama 1. Meta also released a Llama 2 fine-tuned model for chat applications that was trained on over 1 million human annotations. Such extensive training comes at a cost, with the 70B model taking a staggering 1720320 GPU hours to train. The context window's length determines the amount of content the model can process at once, making Llama 2 a powerful language model in terms of scale and efficiency. III. Safety Considerations: A Top Priority for Meta Meta's commitment to safety and alignment shines through in Llama 2's design. The model demonstrates exceptionally low AI safety violation percentages, surpassing even ChatGPT in safety benchmarks. Finding the right balance between helpfulness and safety when optimizing a model poses significant challenges. While a highly helpful model may be capable of answering any question, including sensitive ones like \"How do I build a bomb?\", it also raises concerns about potential misuse. 
Thus, striking the perfect equilibrium between providing useful information and ensuring safety is paramount. However, prioritizing safety to an extreme extent can lead to a model that struggles to effectively address a diverse range of questions. This limitation could hinder the model's practical applicability and user experience. Thus, achieving\n", + "Score\t 0.699388273978391\n", + "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n", + "Node ID\t 636f98cf8754c3a4759da02aa11a3f2aa7cdeb848a4980ec99300ece4a2e92fd\n", + "Title\t Meta's Llama 2: Revolutionizing Open Source Language Models for Commercial Use\n", + "Text\t The model demonstrates exceptionally low AI safety violation percentages, surpassing even ChatGPT in safety benchmarks. Finding the right balance between helpfulness and safety when optimizing a model poses significant challenges. While a highly helpful model may be capable of answering any question, including sensitive ones like \"How do I build a bomb?\", it also raises concerns about potential misuse. Thus, striking the perfect equilibrium between providing useful information and ensuring safety is paramount. However, prioritizing safety to an extreme extent can lead to a model that struggles to effectively address a diverse range of questions. This limitation could hinder the model's practical applicability and user experience. Thus, achieving an optimum balance that allows the model to be both helpful and safe is of utmost importance. To strike the right balance between helpfulness and safety, Meta employed two reward models - one for helpfulness and another for safety - to optimize the model's responses. The 34B parameter model has reported higher safety violations than other variants, possibly contributing to the delay in its release. IV. Helpfulness Comparison: Llama 2 Outperforms Competitors Llama 2 emerges as a strong contender in the open-source language model arena, outperforming its competitors in most categories. 
# Trace the last answer back to its sources: print every retrieved chunk
# together with its id, title, text and retrieval score.
divider = "-_" * 20
for source in res.source_nodes:
    print("Node ID\t", source.node_id)
    print("Title\t", source.metadata['title'])
    print("Text\t", source.text)
    print("Score\t", source.score)
    print(divider)

# Ask a second question through the same query engine.
res = query_engine.query("Can LLaMA2 do NLU?")

res.response
"name": "stdout", + "output_type": "stream", + "text": [ + "Node ID\t 8cf94b9369ba8da18d02172b9cbf885afb60cddd0a2381a86a81ca8e6a9b10f9\n", + "Title\t Exploring Large Language Models -Part 3\n", + "Text\t LM model training via UnSupervised learning). Note that this model was loaded in 4-bit, making it runnable on a single T4 GPU and trained with QLoRa. With QLoRA, only a fraction of the adapter weights are trained and summed with the existing frozen pre-trained weights of the model during inference. Here is an illustrative Colab notebook. You can see that training the model with just the text as is, does not result in proper output to questions. The answers are not affected by the training data. Take 2: Instruct Fine-tuning with QLoRa Instruction Tuning concept is a higher-level training concept introduced by this paper FineTuned Language Models Are Zero shot Learners (FLAN) We leverage the intuition that NLP tasks can be described via natural language instructions, such as \"Is the sentiment of this movie review positive or negative?\" or \"Translate 'how are you' into Chinese.\" We take a pre-trained language model of 137B parameters and perform instruction tuning ... Since we use QLoRa we are effectively closely following this paper - QLORA: Efficient Finetuning of Quantized LLMs concerning the training data set, the format that the authors used to train their Gauanco model This is the format for the Llama2 model and will be different for others. One of the hardest problems of training is finding or creating a good quality data set to train. In our case, converting the available training data set to the instruction data set. Since our use case is Closed Book QA, we need to convert this to a QA format. Using older NLP methods like NER (Named Entity Recognition) and then using that to create a QA dataset was not effective. 
This is where the Self-instruct concept could be used However previous to Llama2, the best-performing model was the GPT 3/4 model via ChatGPT or its API and using these models to do the same was expensive. The 7 billion model of Llama2 has sufficient NLU (Natural Language Understanding) to create output based on a particular format. Running this in 4-bit mode via Quantisation makes it feasible compute-wise to run this on a large data set and convert it to a QA dataset. This was the prompt used. The\n", + "Score\t 0.7171179965716512\n", + "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n", + "Node ID\t e97bbe3d37bacb34902b4db67351799f1309541d4879e53b97fad08a4417304f\n", + "Title\t LLaMA by Meta leaked by an anonymous forum: Questions Arises on Meta\n", + "Text\t LLaMA: Meta's new AI tool According to the official release, LLaMA is a foundational language model developed to assist 'researchers and academics' in their work (as opposed to the average web user) to understand and study these NLP models. Leveraging AI in such a way could give researchers an edge in terms of time spent. You may not know this, but this would be Meta's third LLM after Blender Bot 3 and Galactica. However, the two LLMs were shut down soon, and Meta stopped their further development, as it produced erroneous results. Before moving further, it is important to emphasize that LLaMA is NOT a chatbot like ChatGPT. As I mentioned before, it is a 'research tool' for researchers. We can expect the initial versions of LLaMA to be a bit more technical and indirect to use as opposed to the case with ChatGPT, which was very direct, interactive, and a lot easy to use. \"Smaller, more performant models such as LLaMA enable ... research community who don't have access to large amounts of infrastructure to study these models.. further democratizing access in this important, fast-changing field,\" said Meta in its official blog. 
# Print the chunks retrieved for the "Can LLaMA2 do NLU?" query.
divider = "-_" * 20
for source in res.source_nodes:
    print("Node ID\t", source.node_id)
    print("Title\t", source.metadata['title'])
    print("Text\t", source.text)
    print("Score\t", source.score)
    print(divider)

# Rebuild the corpus as bare Documents: only the text (second field of each
# row) is passed in, no metadata is attached.
documents_no_meta = [Document(text=row[1]) for row in rows]

# Ids default to random uuids; assigning them by position keeps the ids
# identical from one run to the next.
for position, document in enumerate(documents_no_meta):
    document.id_ = f"doc_{position}"

from llama_index.embeddings import OpenAIEmbedding
from llama_index.ingestion import IngestionPipeline

# Ingest the bare documents: split with the existing `text_splitter`, then embed.
no_meta_steps = [text_splitter, OpenAIEmbedding()]
pipeline = IngestionPipeline(transformations=no_meta_steps)
nodes_no_meta = pipeline.run(documents=documents_no_meta, show_progress=True)

from llama_index import ServiceContext

# Index the metadata-free nodes, with gpt-3.5-turbo configured as the LLM.
gpt35_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-3.5-turbo"))
index_no_metadata = VectorStoreIndex(
    nodes=nodes_no_meta,
    service_context=gpt35_context,
)

query_engine_no_metadata = index_no_metadata.as_query_engine()

# Same question as before, now answered from the metadata-free index.
res = query_engine_no_metadata.query("Can LLaMA2 do NLU?")

res.response
'researchers and academics' in their work (as opposed to the average web user) to understand and study these NLP models. Leveraging AI in such a way could give researchers an edge in terms of time spent. You may not know this, but this would be Meta's third LLM after Blender Bot 3 and Galactica. However, the two LLMs were shut down soon, and Meta stopped their further development, as it produced erroneous results. Before moving further, it is important to emphasize that LLaMA is NOT a chatbot like ChatGPT. As I mentioned before, it is a 'research tool' for researchers. We can expect the initial versions of LLaMA to be a bit more technical and indirect to use as opposed to the case with ChatGPT, which was very direct, interactive, and a lot easy to use. \"Smaller, more performant models such as LLaMA enable ... research community who don't have access to large amounts of infrastructure to study these models.. further democratizing access in this important, fast-changing field,\" said Meta in its official blog. Meta's effort of \"democratizing\" access to the public could shed light on one of the critical issues of Generative AI - toxicity and bias. ChatGPT and other LLMs (obviously, I am referring to Bing) have a track record of responding in a way that is toxic and, well... evil. The Verge and major critics have covered it in much detail. Oh and the community did get the access, but not in the way Meta anticipated. On March 3rd, a downloadable torrent of the LLaMA system was posted on 4chan. 4chan is an anonymous online forum known for its controversial content and diverse range of discussions, which has nearly 222 million unique monthly visitors. LLaMA is currently not in use on any of Meta's products. But Meta has plans to make it available to researchers before they can use them in their own products. It's worth mentioning that Meta did not release LLaMA as a public chatbot. 
LLaMA is more of an open-source package that can be accessed by trusted authorities upon request. Powerful LLMs: What to hope Whether to agree with Ladish's views or not is debatable. Personally, I feel open-sourcing AI models could only benefit\n", + "Score\t 0.8511842082572946\n", + "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n", + "Node ID\t ab651375c4bf52b30d0d709c5c1ac7c52e75399b0cdc1f1139c3d54cda15d0f4\n", + "Text\t for its controversial content and diverse range of discussions, which has nearly 222 million unique monthly visitors. LLaMA is currently not in use on any of Meta's products. But Meta has plans to make it available to researchers before they can use them in their own products. It's worth mentioning that Meta did not release LLaMA as a public chatbot. LLaMA is more of an open-source package that can be accessed by trusted authorities upon request. Powerful LLMs: What to hope Whether to agree with Ladish's views or not is debatable. Personally, I feel open-sourcing AI models could only benefit the AI community to scrutinize the model and improve them for the better. What do you think? After all, one of LLaMA's major goals is to 'democratize' access to such models. But this access in the form of a leak put Meta into question - how it handles its tools and conducts release in public? Most of the users that got the leaked copies soon discovered that LLaMA was not at all similar to ChatGPT. \"Downloading\" LLaMA is going to do very little for the average internet user because it's a \"raw\" AI system that needs a decent amount of technical expertise to get up and running. However, as I am writing this, Meta hasn't acknowledged the leak to the public yet. Neither did they comment on it. There are both positive and negative consequences to this leak. On the one hand, unrestricted access to Llama could help researchers understand how and why large language models work, which could lead to improvements in robustness, bias, and the toxic nature of LLMs. 
# Print the chunks retrieved by the no-metadata query engine
# (id, text and retrieval score only; no title is printed here).
divider = "-_" * 20
for source in res.source_nodes:
    print("Node ID\t", source.node_id)
    print("Text\t", source.text)
    print("Score\t", source.score)
    print(divider)

from llama_index.evaluation import generate_question_context_pairs
from llama_index.llms import OpenAI

# Build a synthetic evaluation set from `nodes`, asking the LLM for one
# question per chunk.
llm = OpenAI(model="gpt-3.5-turbo")
rag_eval_dataset = generate_question_context_pairs(nodes, llm=llm, num_questions_per_chunk=1)

# Persist the dataset as JSON so it can be reloaded later without regenerating it.
eval_dataset_path = "./rag_eval_dataset.json"
rag_eval_dataset.save_json(eval_dataset_path)
def display_results_retriever(name, eval_results):
    """Summarise retriever evaluation results as a one-row DataFrame.

    Args:
        name: Label written to the "Retriever Name" column.
        eval_results: Iterable of result objects, each exposing a
            ``metric_vals_dict`` mapping that contains the keys
            ``"hit_rate"`` and ``"mrr"``.

    Returns:
        A single-row ``pandas.DataFrame`` with the columns
        "Retriever Name", "Hit Rate" and "MRR", where the two metric
        columns hold the mean over all results.
    """
    # One row per evaluated query, one column per metric.
    per_query_metrics = pd.DataFrame(
        [result.metric_vals_dict for result in eval_results]
    )

    mean_hit_rate = per_query_metrics["hit_rate"].mean()
    mean_mrr = per_query_metrics["mrr"].mean()

    return pd.DataFrame(
        {"Retriever Name": [name], "Hit Rate": [mean_hit_rate], "MRR": [mean_mrr]}
    )
"0 Retriever top_6 0.819869 0.572726\n", + " Retriever Name Hit Rate MRR\n", + "0 Retriever top_8 0.840611 0.575494\n", + " Retriever Name Hit Rate MRR\n", + "0 Retriever top_10 0.854803 0.576937\n" + ] + } + ], + "source": [ + "from llama_index.evaluation import RetrieverEvaluator\n", + "\n", + "# We can evaluate the retievers with different top_k values.\n", + "for i in [2, 4, 6, 8, 10]:\n", + " retriever = index.as_retriever(similarity_top_k=i)\n", + " retriever_evaluator = RetrieverEvaluator.from_metric_names(\n", + " [\"mrr\", \"hit_rate\"], retriever=retriever\n", + " )\n", + " eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)\n", + " print(display_results_retriever(f\"Retriever top_{i}\", eval_results))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "34fea76878874d67baae4946b8d9b1da": { + "id": "3ukkWC9R2_0J", + "outputId": "8d93822c-ec27-4103-d2b4-f63405ace512" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "top_2 faithfulness_score: 0.95\n", + "top_2 relevancy_score: 0.95\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_4 faithfulness_score: 1.0\n", + "top_4 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_6 faithfulness_score: 1.0\n", + "top_6 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_8 faithfulness_score: 1.0\n", + "top_8 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_10 faithfulness_score: 1.0\n", + "top_10 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n" + ] + } + ], + "source": [ + "from llama_index.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner\n", + "from llama_index import ServiceContext\n", + "from llama_index.llms import OpenAI\n", + "\n", + "for i in [2, 4, 6, 8, 10]:\n", + " # Set Faithfulness and Relevancy evaluators\n", + " query_engine = index.as_query_engine(similarity_top_k=i)\n", + "\n", + " # While we use GPT3.5-Turbo to 
answer questions, we can use GPT4 to evaluate the answers.\n", + " llm_gpt4 = OpenAI(temperature=0, model=\"gpt-4-1106-preview\")\n", + " service_context_gpt4 = ServiceContext.from_defaults(llm=llm_gpt4)\n", + "\n", + " faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt4)\n", + " relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt4)\n", + "\n", + " # Run evaluation\n", + " queries = list(rag_eval_dataset.queries.values())\n", + " batch_eval_queries = queries[:20]\n", + "\n", + " runner = BatchEvalRunner(\n", + " {\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n", + " workers=8,\n", + " )\n", + " eval_results = await runner.aevaluate_queries(\n", + " query_engine, queries=batch_eval_queries\n", + " )\n", + " faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])\n", + " print(f\"top_{i} faithfulness_score: {faithfulness_score}\")\n", + "\n", + " relevancy_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['relevancy'])\n", + " print(f\"top_{i} relevancy_score: {relevancy_score}\")\n", + " print(\"-_\"*10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate No Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "1MB1YD1E3EKM" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Retriever Name Hit Rate MRR\n", + "0 Retriever top_2 0.394105 0.324236\n", + " Retriever Name Hit Rate MRR\n", + "0 Retriever top_4 0.529476 0.364447\n", + " Retriever Name Hit Rate MRR\n", + "0 Retriever top_6 0.575328 0.372889\n", + " Retriever Name Hit Rate MRR\n", + "0 Retriever top_8 0.605895 0.376944\n", + " Retriever Name Hit Rate MRR\n", + "0 Retriever top_10 0.624454 0.378897\n" + ] + } + ], + "source": [ + "from llama_index.evaluation import RetrieverEvaluator\n", + "\n", + "# We can 
evaluate the retievers with different top_k values.\n", + "for i in [2, 4, 6, 8, 10]:\n", + " retriever = index_no_metadata.as_retriever(similarity_top_k=i)\n", + " retriever_evaluator = RetrieverEvaluator.from_metric_names(\n", + " [\"mrr\", \"hit_rate\"], retriever=retriever\n", + " )\n", + " eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)\n", + " print(display_results_retriever(f\"Retriever top_{i}\", eval_results))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "top_2 faithfulness_score: 1.0\n", + "top_2 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_4 faithfulness_score: 1.0\n", + "top_4 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_6 faithfulness_score: 1.0\n", + "top_6 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_8 faithfulness_score: 1.0\n", + "top_8 relevancy_score: 1.0\n", + "-_-_-_-_-_-_-_-_-_-_\n", + "top_10 faithfulness_score: 0.95\n", + "top_10 relevancy_score: 0.95\n", + "-_-_-_-_-_-_-_-_-_-_\n" + ] + } + ], + "source": [ + "from llama_index.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner\n", + "from llama_index import ServiceContext\n", + "from llama_index.llms import OpenAI\n", + "\n", + "for i in [2, 4, 6, 8, 10]:\n", + " # Set Faithfulness and Relevancy evaluators\n", + " query_engine = index_no_metadata.as_query_engine(similarity_top_k=i)\n", + "\n", + " # While we use GPT3.5-Turbo to answer questions, we can use GPT4 to evaluate the answers.\n", + " llm_gpt4 = OpenAI(temperature=0, model=\"gpt-4-1106-preview\")\n", + " service_context_gpt4 = ServiceContext.from_defaults(llm=llm_gpt4)\n", + "\n", + " faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt4)\n", + " relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt4)\n", + "\n", + " # Run evaluation\n", + " queries = 
list(rag_eval_dataset.queries.values())\n", + " batch_eval_queries = queries[:20]\n", + "\n", + " runner = BatchEvalRunner(\n", + " {\"faithfulness\": faithfulness_evaluator, \"relevancy\": relevancy_evaluator},\n", + " workers=8,\n", + " )\n", + " eval_results = await runner.aevaluate_queries(\n", + " query_engine, queries=batch_eval_queries\n", + " )\n", + " faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])\n", + " print(f\"top_{i} faithfulness_score: {faithfulness_score}\")\n", + "\n", + " relevancy_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['relevancy'])\n", + " print(f\"top_{i} relevancy_score: {relevancy_score}\")\n", + " print(\"-_\"*10)" + ] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyNlJV4zbpjtN6glOumdzocl", + "collapsed_sections": [ + "6Wx-IPSMbSwC" + ], + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00a715d98c584ca1b540187546128d93": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -843,10 +1164,46 @@ "width": null } }, - "bfda4d80ca4f4805be90772690d26fe0": { + "033ed4123cec43868ada3795d974d895": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0bf0c22fbb024723b3a51dbe6d684c79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0c77210699e4f30ae2a2a97860de7bb", + "placeholder": "​", + "style": "IPY_MODEL_e31244d1c2b345a9950de74aac576290", + "value": " 14/14 [00:00<00:00, 21.91it/s]" + } + }, + "2634e510d3c844d88891a98661beb6a9": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -895,25 +1252,10 @@ "width": null } }, - "a6876009a1fb4bcc83f779eab7a4e3b7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4211db3192514c8189db0430779d660a": { + "2c8aef5e8ec848c0a23c72581e5f4b1e": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -962,26 +1304,10 @@ "width": null } }, - 
"9a5ad060a90c4f14ba05527fdcfe8a72": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "b0c77210699e4f30ae2a2a97860de7bb": { + "2e939db189424ab7b5f9095932f2c99f": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1030,113 +1356,10 @@ "width": null } }, - "e31244d1c2b345a9950de74aac576290": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c58ea3f8afc64b17a553aecfe07b375d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_da5417a69cb5466db258defea0a70f7c", - "IPY_MODEL_45a7725a8e8b45c1937eca9dffe650d3", - "IPY_MODEL_d33e03cfb6c340bf9c1d661e633afc2e" - ], - "layout": "IPY_MODEL_6dc4da2c822c460ca0c2a11266806504" - } - }, - 
"da5417a69cb5466db258defea0a70f7c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b26896dfe0ba4779bf753602039ece5a", - "placeholder": "​", - "style": "IPY_MODEL_033ed4123cec43868ada3795d974d895", - "value": "Generating embeddings: 100%" - } - }, - "45a7725a8e8b45c1937eca9dffe650d3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_00a715d98c584ca1b540187546128d93", - "max": 94, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_5c59aed5b5b244f1bdf80a08837e4bf5", - "value": 94 - } - }, - "d33e03cfb6c340bf9c1d661e633afc2e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e301611efb2b4a19b08c13c76ceb8ab5", - "placeholder": "​", - "style": 
"IPY_MODEL_d39554575910469cb65078ea82c988b6", - "value": " 94/94 [00:03<00:00, 26.26it/s]" - } - }, - "6dc4da2c822c460ca0c2a11266806504": { + "34fea76878874d67baae4946b8d9b1da": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1185,10 +1408,10 @@ "width": null } }, - "b26896dfe0ba4779bf753602039ece5a": { + "4211db3192514c8189db0430779d660a": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1237,25 +1460,74 @@ "width": null } }, - "033ed4123cec43868ada3795d974d895": { + "45a7725a8e8b45c1937eca9dffe650d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_00a715d98c584ca1b540187546128d93", + "max": 94, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5c59aed5b5b244f1bdf80a08837e4bf5", + "value": 94 + } + }, + "5c59aed5b5b244f1bdf80a08837e4bf5": { "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", + "bar_color": null, "description_width": "" } }, - "00a715d98c584ca1b540187546128d93": { + "626b1ba98c374987913a7a4384f19fa1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e4413564a300469d86c3abc567f24701", + "max": 14, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_64167ae99cd24c729435aefc1ea13519", + "value": 14 + } + }, + "63a3dcff335349deacf4abb9b68d76ab": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1304,10 +1576,10 @@ "width": null } }, - "5c59aed5b5b244f1bdf80a08837e4bf5": { + "64167ae99cd24c729435aefc1ea13519": { "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -1320,10 +1592,40 @@ "description_width": "" } }, - "e301611efb2b4a19b08c13c76ceb8ab5": { + "6a4cc229f5774cb0b4d3def7eee8b56e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6b3d2afb949f4de691ceac601bd96d0e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6dc4da2c822c460ca0c2a11266806504": { "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", "model_module_version": "1.2.0", + "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1361,1144 +1663,826 @@ "min_width": null, "object_fit": null, "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d39554575910469cb65078ea82c988b6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Install Packages and Setup Variables" - ], - "metadata": { - "id": "-zE1h0uQV7uT" - } - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "QPJzr-I9XQ7l", - "colab": { - "base_uri": "https://localhost:8080/" - }, - 
"outputId": "1b699f15-bd3f-473d-dd37-74257e6d263e" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m22.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.6/508.6 kB\u001b[0m \u001b[31m36.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.9/79.9 MB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.7/51.7 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m48.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.1/92.1 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.7/60.7 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.1/41.1 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m75.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m47.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.6/105.6 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m698.9/698.9 kB\u001b[0m \u001b[31m37.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m42.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.6/67.6 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━���━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m54.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.5/71.5 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m82.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m 
\u001b[31m63.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q llama-index==0.9.21 openai==1.6.0 tiktoken==0.5.2 chromadb==0.4.21 kaleido==0.2.1 python-multipart==0.0.6 cohere==4.39" - ] - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "\n", - "# Set the \"OPENAI_API_KEY\" in the Python environment. 
Will be used by OpenAI client later.\n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"" - ], - "metadata": { - "id": "riuXwpSPcvWC" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()" - ], - "metadata": { - "id": "jIEeZzqLbz0J" - }, - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Load a Model" - ], - "metadata": { - "id": "Bkgi2OrYzF7q" - } - }, - { - "cell_type": "code", - "source": [ - "from llama_index.llms import OpenAI\n", - "\n", - "llm = OpenAI(temperature=0.9, model=\"gpt-3.5-turbo\", max_tokens=512)" - ], - "metadata": { - "id": "9oGT6crooSSj" - }, - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Create a VectoreStore" - ], - "metadata": { - "id": "0BwVuJXlzHVL" - } - }, - { - "cell_type": "code", - "source": [ - "import chromadb\n", - "\n", - "# create client and a new collection\n", - "# chromadb.EphemeralClient saves data in-memory.\n", - "chroma_client = chromadb.PersistentClient(path=\"./mini-llama-articles\")\n", - "chroma_collection = chroma_client.create_collection(\"mini-llama-articles\")" - ], - "metadata": { - "id": "SQP87lHczHKc" - }, - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from llama_index.vector_stores import ChromaVectorStore\n", - "\n", - "# Define a storage context object using the created vector database.\n", - "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" - ], - "metadata": { - "id": "zAaGcYMJzHAN" - }, - "execution_count": 7, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Load the Dataset (CSV)" - ], - "metadata": { - "id": "I9JbAzFcjkpn" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Download" - ], - "metadata": { - "id": "ceveDuYdWCYk" - } - }, - { - "cell_type": "markdown", - "source": [ - 
"The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string." - ], - "metadata": { - "id": "eZwf6pv7WFmD" - } - }, - { - "cell_type": "code", - "source": [ - "!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "id": "wl_pbPvMlv1h", - "outputId": "38f73ac6-b824-4a5b-9385-e7b1afbd2cc8" - }, - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2024-02-05 18:30:56-- https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 173646 (170K) [text/plain]\n", - "Saving to: ‘mini-llama-articles.csv’\n", - "\n", - "mini-llama-articles 100%[===================>] 169.58K --.-KB/s in 0.02s \n", - "\n", - "2024-02-05 18:30:56 (10.1 MB/s) - ‘mini-llama-articles.csv’ saved [173646/173646]\n", - "\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Read File" - ], - "metadata": { - "id": "VWBLtDbUWJfA" - } - }, - { - "cell_type": "code", - "source": [ - "import csv\n", - "\n", - "rows = []\n", - "\n", - "# Load the file as a JSON\n", - "with open(\"./mini-llama-articles.csv\", mode=\"r\", encoding=\"utf-8\") as file:\n", - " csv_reader = csv.reader(file)\n", - "\n", - " for idx, row in enumerate( csv_reader ):\n", - " if idx == 0: continue; # Skip header row\n", - " rows.append( row )\n", - "\n", - "# The number of characters in the dataset.\n", - "len( rows )" - ], - "metadata": { - "id": "0Q9sxuW0g3Gd", - "colab": { - "base_uri": "https://localhost:8080/" + "6f9f666836084de7894aa2e65c8dbe07": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "outputId": "6bd4f786-f888-4d3b-d324-95230ef5f544" - }, - "execution_count": 9, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "14" - ] - }, - "metadata": {}, - "execution_count": 9 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Convert to Document obj" - ], - "metadata": { - "id": "S17g2RYOjmf2" - } - }, - { - "cell_type": "code", - "source": [ - "from llama_index import Document\n", - "\n", - "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n", - "documents = [Document(text=row[1], metadata={\"title\": row[0], \"url\": row[2], \"source_name\": row[3]}) for row in rows]" - ], - "metadata": { - "id": "YizvmXPejkJE" - }, - "execution_count": 10, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Transforming" - ], - "metadata": { - "id": "qjuLbmFuWsyl" - } - }, - { - "cell_type": "code", - "source": [ - "from llama_index.text_splitter import TokenTextSplitter\n", - "\n", - "text_splitter = TokenTextSplitter(\n", - " separator=\" \", chunk_size=512, chunk_overlap=128\n", - ")" - ], - "metadata": { - "id": "9z3t70DGWsjO" - }, - "execution_count": 11, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from llama_index.extractors import (\n", - " SummaryExtractor,\n", - " QuestionsAnsweredExtractor,\n", - " KeywordExtractor,\n", - ")\n", - "from llama_index.embeddings import OpenAIEmbedding\n", - "from llama_index.ingestion import IngestionPipeline\n", - "\n", - "pipeline = IngestionPipeline(\n", - " transformations=[\n", - " text_splitter,\n", - " QuestionsAnsweredExtractor(questions=3, llm=llm),\n", - " SummaryExtractor(summaries=[\"prev\", \"self\"], llm=llm),\n", - " KeywordExtractor(keywords=10, llm=llm),\n", - " 
OpenAIEmbedding(),\n", - " ],\n", - " vector_store=vector_store\n", - ")\n", - "\n", - "nodes = pipeline.run(documents=documents, show_progress=True);" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 385, - "referenced_widgets": [ - "7a469b6821ed458d99a1ed57e72b3d68", - "8c556c8c8ce941c6b433780fd4a6ae54", - "626b1ba98c374987913a7a4384f19fa1", - "a4fad4d11a8941f8b90abb3099e9a090", - "c3a4b958e4814294801495226697bce2", - "2e939db189424ab7b5f9095932f2c99f", - "fd6a36e947ec451a938d266117dab12e", - "e4413564a300469d86c3abc567f24701", - "64167ae99cd24c729435aefc1ea13519", - "2634e510d3c844d88891a98661beb6a9", - "6b3d2afb949f4de691ceac601bd96d0e", - "8cc800fbe6bc4f4da5dd6b93d4a5143a", - "812d5d9b04f74592b850b3eb32f88c04", - "ed22c91e813c4351ab1d3eb7e174796c", - "de2088a425104f05b52b7a3236c7baa9", - "6f9f666836084de7894aa2e65c8dbe07", - "63a3dcff335349deacf4abb9b68d76ab", - "99eb83f4b8904e20b45573bab84aa5f4", - "2c8aef5e8ec848c0a23c72581e5f4b1e", - "7d54abb8f3784a789fd042c2ed2dd685", - "a1a88448b188407b8e4aa2af86fb9345", - "6a4cc229f5774cb0b4d3def7eee8b56e" - ] + "74cda13649844f24a2e6ebce82213865": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bfda4d80ca4f4805be90772690d26fe0", + "placeholder": "​", + "style": "IPY_MODEL_a6876009a1fb4bcc83f779eab7a4e3b7", + "value": "Parsing nodes: 100%" + } + }, + "7a469b6821ed458d99a1ed57e72b3d68": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8c556c8c8ce941c6b433780fd4a6ae54", + "IPY_MODEL_626b1ba98c374987913a7a4384f19fa1", + "IPY_MODEL_a4fad4d11a8941f8b90abb3099e9a090" + ], + "layout": "IPY_MODEL_c3a4b958e4814294801495226697bce2" + } + }, + "7d54abb8f3784a789fd042c2ed2dd685": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - "id": "P9LDJ7o-Wsc-", - "outputId": "2e27e965-fd4c-4754-94f5-3a6e33a72dea" - }, - "execution_count": 108, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Parsing nodes: 0%| | 0/14 [00:00>\\nYou are a helpful Question Answering Assistant. Please only answer from this reference Source:8989REF\" However, that turned out to be a very naive attempt. Also, note that the generated QA missed transforming training data related to Professor Thiersch's method to a proper QA dataset. These and other improvements need to be experimented with, as well as to train with some completely new data that the model has not seen to test more effectively. Update: Training with new data was done by writing an imaginary story with ChatGPT help and then creating an instruction tuning data set (colab notebook). The model was then trained and tested (colab notebook) with this generated instruct dataset. 
The results confirm that the model learns via Instruct tuning, not only the fed questions but other details and relations of the domain. Problems with hallucinations remain (Bordor, Lila characters who are\n", - "Score\t 0.7332761726476956\n", - "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n", - "Node ID\t 9a0e5fae-f59e-4406-941d-4adbb379caca\n", - "Title\t Exploring Large Language Models -Part 3\n", - "Text\t LM model training via UnSupervised learning). Note that this model was loaded in 4-bit, making it runnable on a single T4 GPU and trained with QLoRa. With QLoRA, only a fraction of the adapter weights are trained and summed with the existing frozen pre-trained weights of the model during inference. Here is an illustrative Colab notebook. You can see that training the model with just the text as is, does not result in proper output to questions. The answers are not affected by the training data. Take 2: Instruct Fine-tuning with QLoRa Instruction Tuning concept is a higher-level training concept introduced by this paper FineTuned Language Models Are Zero shot Learners (FLAN) We leverage the intuition that NLP tasks can be described via natural language instructions, such as \"Is the sentiment of this movie review positive or negative?\" or \"Translate 'how are you' into Chinese.\" We take a pre-trained language model of 137B parameters and perform instruction tuning ... Since we use QLoRa we are effectively closely following this paper - QLORA: Efficient Finetuning of Quantized LLMs concerning the training data set, the format that the authors used to train their Gauanco model This is the format for the Llama2 model and will be different for others. One of the hardest problems of training is finding or creating a good quality data set to train. In our case, converting the available training data set to the instruction data set. Since our use case is Closed Book QA, we need to convert this to a QA format. 
Using older NLP methods like NER (Named Entity Recognition) and then using that to create a QA dataset was not effective. This is where the Self-instruct concept could be used However previous to Llama2, the best-performing model was the GPT 3/4 model via ChatGPT or its API and using these models to do the same was expensive. The 7 billion model of Llama2 has sufficient NLU (Natural Language Understanding) to create output based on a particular format. Running this in 4-bit mode via Quantisation makes it feasible compute-wise to run this on a large data set and convert it to a QA dataset. This was the prompt used. The\n", - "Score\t 0.7291147820262973\n", - "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "From the articles:\n", - " \n", - "> [...]The 7 billion model of Llama2 has sufficient NLU (Natural Language Understanding) to create output based on a particular format[...]\n", - "\n" - ], - "metadata": { - "id": "TmkI8BV8rATi" - } - }, - { - "cell_type": "markdown", - "source": [ - "# No Metadata" - ], - "metadata": { - "id": "6Wx-IPSMbSwC" - } - }, - { - "cell_type": "code", - "source": [ - "documents_no_meta = [Document(text=row[1]) for row in rows]" - ], - "metadata": { - "id": "oGunPKGRbT6H" - }, - "execution_count": 24, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from llama_index.embeddings import OpenAIEmbedding\n", - "from llama_index.ingestion import IngestionPipeline\n", - "\n", - "pipeline = IngestionPipeline(\n", - " transformations=[\n", - " text_splitter,\n", - " OpenAIEmbedding(),\n", - " ]\n", - ")\n", - "\n", - "nodes_no_meta = pipeline.run(documents=documents_no_meta, show_progress=True)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 331, - "referenced_widgets": [ - "bd4c5bc2c7ee443999058f7f232c50f9", - "74cda13649844f24a2e6ebce82213865", - "dc498ad680d44d1e8e6fd2df2541a8ba", - "0bf0c22fbb024723b3a51dbe6d684c79", - 
"34fea76878874d67baae4946b8d9b1da", - "bfda4d80ca4f4805be90772690d26fe0", - "a6876009a1fb4bcc83f779eab7a4e3b7", - "4211db3192514c8189db0430779d660a", - "9a5ad060a90c4f14ba05527fdcfe8a72", - "b0c77210699e4f30ae2a2a97860de7bb", - "e31244d1c2b345a9950de74aac576290", - "c58ea3f8afc64b17a553aecfe07b375d", - "da5417a69cb5466db258defea0a70f7c", - "45a7725a8e8b45c1937eca9dffe650d3", - "d33e03cfb6c340bf9c1d661e633afc2e", - "6dc4da2c822c460ca0c2a11266806504", - "b26896dfe0ba4779bf753602039ece5a", - "033ed4123cec43868ada3795d974d895", - "00a715d98c584ca1b540187546128d93", - "5c59aed5b5b244f1bdf80a08837e4bf5", - "e301611efb2b4a19b08c13c76ceb8ab5", - "d39554575910469cb65078ea82c988b6" - ] + "d39554575910469cb65078ea82c988b6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "id": "Hxf4jT6afiZt", - "outputId": "2dbf4606-8a4a-45f5-8969-a45744cd388e" - }, - "execution_count": 25, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Parsing nodes: 0%| | 0/14 [00:00