{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "-zE1h0uQV7uT" }, "source": [ "# Install Packages and Setup Variables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QPJzr-I9XQ7l", "outputId": "19864102-680b-446b-fb38-7fad066cee09" }, "outputs": [], "source": [ "!pip install -q llama-index==0.10.11 openai==1.12.0 llama-index-finetuning llama-index-embeddings-huggingface llama-index-readers-web tiktoken==0.6.0 chromadb==0.4.22 pandas==2.2.0 html2text sentence_transformers pydantic kaleido==0.2.1" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "riuXwpSPcvWC" }, "outputs": [], "source": [ "import os\n", "\n", "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n", "os.environ[\"OPENAI_API_KEY\"] = \"\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "jIEeZzqLbz0J" }, "outputs": [], "source": [ "# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.\n", "\n", "import nest_asyncio\n", "\n", "nest_asyncio.apply()" ] }, { "cell_type": "markdown", "metadata": { "id": "Bkgi2OrYzF7q" }, "source": [ "# Load a Model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "9oGT6crooSSj" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/louis/Documents/GitHub/ai-tutor-rag-system/.conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from llama_index.llms.openai import OpenAI\n", "\n", "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)" ] }, { "cell_type": "markdown", "metadata": { "id": "0BwVuJXlzHVL" }, "source": [ "# Create a Vector Store" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "SQP87lHczHKc" }, "outputs": [], "source": [ "import chromadb\n", "\n", "# Create a client and a new collection.\n", "# chromadb.PersistentClient saves the data to disk at the given path;\n", "# chromadb.EphemeralClient would keep the data in memory only.\n", "chroma_client = chromadb.PersistentClient(path=\"./mini-llama-articles\")\n", "chroma_collection = chroma_client.create_collection(\"mini-llama-articles\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "zAaGcYMJzHAN" }, "outputs": [], "source": [ "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "\n", "# Wrap the created Chroma collection in a vector store object that LlamaIndex can use.\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" ] }, { "cell_type": "markdown", "metadata": { "id": "I9JbAzFcjkpn" }, "source": [ "# Load the Dataset (CSV)" ] }, { "cell_type": "markdown", "metadata": { "id": "ceveDuYdWCYk" }, "source": [ "## Download" ] }, { "cell_type": "markdown", "metadata": { "id": "eZwf6pv7WFmD" }, "source": [ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. The dataset is downloaded as a CSV file and then read row by row."
] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wl_pbPvMlv1h", "outputId": "5418de57-b95b-4b90-b7d0-a801ea3c73f7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 169k 100 169k 0 0 784k 0 --:--:-- --:--:-- --:--:-- 785k\n" ] } ], "source": [ "!curl -o ./mini-llama-articles.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv" ] }, { "cell_type": "markdown", "metadata": { "id": "VWBLtDbUWJfA" }, "source": [ "## Read File" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0Q9sxuW0g3Gd", "outputId": "801f2ba8-b498-4923-c1cc-c17d3208850c" }, "outputs": [ { "data": { "text/plain": [ "14" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import csv\n", "\n", "rows = []\n", "\n", "# Load the file as a CSV\n", "with open(\"./mini-llama-articles.csv\", mode=\"r\", encoding=\"utf-8\") as file:\n", " csv_reader = csv.reader(file)\n", "\n", " for idx, row in enumerate( csv_reader ):\n", " if idx == 0: continue; # Skip header row\n", " rows.append( row )\n", "\n", "# The number of rows in the dataset (header excluded).\n", "len( rows )" ] }, { "cell_type": "markdown", "metadata": { "id": "S17g2RYOjmf2" }, "source": [ "# Convert to Document Objects" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "YizvmXPejkJE" }, "outputs": [], "source": [ "from llama_index.core import Document\n", "\n", "# Convert the rows to Document objects so the LlamaIndex framework can process them.\n", "documents = [Document(text=row[1], metadata={\"title\": row[0], \"url\": row[2], \"source_name\": row[3]}) for row in rows]" ] }, { "cell_type": "markdown", "metadata": { "id": "qjuLbmFuWsyl" }, "source": [ "# Transforming" ]
}, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "9z3t70DGWsjO" }, "outputs": [], "source": [ "from llama_index.core.text_splitter import TokenTextSplitter\n", "\n", "# Define the splitter object that split the text into segments with 512 tokens,\n", "# with a 128 overlap between the segments.\n", "text_splitter = TokenTextSplitter(\n", " separator=\" \", chunk_size=512, chunk_overlap=128\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 331, "referenced_widgets": [ "3fbabd8a8660461ba5e7bc08ef39139a", "df2365556ae242a2ab1a119f9a31a561", "5f4b9d32df8f446e858e4c289dc282f9", "5b588f83a15d42d9aca888e06bbd95ff", "ad073bca655540809e39f26538d2ec0d", "13b9c5395bca4c3ba21265240cb936cf", "47a4586384274577a726c57605e7f8d9", "96a3bdece738481db57e811ccb74a974", "5c7973afd79349ed997a69120d0629b2", "af9b6ae927dd4764b9692507791bc67e", "134210510d49476e959dd7d032bbdbdc", "5f9bb065c2b74d2e8ded32e1306a7807", "73a06bc546a64f7f99a9e4a135319dcd", "ce48deaf4d8c49cdae92bfdbb3a78df0", "4a172e8c6aa44e41a42fc1d9cf714fd0", "0245f2604e4d49c8bd0210302746c47b", "e956dfab55084a9cbe33c8e331b511e7", "cb394578badd43a89850873ad2526542", "193aef33d9184055bb9223f56d456de6", "abfc9aa911ce4a5ea81c7c451f08295f", "e7937a1bc68441a080374911a6563376", "e532ed7bfef34f67b5fcacd9534eb789" ] }, "id": "P9LDJ7o-Wsc-", "outputId": "01070c1f-dffa-4ab7-ad71-b07b76b12e03" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Parsing nodes: 0%| | 0/14 [00:00