{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github" }, "source": [ "\"Open\n" ] }, { "cell_type": "markdown", "metadata": { "id": "5BGJ3fxhOk2V" }, "source": [ "# Install Packages and Setup Variables\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QPJzr-I9XQ7l" }, "outputs": [], "source": [ "%pip install -q llama-index==0.10.57 llama-index-vector-stores-chroma==0.1.9 llama-index-llms-gemini==0.1.11 google-generativeai==0.5.4 langchain==0.1.17 langchain-chroma==0.1.0 langchain_openai==0.1.5 openai==1.37.0 chromadb==0.5.3" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "riuXwpSPcvWC" }, "outputs": [], "source": [ "import os\n", "from getpass import getpass\n", "\n", "# The OpenAI and Google API keys are used by the clients created later in the notebook.\n", "# Keep a key that is already present in the environment and prompt only for a missing one,\n", "# so real keys are never written into (or blanked out by) the notebook itself.\n", "for key_name in (\"OPENAI_API_KEY\", \"GOOGLE_API_KEY\"):\n", "    if not os.environ.get(key_name):\n", "        os.environ[key_name] = getpass(f\"Enter {key_name}: \")" ] }, { "cell_type": "markdown", "metadata": { "id": "I9JbAzFcjkpn" }, "source": [ "# Load the Dataset (CSV)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "_Tif8-JoRH68" }, "source": [ "## Download\n" ] }, { "cell_type": "markdown", "metadata": { "id": "4fQaa1LN1mXL" }, "source": [ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. 
Read the dataset as a long string.\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-QTUkdfJjY4N", "outputId": "a88b2f8a-0c84-45a0-9b32-5088fe596612" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 169k 100 169k 0 0 1581k 0 --:--:-- --:--:-- --:--:-- 1584k\n" ] } ], "source": [ "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv" ] }, { "cell_type": "markdown", "metadata": { "id": "zk-4alIxROo8" }, "source": [ "## Read File\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7CYwRT6R0o0I", "outputId": "351f170f-9a00-4b09-ae08-b45c3c48fce5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "171044\n" ] } ], "source": [ "import csv\n", "\n", "# Read the CSV file, skip its header row, and concatenate the second column of every\n", "# remaining row into one long string.\n", "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"utf-8\") as file:\n", "    csv_reader = csv.reader(file)\n", "    next(csv_reader, None)  # skip the header row (no-op for an empty file)\n", "    # One join instead of repeated `+=` concatenation inside a loop.\n", "    text = \"\".join(row[1] for row in csv_reader)\n", "\n", "# The number of characters in the dataset.\n", "print(len(text))" ] }, { "cell_type": "markdown", "metadata": { "id": "S17g2RYOjmf2" }, "source": [ "# Chunking\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "STACTMUR1z9N", "outputId": "15a61eac-8774-4cdb-db8d-e2eb5b07e517" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "335\n" ] } ], "source": [ "chunk_size = 512\n", "\n", "# Split the long text into consecutive, non-overlapping chunks of at most\n", "# `chunk_size` characters (the last chunk may be shorter).\n", "chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]\n", "\n", 
"print(len(chunks))" ] }, { "cell_type": "markdown", "metadata": { "id": "9fOomeMGqu10" }, "source": [ "# Interface of Chroma with LlamaIndex\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "CtdsIUQ81_hT" }, "outputs": [], "source": [ "from llama_index.core import Document\n", "\n", "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n", "documents = [Document(text=chunk) for chunk in chunks]" ] }, { "cell_type": "markdown", "metadata": { "id": "OWaT6rL7ksp8" }, "source": [ "Save on Chroma\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "mXi56KTXk2sp" }, "outputs": [], "source": [ "import chromadb\n", "\n", "# PersistentClient stores the database on disk under `path`\n", "# (chromadb.EphemeralClient would keep it in memory only).\n", "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n", "\n", "# The on-disk database survives kernel restarts, so drop any collection left by a\n", "# previous run: otherwise create_collection fails because the name already exists,\n", "# and re-indexing into the old collection would store every chunk twice.\n", "collection_name = \"mini-chunked-dataset\"\n", "existing_names = [getattr(c, \"name\", c) for c in chroma_client.list_collections()]\n", "if collection_name in existing_names:\n", "    chroma_client.delete_collection(collection_name)\n", "chroma_collection = chroma_client.create_collection(collection_name)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "jKXURvLtkuTS" }, "outputs": [], "source": [ "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "from llama_index.core import StorageContext\n", "\n", "# Define a storage context object using the created vector database.\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "WsD52wtrlESi" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/omar/Documents/ai_repos/ai-tutor-rag-system/env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "Parsing nodes: 100%|██████████| 335/335 [00:00<00:00, 8031.85it/s]\n", "Generating embeddings: 100%|██████████| 335/335 [00:03<00:00, 97.24it/s] \n" ] } ], "source": [ "from llama_index.core import VectorStoreIndex\n", "from llama_index.core.node_parser import SentenceSplitter\n", "from llama_index.embeddings.openai import OpenAIEmbedding\n", "\n", "# Embed the documents with the OpenAI embedding model and build the index on top of\n", "# the storage context defined earlier.\n", "index = VectorStoreIndex.from_documents(\n", "    documents,\n", "    storage_context=storage_context,\n", "    embed_model=OpenAIEmbedding(model=\"text-embedding-3-small\"),\n", "    show_progress=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "8JPD8yAinVSq" }, "source": [ "Query Dataset\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "mzS13x1ZlZ5X" }, "outputs": [], "source": [ "from llama_index.llms.gemini import Gemini\n", "\n", "# The LLM that writes the final answer from the retrieved pieces of text.\n", "llm = Gemini(\n", "    model=\"models/gemini-1.5-flash\",\n", "    temperature=1,\n", "    max_tokens=512,\n", ")\n", "\n", "# The query engine retrieves the 5 most similar chunks for a question and passes\n", "# them to the LLM.\n", "query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AYsQ4uLN_Oxg", "outputId": "5066a06c-77ff-48a2-ee61-3abe2e9755e2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The LLaMA2 model has four different sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters. 
\n", "\n" ] } ], "source": [ "response = query_engine.query(\"How many parameters LLaMA2 model has?\")\n", "print(response)" ] }, { "cell_type": "markdown", "metadata": { "id": "kWK571VNg-qR" }, "source": [ "# Interface of Chroma with LangChain\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "SMPAniL2e4NP" }, "outputs": [], "source": [ "# Aliased so it does not shadow the LlamaIndex `Document` class imported earlier.\n", "from langchain.schema.document import Document as LangChainDocument\n", "\n", "# Convert the chunks to Document objects so the LangChain framework can process them.\n", "# A separate name keeps the LlamaIndex `documents` list intact for re-runs.\n", "langchain_documents = [LangChainDocument(page_content=chunk) for chunk in chunks]" ] }, { "cell_type": "markdown", "metadata": { "id": "QBt8qGxArUPD" }, "source": [ "Save on Chroma\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "2xas7HkuhJ8A" }, "outputs": [], "source": [ "from langchain_chroma import Chroma\n", "from langchain_openai import OpenAIEmbeddings\n", "\n", "# Embed the documents and store them in a Chroma collection.\n", "# Use a directory and collection of their own: the LlamaIndex section already filled\n", "# \"./mini-chunked-dataset\" with vectors from a different embedding model, and vectors\n", "# produced by two different models must not be mixed in one collection.\n", "# Stable ids let a re-run overwrite the same records instead of adding duplicates.\n", "embeddings = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n", "chroma_db = Chroma.from_documents(\n", "    documents=langchain_documents,\n", "    embedding=embeddings,\n", "    ids=[f\"chunk-{i}\" for i in range(len(langchain_documents))],\n", "    persist_directory=\"./mini-chunked-dataset-langchain\",\n", "    collection_name=\"mini-chunked-dataset-langchain\",\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "P8AXJJyBrZWF" }, "source": [ "Query Dataset\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-H64YLxshM2b" }, "outputs": [], "source": [ "from langchain_openai import ChatOpenAI\n", "\n", "# Initializing the LLM model\n", "llm = ChatOpenAI(temperature=0, model=\"gpt-4o-mini\", max_tokens=512)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AxBqPNtthPaa", "outputId": "93c9ad64-1cd1-4f52-c51e-6f3ec5d6542d" }, "outputs": [], "source": [ "from langchain.chains import RetrievalQA\n", "\n", "query = \"How many parameters LLaMA2 model has?\"\n", "retriever = chroma_db.as_retriever(search_kwargs={\"k\": 2})\n", 
"# Define a RetrievalQA chain that is responsible for retrieving related pieces of text,\n", "# and using a LLM to formulate the final answer.\n", "chain = RetrievalQA.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=retriever)\n", "\n", "# Calling the chain object directly is deprecated since LangChain 0.1.0; use invoke().\n", "response = chain.invoke({\"query\": query})\n", "print(response[\"result\"])" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 0 }