{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github"
},
"source": [
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5BGJ3fxhOk2V"
},
"source": [
"# Install Packages and Setup Variables\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QPJzr-I9XQ7l"
},
"outputs": [],
"source": [
"!pip install -q llama-index==0.10.57 llama-index-vector-stores-chroma==0.1.9 llama-index-llms-gemini==0.1.11 google-generativeai==0.5.4 langchain==0.1.17 langchain-chroma==0.1.0 langchain_openai==0.1.5 openai==1.37.0 chromadb==0.5.3"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "riuXwpSPcvWC"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Set the following API Keys in the Python environment. Will be used later.\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"os.environ[\"GOOGLE_API_KEY\"] = \"\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "I9JbAzFcjkpn"
},
"source": [
"# Load the Dataset (CSV)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_Tif8-JoRH68"
},
"source": [
"## Download\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4fQaa1LN1mXL"
},
"source": [
"The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string.\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-QTUkdfJjY4N",
"outputId": "a88b2f8a-0c84-45a0-9b32-5088fe596612"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 169k 100 169k 0 0 1581k 0 --:--:-- --:--:-- --:--:-- 1584k\n"
]
}
],
"source": [
"!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zk-4alIxROo8"
},
"source": [
"## Read File\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7CYwRT6R0o0I",
"outputId": "351f170f-9a00-4b09-ae08-b45c3c48fce5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"171044\n"
]
}
],
"source": [
"import csv\n",
"\n",
"text = \"\"\n",
"\n",
"# Load the file as a JSON\n",
"with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"utf-8\") as file:\n",
" csv_reader = csv.reader(file)\n",
"\n",
" for idx, row in enumerate(csv_reader):\n",
" if idx == 0:\n",
" continue\n",
" text += row[1]\n",
"\n",
"# The number of characters in the dataset.\n",
"print(len(text))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "S17g2RYOjmf2"
},
"source": [
"# Chunking\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "STACTMUR1z9N",
"outputId": "15a61eac-8774-4cdb-db8d-e2eb5b07e517"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"335\n"
]
}
],
"source": [
"chunk_size = 512\n",
"chunks = []\n",
"\n",
"# Split the long text into smaller manageable chunks of 512 characters.\n",
"for i in range(0, len(text), chunk_size):\n",
" chunks.append(text[i : i + chunk_size])\n",
"\n",
"print(len(chunks))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9fOomeMGqu10"
},
"source": [
"#Interface of Chroma with LlamaIndex\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "CtdsIUQ81_hT"
},
"outputs": [],
"source": [
"from llama_index.core import Document\n",
"\n",
"# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
"documents = [Document(text=t) for t in chunks]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OWaT6rL7ksp8"
},
"source": [
"Save on Chroma\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "mXi56KTXk2sp"
},
"outputs": [],
"source": [
"import chromadb\n",
"\n",
"# create client and a new collection\n",
"# chromadb.EphemeralClient saves data in-memory.\n",
"chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
"chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "jKXURvLtkuTS"
},
"outputs": [],
"source": [
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
"from llama_index.core import StorageContext\n",
"\n",
"# Define a storage context object using the created vector database.\n",
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "WsD52wtrlESi"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/omar/Documents/ai_repos/ai-tutor-rag-system/env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Parsing nodes: 100%|██████████| 335/335 [00:00<00:00, 8031.85it/s]\n",
"Generating embeddings: 100%|██████████| 335/335 [00:03<00:00, 97.24it/s] \n"
]
}
],
"source": [
"from llama_index.core import VectorStoreIndex\n",
"from llama_index.core.node_parser import SentenceSplitter\n",
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
"\n",
"# Build index / generate embeddings using OpenAI embedding model\n",
"index = VectorStoreIndex.from_documents(\n",
" documents,\n",
" embed_model=OpenAIEmbedding(model=\"text-embedding-3-small\"),\n",
" storage_context=storage_context,\n",
" show_progress=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8JPD8yAinVSq"
},
"source": [
"Query Dataset\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "mzS13x1ZlZ5X"
},
"outputs": [],
"source": [
"# Define a query engine that is responsible for retrieving related pieces of text,\n",
"# and using a LLM to formulate the final answer.\n",
"\n",
"from llama_index.llms.gemini import Gemini\n",
"\n",
"llm = Gemini(model=\"models/gemini-1.5-flash\", temperature=1, max_tokens=512)\n",
"\n",
"query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AYsQ4uLN_Oxg",
"outputId": "5066a06c-77ff-48a2-ee61-3abe2e9755e2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The LLaMA2 model has four different sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters. \n",
"\n"
]
}
],
"source": [
"response = query_engine.query(\"How many parameters LLaMA2 model has?\")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kWK571VNg-qR"
},
"source": [
"# Interface of Chroma with LangChain\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "SMPAniL2e4NP"
},
"outputs": [],
"source": [
"from langchain.schema.document import Document\n",
"\n",
"# Convert the chunks to Document objects so the LangChain framework can process them.\n",
"documents = [Document(page_content=t) for t in chunks]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "QBt8qGxArUPD"
},
"source": [
"Save on Chroma\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "2xas7HkuhJ8A"
},
"outputs": [],
"source": [
"from langchain_chroma import Chroma\n",
"from langchain_openai import OpenAIEmbeddings\n",
"\n",
"# Add the documents to chroma DB and create Index / embeddings\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n",
"chroma_db = Chroma.from_documents(\n",
" documents=documents,\n",
" embedding=embeddings,\n",
" persist_directory=\"./mini-chunked-dataset\",\n",
" collection_name=\"mini-chunked-dataset\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "P8AXJJyBrZWF"
},
"source": [
"Query Dataset\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-H64YLxshM2b"
},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"\n",
"# Initializing the LLM model\n",
"llm = ChatOpenAI(temperature=0, model=\"gpt-4o-mini\", max_tokens=512)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AxBqPNtthPaa",
"outputId": "93c9ad64-1cd1-4f52-c51e-6f3ec5d6542d"
},
"outputs": [],
"source": [
"from langchain.chains import RetrievalQA\n",
"\n",
"query = \"How many parameters LLaMA2 model has?\"\n",
"retriever = chroma_db.as_retriever(search_kwargs={\"k\": 2})\n",
"# Define a RetrievalQA chain that is responsible for retrieving related pieces of text,\n",
"# and using a LLM to formulate the final answer.\n",
"chain = RetrievalQA.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=retriever)\n",
"\n",
"response = chain(query)\n",
"print(response[\"result\"])"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}