{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "\"Open\n" ] }, { "cell_type": "markdown", "metadata": { "id": "-zE1h0uQV7uT" }, "source": [ "# Install Packages and Setup Variables\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QPJzr-I9XQ7l", "outputId": "5d48c88b-a0a9-49ff-d788-e076d1cb4ead" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install -q llama-index==0.10.57 openai==1.37.0 cohere==5.6.2 tiktoken==0.7.0 chromadb==0.5.5 html2text sentence_transformers pydantic llama-index-vector-stores-chroma==0.1.10 kaleido==0.2.1 llama-index-llms-gemini==0.1.11" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "riuXwpSPcvWC" }, "outputs": [], "source": [ "import os\n", "\n", "# Set the following API Keys in the Python environment. 
Will be used later.\n", "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", "os.environ[\"GOOGLE_API_KEY\"] = \"\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "jIEeZzqLbz0J" }, "outputs": [], "source": [ "# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.\n", "import nest_asyncio\n", "\n", "nest_asyncio.apply()" ] }, { "cell_type": "markdown", "metadata": { "id": "Bkgi2OrYzF7q" }, "source": [ "# Load a Model\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "9oGT6crooSSj" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/fabio/Desktop/ai-tutor-rag-system/venv_ai_tutor/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "I0000 00:00:1723471002.830383 5318658 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache\n", "I0000 00:00:1723471002.837404 5318658 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported\n" ] } ], "source": [ "from llama_index.llms.gemini import Gemini\n", "\n", "llm = Gemini(model=\"models/gemini-1.5-flash\", temperature=1, max_tokens=512)" ] }, { "cell_type": "markdown", "metadata": { "id": "0BwVuJXlzHVL" }, "source": [ "# Create a VectoreStore\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "SQP87lHczHKc" }, "outputs": [], "source": [ "import chromadb\n", "from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction\n", "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "\n", "# create 
client and a new collection\n", "# chromadb.EphemeralClient saves data in-memory.\n", "chroma_client = chromadb.PersistentClient(path=\"./mini-llama-articles\")\n", "chroma_collection = chroma_client.get_or_create_collection(\n", " \"mini-llama-articles\",\n", " embedding_function=OpenAIEmbeddingFunction(api_key=os.environ[\"OPENAI_API_KEY\"], model_name=\"text-embedding-3-small\")\n", ")\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" ] }, { "cell_type": "markdown", "metadata": { "id": "I9JbAzFcjkpn" }, "source": [ "# Load the Dataset (CSV)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "ceveDuYdWCYk" }, "source": [ "## Download\n" ] }, { "cell_type": "markdown", "metadata": { "id": "eZwf6pv7WFmD" }, "source": [ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string.\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wl_pbPvMlv1h", "outputId": "a453b612-20a8-4396-d22b-b19d2bc47816" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "I0000 00:00:1723471003.927906 5318658 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 169k 100 169k 0 0 506k 0 --:--:-- --:--:-- --:--:-- 506k\n" ] } ], "source": [ "!curl -o ./mini-llama-articles.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv" ] }, { "cell_type": "markdown", "metadata": { "id": "VWBLtDbUWJfA" }, "source": [ "## Read File\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0Q9sxuW0g3Gd", "outputId": "49b27d8a-1f96-4e8d-fa0f-27afbf2c395c" }, "outputs": [ { "data": 
# %% Read the downloaded file, collecting one list per article row.
import csv

rows = []

# Load the file as CSV (the original comment said "JSON", which was wrong —
# the data downloaded above is comma-separated).
with open("./mini-llama-articles.csv", mode="r", encoding="utf-8") as file:
    csv_reader = csv.reader(file)

    for idx, row in enumerate(csv_reader):
        # Skip the header row; keep every data row.
        if idx == 0:
            continue
        rows.append(row)

# The number of articles (rows) in the dataset — not characters.
len(rows)

# %% Convert the rows to Document objects so the LlamaIndex framework can
# process them. Column layout: row[0]=title, row[1]=text, row[2]=url,
# row[3]=source_name.
from llama_index.core import Document

documents = [
    Document(
        text=row[1], metadata={"title": row[0], "url": row[2], "source_name": row[3]}
    )
    for row in rows
]
print(documents[0])

# %% Transforming: split each document into 512-token chunks that overlap
# by 128 tokens, so context is not lost at chunk boundaries.
from llama_index.core.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)
"Parsing nodes: 100%|██████████| 14/14 [00:00<00:00, 51.60it/s]\n", " 0%| | 0/108 [00:00