{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Create AI-Tutor vector database" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n", "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nest_asyncio\n", "\n", "nest_asyncio.apply()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import chromadb\n", "\n", "# create client and a new collection\n", "# chromadb.EphemeralClient saves data in-memory.\n", "chroma_client = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n", "chroma_collection = chroma_client.create_collection(\"ai-tutor-db\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "from llama_index.core import StorageContext\n", "\n", "# Define a storage context object using the created vector database.\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "from llama_index.core.schema import TextNode\n", "\n", "\n", "def load_jsonl_create_nodes(filepath):\n", " nodes = [] # List to hold the created node objects\n", " with open(filepath, \"r\") as file:\n", " for line in file:\n", " # Load each line as a JSON object\n", " json_obj = json.loads(line)\n", " # Extract required information\n", " title = json_obj.get(\"title\")\n", " url = json_obj.get(\"url\")\n", " content = json_obj.get(\"content\")\n", " source = json_obj.get(\"source\")\n", " # Create a TextNode object and append to the list\n", " node = TextNode(\n", " text=content,\n", " metadata={\"title\": title, \"url\": url, \"source\": source},\n", " excluded_embed_metadata_keys=[\"title\", \"url\", \"source\"],\n", " excluded_llm_metadata_keys=[\"title\", \"url\", \"source\"],\n", " )\n", " nodes.append(node)\n", " return nodes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filepath = \"../data/ai-tutor-csv-files/combined_data_lines.jsonl\"\n", "nodes = load_jsonl_create_nodes(filepath)\n", "\n", "print(f\"Loaded {len(nodes)} nodes/chunks from the JSONL file\\n \")\n", "\n", "node = nodes[0]\n", "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")\n", "\n", "print(\"\\n\")\n", "\n", "node = nodes[-10000]\n", "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# # Create the pipeline to apply the transformation on each chunk,\n", "# # and store the transformed text in the chroma vector store.\n", "# pipeline = IngestionPipeline(\n", "# transformations=[\n", "# text_splitter,\n", "# QuestionsAnsweredExtractor(questions=3, llm=llm),\n", "# SummaryExtractor(summaries=[\"prev\", \"self\"], llm=llm),\n", "# KeywordExtractor(keywords=10, llm=llm),\n", "# OpenAIEmbedding(),\n", "# ],\n", "# vector_store=vector_store\n", "# )\n", "\n", "# nodes = pipeline.run(documents=documents, show_progress=True);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from llama_index.embeddings.openai import OpenAIEmbedding\n", "from llama_index.core import VectorStoreIndex\n", "\n", "# embeds = OpenAIEmbedding(model=\"text-embedding-3-small\", mode=\"similarity\")\n", "# embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\")\n", "embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n", "# embeds = OpenAIEmbedding(model=\"text-embedding-ada-002\", mode=\"similarity\")\n", "\n", "# Build index / generate embeddings using OpenAI.\n", "index = VectorStoreIndex(nodes=nodes, show_progress=True, use_async=True, storage_context=storage_context, embed_model=embeds, insert_batch_size=3000,)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from llama_index.llms.openai import OpenAI\n", "\n", "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n", "query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res = query_engine.query(\"What is the LLama model?\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res.response" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for src in res.source_nodes:\n", " print(\"Node ID\\t\", src.node_id)\n", " print(\"Title\\t\", src.metadata['title'])\n", " print(\"Text\\t\", src.text)\n", " print(\"Score\\t\", src.score)\n", " print(\"Metadata\\t\", src.metadata) \n", " print(\"-_\"*20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load DB from disk" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import chromadb\n", "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "# Create your index\n", "db2 = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n", "chroma_collection = db2.get_or_create_collection(\"ai-tutor-db\")\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create your index\n", "from llama_index.core import VectorStoreIndex\n", "index = VectorStoreIndex.from_vector_store(vector_store=vector_store)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from llama_index.embeddings.openai import OpenAIEmbedding\n", "from llama_index.llms.openai import OpenAI\n", "from llama_index.core.vector_stores import (\n", " ExactMatchFilter,\n", " MetadataFilters,\n", " MetadataFilter,\n", " FilterOperator,\n", " FilterCondition,\n", ")\n", "\n", "\n", "filters = MetadataFilters(\n", " filters=[\n", " MetadataFilter(key=\"source\", value=\"lanchain_course\"),\n", " MetadataFilter(key=\"source\", value=\"langchain_docs\"),\n", " ],\n", " condition=FilterCondition.OR,\n", ")\n", "\n", "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n", "embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n", "# query_engine = index.as_query_engine(\n", "# llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True, streaming=True, filters=filters\n", "# )\n", "query_engine = index.as_query_engine(\n", " llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res = query_engine.query(\"What is the LLama model?\")\n", "\n", "# history = \"\" \n", "# for token in res.response_gen:\n", "# history += token\n", "# print(history)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res.response" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for src in res.source_nodes:\n", " print(\"Node ID\\t\", src.node_id)\n", " print(\"Source\\t\", src.metadata['source'])\n", " print(\"Title\\t\", src.metadata['title'])\n", " print(\"Text\\t\", src.text)\n", " print(\"Score\\t\", src.score)\n", " print(\"-_\"*20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Markdown, display\n", "# define prompt viewing function\n", "def display_prompt_dict(prompts_dict):\n", " for k, p in prompts_dict.items():\n", " text_md = f\"**Prompt Key**: {k}
\" f\"**Text:**
\"\n", " display(Markdown(text_md))\n", " print(p.get_template())\n", " display(Markdown(\"

\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "prompts_dict = query_engine.get_prompts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "display_prompt_dict(prompts_dict)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 2 }