{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create AI-Tutor vector database"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nest_asyncio\n",
"\n",
"nest_asyncio.apply()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import chromadb\n",
"\n",
"# create client and a new collection\n",
"# chromadb.EphemeralClient saves data in-memory.\n",
"chroma_client = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n",
"chroma_collection = chroma_client.create_collection(\"ai-tutor-db\")"
]
},
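{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check: the freshly created collection should be empty before ingestion (`count()` is part of the standard Chroma collection API)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The new collection should report zero stored records at this point.\n",
"print(chroma_collection.count())"
]
},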
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
"from llama_index.core import StorageContext\n",
"\n",
"# Define a storage context object using the created vector database.\n",
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from llama_index.core.schema import TextNode\n",
"\n",
"\n",
"def load_jsonl_create_nodes(filepath):\n",
" nodes = [] # List to hold the created node objects\n",
" with open(filepath, \"r\") as file:\n",
" for line in file:\n",
" # Load each line as a JSON object\n",
" json_obj = json.loads(line)\n",
" # Extract required information\n",
" title = json_obj.get(\"title\")\n",
" url = json_obj.get(\"url\")\n",
" content = json_obj.get(\"content\")\n",
" source = json_obj.get(\"source\")\n",
" # Create a TextNode object and append to the list\n",
" node = TextNode(\n",
" text=content,\n",
" metadata={\"title\": title, \"url\": url, \"source\": source},\n",
" excluded_embed_metadata_keys=[\"title\", \"url\", \"source\"],\n",
" excluded_llm_metadata_keys=[\"title\", \"url\", \"source\"],\n",
" )\n",
" nodes.append(node)\n",
" return nodes"
]
},
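{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loader above assumes each JSONL line is an object with `title`, `url`, `content`, and `source` keys. Below is a minimal sketch with a synthetic record; the file name and field values are made up for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Hypothetical one-line JSONL file matching the schema the loader expects.\n",
"sample_record = {\n",
"    \"title\": \"Example article\",\n",
"    \"url\": \"https://example.com/article\",\n",
"    \"content\": \"Some chunked text that would be embedded.\",\n",
"    \"source\": \"example_source\",\n",
"}\n",
"\n",
"with open(\"sample_lines.jsonl\", \"w\") as f:\n",
"    f.write(json.dumps(sample_record) + \"\\n\")\n",
"\n",
"sample_nodes = load_jsonl_create_nodes(\"sample_lines.jsonl\")\n",
"print(sample_nodes[0].text, sample_nodes[0].metadata)"
]
},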
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filepath = \"../data/ai-tutor-csv-files/combined_data_lines.jsonl\"\n",
"nodes = load_jsonl_create_nodes(filepath)\n",
"\n",
"print(f\"Loaded {len(nodes)} nodes/chunks from the JSONL file\\n \")\n",
"\n",
"node = nodes[0]\n",
"print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")\n",
"\n",
"print(\"\\n\")\n",
"\n",
"node = nodes[-10000]\n",
"print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # Create the pipeline to apply the transformation on each chunk,\n",
"# # and store the transformed text in the chroma vector store.\n",
"# pipeline = IngestionPipeline(\n",
"# transformations=[\n",
"# text_splitter,\n",
"# QuestionsAnsweredExtractor(questions=3, llm=llm),\n",
"# SummaryExtractor(summaries=[\"prev\", \"self\"], llm=llm),\n",
"# KeywordExtractor(keywords=10, llm=llm),\n",
"# OpenAIEmbedding(),\n",
"# ],\n",
"# vector_store=vector_store\n",
"# )\n",
"\n",
"# nodes = pipeline.run(documents=documents, show_progress=True);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
"from llama_index.core import VectorStoreIndex\n",
"\n",
"# embeds = OpenAIEmbedding(model=\"text-embedding-3-small\", mode=\"similarity\")\n",
"# embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\")\n",
"embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n",
"# embeds = OpenAIEmbedding(model=\"text-embedding-ada-002\", mode=\"similarity\")\n",
"\n",
"# Build index / generate embeddings using OpenAI.\n",
"index = VectorStoreIndex(nodes=nodes, show_progress=True, use_async=True, storage_context=storage_context, embed_model=embeds, insert_batch_size=3000,)"
]
},
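{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check: each node should now correspond to one embedded record in Chroma, so the collection count should match the number of nodes loaded above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify the embeddings were persisted to the Chroma collection.\n",
"print(f\"Vectors in collection: {chroma_collection.count()} (expected: {len(nodes)})\")"
]
},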
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms.openai import OpenAI\n",
"\n",
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n",
"query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res = query_engine.query(\"What is the LLama model?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res.response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for src in res.source_nodes:\n",
" print(\"Node ID\\t\", src.node_id)\n",
" print(\"Title\\t\", src.metadata['title'])\n",
" print(\"Text\\t\", src.text)\n",
" print(\"Score\\t\", src.score)\n",
" print(\"Metadata\\t\", src.metadata) \n",
" print(\"-_\"*20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load DB from disk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import chromadb\n",
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
"# Create your index\n",
"db2 = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n",
"chroma_collection = db2.get_or_create_collection(\"ai-tutor-db\")\n",
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create your index\n",
"from llama_index.core import VectorStoreIndex\n",
"index = VectorStoreIndex.from_vector_store(vector_store=vector_store)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
"from llama_index.llms.openai import OpenAI\n",
"from llama_index.core.vector_stores import (\n",
" ExactMatchFilter,\n",
" MetadataFilters,\n",
" MetadataFilter,\n",
" FilterOperator,\n",
" FilterCondition,\n",
")\n",
"\n",
"\n",
"filters = MetadataFilters(\n",
" filters=[\n",
" MetadataFilter(key=\"source\", value=\"lanchain_course\"),\n",
" MetadataFilter(key=\"source\", value=\"langchain_docs\"),\n",
" ],\n",
" condition=FilterCondition.OR,\n",
")\n",
"\n",
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n",
"embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n",
"# query_engine = index.as_query_engine(\n",
"# llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True, streaming=True, filters=filters\n",
"# )\n",
"query_engine = index.as_query_engine(\n",
" llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True,\n",
")"
]
},
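{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of actually applying the filters defined above, mirroring the commented-out variant: pass `filters` to `as_query_engine` and check that every returned source comes from one of the two langchain collections. The query string is illustrative, and this assumes the indexed data contains nodes with those `source` values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filtered engine: retrieval is restricted to the two langchain sources.\n",
"filtered_engine = index.as_query_engine(\n",
"    llm=llm, similarity_top_k=5, embed_model=embeds, filters=filters\n",
")\n",
"filtered_res = filtered_engine.query(\"How do I add memory to a LangChain chain?\")\n",
"for src in filtered_res.source_nodes:\n",
"    print(src.metadata[\"source\"], src.score)"
]
},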
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res = query_engine.query(\"What is the LLama model?\")\n",
"\n",
"# history = \"\" \n",
"# for token in res.response_gen:\n",
"# history += token\n",
"# print(history)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res.response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for src in res.source_nodes:\n",
" print(\"Node ID\\t\", src.node_id)\n",
" print(\"Source\\t\", src.metadata['source'])\n",
" print(\"Title\\t\", src.metadata['title'])\n",
" print(\"Text\\t\", src.text)\n",
" print(\"Score\\t\", src.score)\n",
" print(\"-_\"*20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Markdown, display\n",
"# define prompt viewing function\n",
"def display_prompt_dict(prompts_dict):\n",
" for k, p in prompts_dict.items():\n",
" text_md = f\"**Prompt Key**: {k}
\" f\"**Text:**
\"\n",
" display(Markdown(text_md))\n",
" print(p.get_template())\n",
" display(Markdown(\"
\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts_dict = query_engine.get_prompts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display_prompt_dict(prompts_dict)"
]
},
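{
"cell_type": "markdown",
"metadata": {},
"source": [
"The templates shown above can also be overridden. Below is a minimal sketch using `update_prompts`, LlamaIndex's standard mechanism for swapping prompt templates; the prompt key matches the one printed by `display_prompt_dict`, and the template wording is illustrative rather than a tuned AI-Tutor prompt."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import PromptTemplate\n",
"\n",
"# Illustrative template only; adjust the wording for the AI-Tutor use case.\n",
"custom_qa_tmpl = PromptTemplate(\n",
"    \"Context information is below.\\n\"\n",
"    \"---------------------\\n\"\n",
"    \"{context_str}\\n\"\n",
"    \"---------------------\\n\"\n",
"    \"You are an AI tutor. Answer the query using only the context above.\\n\"\n",
"    \"Query: {query_str}\\n\"\n",
"    \"Answer: \"\n",
")\n",
"query_engine.update_prompts(\n",
"    {\"response_synthesizer:text_qa_template\": custom_qa_tmpl}\n",
")"
]
}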
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}