Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Sleeping

File size: 10,657 Bytes

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create AI-Tutor vector database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import chromadb\n",
    "\n",
    "# create client and a new collection\n",
    "# chromadb.EphemeralClient saves data in-memory.\n",
    "chroma_client = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n",
    "chroma_collection = chroma_client.create_collection(\"ai-tutor-db\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
    "from llama_index.core import StorageContext\n",
    "\n",
    "# Define a storage context object using the created vector database.\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from llama_index.core.schema import TextNode\n",
    "\n",
    "\n",
    "def load_jsonl_create_nodes(filepath):\n",
    "    nodes = []  # List to hold the created node objects\n",
    "    with open(filepath, \"r\") as file:\n",
    "        for line in file:\n",
    "            # Load each line as a JSON object\n",
    "            json_obj = json.loads(line)\n",
    "            # Extract required information\n",
    "            title = json_obj.get(\"title\")\n",
    "            url = json_obj.get(\"url\")\n",
    "            content = json_obj.get(\"content\")\n",
    "            source = json_obj.get(\"source\")\n",
    "            # Create a TextNode object and append to the list\n",
    "            node = TextNode(\n",
    "                text=content,\n",
    "                metadata={\"title\": title, \"url\": url, \"source\": source},\n",
    "                excluded_embed_metadata_keys=[\"title\", \"url\", \"source\"],\n",
    "                excluded_llm_metadata_keys=[\"title\", \"url\", \"source\"],\n",
    "            )\n",
    "            nodes.append(node)\n",
    "    return nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filepath = \"../data/ai-tutor-csv-files/combined_data_lines.jsonl\"\n",
    "nodes = load_jsonl_create_nodes(filepath)\n",
    "\n",
    "print(f\"Loaded {len(nodes)} nodes/chunks from the JSONL file\\n \")\n",
    "\n",
    "node = nodes[0]\n",
    "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")\n",
    "\n",
    "print(\"\\n\")\n",
    "\n",
    "node = nodes[-10000]\n",
    "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Create the pipeline to apply the transformation on each chunk,\n",
    "# # and store the transformed text in the chroma vector store.\n",
    "# pipeline = IngestionPipeline(\n",
    "#     transformations=[\n",
    "#         text_splitter,\n",
    "#         QuestionsAnsweredExtractor(questions=3, llm=llm),\n",
    "#         SummaryExtractor(summaries=[\"prev\", \"self\"], llm=llm),\n",
    "#         KeywordExtractor(keywords=10, llm=llm),\n",
    "#         OpenAIEmbedding(),\n",
    "#     ],\n",
    "#     vector_store=vector_store\n",
    "# )\n",
    "\n",
    "# nodes = pipeline.run(documents=documents, show_progress=True);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.embeddings.openai import OpenAIEmbedding\n",
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "# embeds = OpenAIEmbedding(model=\"text-embedding-3-small\", mode=\"similarity\")\n",
    "# embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\")\n",
    "embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n",
    "# embeds = OpenAIEmbedding(model=\"text-embedding-ada-002\", mode=\"similarity\")\n",
    "\n",
    "# Build index / generate embeddings using OpenAI.\n",
    "index = VectorStoreIndex(nodes=nodes, show_progress=True, use_async=True, storage_context=storage_context, embed_model=embeds, insert_batch_size=3000,)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llms.openai import OpenAI\n",
    "\n",
    "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n",
    "query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = query_engine.query(\"What is the LLama model?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res.response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for src in res.source_nodes:\n",
    "  print(\"Node ID\\t\", src.node_id)\n",
    "  print(\"Title\\t\", src.metadata['title'])\n",
    "  print(\"Text\\t\", src.text)\n",
    "  print(\"Score\\t\", src.score)\n",
    "  print(\"Metadata\\t\", src.metadata) \n",
    "  print(\"-_\"*20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load DB from disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import chromadb\n",
    "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
    "# Create your index\n",
    "db2 = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n",
    "chroma_collection = db2.get_or_create_collection(\"ai-tutor-db\")\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create your index\n",
    "from llama_index.core import VectorStoreIndex\n",
    "index = VectorStoreIndex.from_vector_store(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.embeddings.openai import OpenAIEmbedding\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core.vector_stores import (\n",
    "    ExactMatchFilter,\n",
    "    MetadataFilters,\n",
    "    MetadataFilter,\n",
    "    FilterOperator,\n",
    "    FilterCondition,\n",
    ")\n",
    "\n",
    "\n",
    "filters = MetadataFilters(\n",
    "    filters=[\n",
    "        MetadataFilter(key=\"source\", value=\"lanchain_course\"),\n",
    "        MetadataFilter(key=\"source\", value=\"langchain_docs\"),\n",
    "    ],\n",
    "    condition=FilterCondition.OR,\n",
    ")\n",
    "\n",
    "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n",
    "embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n",
    "# query_engine = index.as_query_engine(\n",
    "#     llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True, streaming=True, filters=filters\n",
    "# )\n",
    "query_engine = index.as_query_engine(\n",
    "    llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = query_engine.query(\"What is the LLama model?\")\n",
    "\n",
    "# history = \"\"   \n",
    "# for token in res.response_gen:\n",
    "#     history += token\n",
    "#     print(history)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res.response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for src in res.source_nodes:\n",
    "  print(\"Node ID\\t\", src.node_id)\n",
    "  print(\"Source\\t\", src.metadata['source'])\n",
    "  print(\"Title\\t\", src.metadata['title'])\n",
    "  print(\"Text\\t\", src.text)\n",
    "  print(\"Score\\t\", src.score)\n",
    "  print(\"-_\"*20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Markdown, display\n",
    "# define prompt viewing function\n",
    "def display_prompt_dict(prompts_dict):\n",
    "    for k, p in prompts_dict.items():\n",
    "        text_md = f\"**Prompt Key**: {k}<br>\" f\"**Text:** <br>\"\n",
    "        display(Markdown(text_md))\n",
    "        print(p.get_template())\n",
    "        display(Markdown(\"<br><br>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prompts_dict = query_engine.get_prompts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display_prompt_dict(prompts_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}