Omar Solano committed commit 84f8c13
Parent(s): 2b85cb1

update embedding model

Files changed:
- scripts/ai-tutor.ipynb (+115 -187)
- scripts/gradio-ui.py (+10 -4)
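
In short, this commit rebuilds the AI-tutor index and query engine on OpenAI's text-embedding-3-large model (in text_search mode) with a gpt-3.5-turbo-0125 LLM, and reuses the same configuration in the Gradio app. A minimal sketch of the new setup, assuming `nodes` and `storage_context` already exist as created in the notebook cells shown in the diff below:

from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# New embedding model introduced by this commit.
embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")

# Embed the nodes into the Chroma-backed index, then expose a query engine on top of it.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embeds,
    insert_batch_size=3000,
    show_progress=True,
    use_async=True,
)
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-0125", max_tokens=None)
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)
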
scripts/ai-tutor.ipynb
CHANGED
@@ -9,19 +9,19 @@
  9   },
 10   {
 11   "cell_type": "code",
-12   "execution_count":
 13   "metadata": {},
 14   "outputs": [],
 15   "source": [
 16   "import os\n",
 17   "\n",
 18   "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
-19   "os.environ[\"OPENAI_API_KEY\"] = \"sk
 20   ]
 21   },
 22   {
 23   "cell_type": "code",
-24   "execution_count":
 25   "metadata": {},
 26   "outputs": [],
 27   "source": [
@@ -32,18 +32,7 @@
 32   },
 33   {
 34   "cell_type": "code",
-35   "execution_count":
-36   "metadata": {},
-37   "outputs": [],
-38   "source": [
-39   "from llama_index.llms.openai import OpenAI\n",
-40   "\n",
-41   "llm = OpenAI(temperature=0.9, model=\"gpt-3.5-turbo\", max_tokens=512)"
-42   ]
-43   },
-44   {
-45   "cell_type": "code",
-46   "execution_count": 4,
 47   "metadata": {},
 48   "outputs": [],
 49   "source": [
@@ -57,7 +46,7 @@
 57   },
 58   {
 59   "cell_type": "code",
-60   "execution_count":
 61   "metadata": {},
 62   "outputs": [],
 63   "source": [
@@ -66,219 +55,121 @@
 66   "\n",
 67   "# Define a storage context object using the created vector database.\n",
 68   "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
-69   "storage_context = StorageContext.from_defaults(vector_store=vector_store)
-70   "\n"
 71   ]
 72   },
 73   {
 74   "cell_type": "code",
-75   "execution_count":
 76   "metadata": {},
 77   "outputs": [],
 78   "source": [
-79   "import
-80   "import csv\n",
 81   "from llama_index.core.schema import TextNode\n",
 82   "\n",
-83   "def load_csv_files_from_directory(directory):\n",
-84   "    nodes = []\n",
-85   "    node_count = 0\n",
-86   "\n",
-87   "    # Iterate over all files in the given directory\n",
-88   "    for filename in os.listdir(directory):\n",
-89   "        if filename.endswith(\".csv\"):\n",
-90   "            filepath = os.path.join(directory, filename)\n",
-91   "            with open(filepath, mode='r', encoding='utf-8') as file:\n",
-92   "                csv_reader = csv.reader(file)\n",
-93   "                headers = next(csv_reader, None) # Read the header row\n",
-94   " \n",
-95   "                # Dynamically determine the column indices\n",
-96   "                title_idx = headers.index('title') if 'title' in headers else None\n",
-97   "                url_idx = headers.index('url') if 'url' in headers else None\n",
-98   "                content_idx = headers.index('content') if 'content' in headers else None\n",
-99   "                source_idx = headers.index('source') if 'source' in headers else None\n",
-100  " \n",
-101  "                for row in csv_reader:\n",
-102  "                    if title_idx is not None and url_idx is not None and content_idx is not None and source_idx is not None:\n",
-103  "                        node_id = f\"node_{node_count}\"\n",
-104  "                        node = TextNode(\n",
-105  "                            text=row[content_idx],\n",
-106  "                            metadata={\n",
-107  "                                \"title\": row[title_idx],\n",
-108  "                                \"url\": row[url_idx],\n",
-109  "                                \"source\": row[source_idx]\n",
-110  "                            },\n",
-111  "                            id_=node_id\n",
-112  "                        )\n",
-113  "                        nodes.append(node)\n",
-114  "                        node_count += 1\n",
 115  "\n",
 116  "    return nodes"
 117  ]
 118  },
 119  {
 120  "cell_type": "code",
-121  "execution_count":
 122  "metadata": {},
-123  "outputs": [
-124  {
-125  "name": "stdout",
-126  "output_type": "stream",
-127  "text": [
-128  "ID: node_0 \n",
-129  "Text: # Introduction\n",
-130
"This lesson will explore the powerful concept of LangChain memory, which is designed to help chatbots maintain context and improve their conversational capabilities in more details. The traditional approach to chatbot development involves processing user prompts independently and without considering the history of interactions. This can lead to disjointed and unsatisfactory user experiences. LangChain provides memory components to manage and manipulate previous chat messages and incorporate them into chains. This is crucial for chatbots, which require remembering the prior interactions. ![ Image by Midjourney](Mastering%20Memory%20Types%20in%20LangChain%20A%20Comprehensiv%209a0515e0407345888439a8c036e47e43/membot.png) Image by Midjourney By default, LLMs are stateless, which means they process each incoming query in isolation, without considering previous interactions. To overcome this limitation, LangChain offers a standard interface for memory, a variety of memory implementations, and examples of chains and agents that employ memory. It also provides Agents that have access to a suite of Tools. Depending on the user’s input, an Agent can decide which Tools to use., \n",
-131
"Metadata: {'title': 'Mastering Memory Types in LangChain: A Comprehensive Guide with Practical Examples', 'url': 'https://learn.activeloop.ai/courses/take/langchain/multimedia/46318209-mastering-memory-types-in-langchain-a-comprehensive-guide-with-practical-examples', 'source': 'langchain_course'}\n",
-132  "ID: node_20677 \n",
-133  "Text: rue (to lift the ambiguity with a batch of sequences). add_special_tokens (bool, optional, defaults to True) —\n",
-134  "Whether or not to add special tokens when encoding the sequences. This will use the underlying\n",
-135  "PretrainedTokenizerBase.build_inputs_with_special_tokens function, which defines which tokens are\n",
-136  "automatically added to the input ids. This is usefull if you want to add bos or eos tokens\n",
-137  "automatically. padding (bool, str or PaddingStrategy, optional, defaults to False) —\n",
-138  "Activates and controls padding. Accepts the following values:\n",
-139  "True or 'longest': Pad to the longest sequence in the batch (or no padding if only a single\n",
-140  "sequence if provided).\n",
-141  "'max_length': Pad to a maximum length specified with the argument max_length or to the maximum\n",
-142  "acceptable input length for the model if that argument is not provided.\n",
-143  "False or 'do_not_pad' (default): No padding (i.e., can output a batch with sequences of different\n",
-144  "lengths).\n",
-145  " truncation (bool, str or Truncation, \n",
-146  "Metadata: {'title': 'PreTrainedTokenizerFast', 'url': 'https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast', 'source': 'hf_transformers'}\n"
-147  ]
-148  }
-149  ],
 150  "source": [
-151  "
-152  "nodes =
 153  "\n",
 154  "node = nodes[0]\n",
 155  "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")\n",
 156  "\n",
-157  "
 158  "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")"
 159  ]
 160  },
 161  {
 162  "cell_type": "code",
-163  "execution_count":
 164  "metadata": {},
-165  "outputs": [
-166  {
-167  "name": "stderr",
-168  "output_type": "stream",
-169  "text": [
-170  "/Users/omar/Documents/ai_repos/ai-tutor-rag-system/env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-171  " from .autonotebook import tqdm as notebook_tqdm\n",
-172  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 5.27it/s]\n",
-173  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.23it/s]\n",
-174  "Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.93it/s]\n",
-175  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.51it/s]\n",
-176  "Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.74it/s]\n",
-177  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.41it/s]\n",
-178  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 8.36it/s]\n",
-179  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.57it/s]\n",
-180  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.08it/s]\n",
-181  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.90it/s]\n",
-182  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 8.22it/s]\n",
-183  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.77it/s]\n",
-184  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.02it/s]\n",
-185  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 8.81it/s]\n",
-186  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.00it/s]\n",
-187  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.67it/s]\n",
-188  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.71it/s]\n",
-189  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.51it/s]\n",
-190  "Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.10it/s]\n",
-191  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.14it/s]\n",
-192  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.08it/s]\n",
-193  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.79it/s]\n",
-194  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.30it/s]\n",
-195  "Generating embeddings: 100%|██████████| 10/10 [00:02<00:00, 4.43it/s]\n",
-196  "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 5.92it/s]\n",
-197  "Generating embeddings: 100%|██████████| 7/7 [00:00<00:00, 8.70it/s]\n"
-198  ]
-199  }
-200  ],
 201  "source": [
 202  "from llama_index.embeddings.openai import OpenAIEmbedding\n",
 203  "from llama_index.core import VectorStoreIndex\n",
 204  "\n",
 205  "# Build index / generate embeddings using OpenAI.\n",
-206  "index = VectorStoreIndex(nodes=nodes, show_progress=True, use_async=True, storage_context=storage_context, embed_model=
 207  ]
 208  },
 209  {
 210  "cell_type": "code",
-211  "execution_count":
 212  "metadata": {},
 213  "outputs": [],
 214  "source": [
-215  "
 216  ]
 217  },
 218  {
 219  "cell_type": "code",
-220  "execution_count":
 221  "metadata": {},
 222  "outputs": [],
 223  "source": [
-224  "res = query_engine.query(\"
 225  ]
 226  },
 227  {
 228  "cell_type": "code",
-229  "execution_count":
 230  "metadata": {},
-231  "outputs": [
-232  {
-233  "data": {
-234  "text/plain": [
-235  "'I cannot provide an answer to the query as there is no relevant information or context provided about \"llama2 llm\" in the given text.'"
-236  ]
-237  },
-238  "execution_count": 32,
-239  "metadata": {},
-240  "output_type": "execute_result"
-241  }
-242  ],
 243  "source": [
 244  "res.response"
 245  ]
 246  },
 247  {
 248  "cell_type": "code",
-249  "execution_count":
 250  "metadata": {},
-251  "outputs": [
-252  {
-253  "name": "stdout",
-254  "output_type": "stream",
-255  "text": [
-256  "Node ID\t node_1708\n",
-257  "Title\t The Generative AI Revolution: Exploring the Current Landscape\n",
-258
"Text\t 1. OpenAI's GPT Models Notable Models Task specific models Find model information here: https://platform.openai.com/docs/models/gpt-3 Image & Audio Models OpenAI, the company behind the GPT models, is an AI research and deployment company. The San Francisco-based lab was founded in 2015 as a nonprofit with the goal of building \"artificial general intelligence\" (AGI), which is essentially software as smart as humans. OpenAI conducts innovative research in various fields of AI, such as deep learning, natural language processing, computer vision, and robotics, and develops AI technologies and products intended to solve real-world problems. OpenAI transitioned into a for-profit company in 2019. The company plans to cap the profit of the investors at a fixed multiple of their investment (noted by Sam Altman as currently ranging between 7x and 100x depending on the investment round date and risk). As per the WSJ OpenAI was initially funded by $130m of charity funding (Elon Musk tweeted he contributed $100m) and has since raised at least $13bn led by Microsoft (where OpenAI makes use of Azure cloud credits). With the Microsoft partnership, OpenAI's ChatGPT, along with Microsoft's own search AI, created an improved version of Bing and transformed Microsoft's Office productivity apps. In 2019, OpenAI released GPT-2, a model that could generate realistic human-like text in entire paragraphs with internal consistency, unlike any of the previous models. The next generation, GPT-3, launched in 2020, was trained with 175 billion parameters. GPT-3 is a multi-purpose language tool that users can access without requiring them to learn a programming language or other computer tools. In November 2022, OpenAI released ChatGPT, which is a superior version of the company's earlier text generation models with the capability to generate humanlike prose. After the success of ChatGPT (GPT 3.5), Open AI released GPT-4 in March 2023, which has multimodal capabilities. The model processes both image and text inputs for text generation. The model has a maximum token count of 32,768 capable of generating around 25,000 words as compared to GPT-3.5 which has 4,096 tokens context size. GPT-4 produces 40% more factual responses and its response rate for disallowed content is down by 82% as compared to previous models. (reported by OpenAI) \n",
-259  "Score\t 0.7294525989858827\n",
-260  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
-261  "Node ID\t node_19679\n",
-262  "Title\t TFBartForConditionalGeneration\n",
-263  "Text\t ach tensor of shape (2, batch_size, num_heads, sequence_length, embed_size_per_head)).\n",
-264  "Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be\n",
-265  "used (see past_key_values input) to speed up sequential decoding.\n",
-266  "decoder_hidden_states (tuple(tf.Tensor), optional, returned when output_hidden_states=True is passed or when config.output_hidden_states=True) — Tuple of tf.Tensor (one for the output of the embeddings + one for the output of each layer) of shape\n",
-267  "(batch_size, sequence_length, hidden_size).\n",
-268  "Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.\n",
-269  "decoder_attentions (tuple(tf.Tensor), optional, returned when output_attentions=True is passed or when config.output_attentions=True) — Tuple of tf.Tensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).\n",
-270  "Attentions weights of the decoder, after the attention softmax, used to com\n",
-271  "Score\t 0.7243357660195968\n",
-272  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
-273  ]
-274  }
-275  ],
 276  "source": [
 277  "for src in res.source_nodes:\n",
 278  "    print(\"Node ID\\t\", src.node_id)\n",
 279  "    print(\"Title\\t\", src.metadata['title'])\n",
 280  "    print(\"Text\\t\", src.text)\n",
 281  "    print(\"Score\\t\", src.score)\n",
 282  "    print(\"-_\"*20)"
 283  ]
 284  },
@@ -291,21 +182,21 @@
 291  },
 292  {
 293  "cell_type": "code",
-294  "execution_count":
 295  "metadata": {},
 296  "outputs": [],
 297  "source": [
 298  "import chromadb\n",
 299  "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
 300  "# Create your index\n",
-301  "db2 = chromadb.PersistentClient(path=\"ai-tutor-db\")\n",
 302  "chroma_collection = db2.get_or_create_collection(\"ai-tutor-db\")\n",
 303  "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
 304  ]
 305  },
 306  {
 307  "cell_type": "code",
-308  "execution_count":
 309  "metadata": {},
 310  "outputs": [],
 311  "source": [
@@ -316,34 +207,39 @@
 316  },
 317  {
 318  "cell_type": "code",
-319  "execution_count":
 320  "metadata": {},
 321  "outputs": [],
 322  "source": [
-323  "
 324  ]
 325  },
 326  {
 327  "cell_type": "code",
-328  "execution_count":
 329  "metadata": {},
 330  "outputs": [],
 331  "source": [
-332  "res = query_engine.query(\"
 333  ]
 334  },
 335  {
 336  "cell_type": "code",
-337  "execution_count":
 338  "metadata": {},
 339  "outputs": [
 340  {
 341  "data": {
 342  "text/plain": [
-343  "'The
 344  ]
 345  },
-346  "execution_count":
 347  "metadata": {},
 348  "output_type": "execute_result"
 349  }
@@ -354,26 +250,58 @@
 354  },
 355  {
 356  "cell_type": "code",
-357  "execution_count":
 358  "metadata": {},
 359  "outputs": [
 360  {
 361  "name": "stdout",
 362  "output_type": "stream",
 363  "text": [
-364  "Node ID\t
 365  "Source\t towards_ai\n",
-366  "Title\t
-367  "Text\t
-368  "Score\t 0.
 369  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
-370  "Node ID\t
 371  "Source\t hf_transformers\n",
-372  "Title\t
-373  "Text\t
-374  "
-375  "
-376  "
 377  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
 378  ]
 379  }
@@ -419,7 +347,7 @@
 419  "name": "python",
 420  "nbconvert_exporter": "python",
 421  "pygments_lexer": "ipython3",
-422  "version": "3.11.
 423  }
 424  },
 425  "nbformat": 4,
  9   },
 10   {
 11   "cell_type": "code",
+12   "execution_count": null,
 13   "metadata": {},
 14   "outputs": [],
 15   "source": [
 16   "import os\n",
 17   "\n",
 18   "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
+19   "os.environ[\"OPENAI_API_KEY\"] = \"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\""
 20   ]
 21   },
 22   {
 23   "cell_type": "code",
+24   "execution_count": null,
 25   "metadata": {},
 26   "outputs": [],
 27   "source": [
 32   },
 33   {
 34   "cell_type": "code",
+35   "execution_count": null,
 36   "metadata": {},
 37   "outputs": [],
 38   "source": [
 46   },
 47   {
 48   "cell_type": "code",
+49   "execution_count": null,
 50   "metadata": {},
 51   "outputs": [],
 52   "source": [
 55   "\n",
 56   "# Define a storage context object using the created vector database.\n",
 57   "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
+58   "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
 59   ]
 60   },
 61   {
 62   "cell_type": "code",
+63   "execution_count": null,
 64   "metadata": {},
 65   "outputs": [],
 66   "source": [
+67   "import json\n",
 68   "from llama_index.core.schema import TextNode\n",
 69   "\n",
 70   "\n",
+71   "def load_jsonl_create_nodes(filepath):\n",
+72   "    nodes = []  # List to hold the created node objects\n",
+73   "    with open(filepath, \"r\") as file:\n",
+74   "        for line in file:\n",
+75   "            # Load each line as a JSON object\n",
+76   "            json_obj = json.loads(line)\n",
+77   "            # Extract required information\n",
+78   "            title = json_obj.get(\"title\")\n",
+79   "            url = json_obj.get(\"url\")\n",
+80   "            content = json_obj.get(\"content\")\n",
+81   "            source = json_obj.get(\"source\")\n",
+82   "            # Create a TextNode object and append to the list\n",
+83   "            node = TextNode(\n",
+84   "                text=content,\n",
+85   "                metadata={\"title\": title, \"url\": url, \"source\": source},\n",
+86   "                excluded_embed_metadata_keys=[\"title\", \"url\", \"source\"],\n",
+87   "                excluded_llm_metadata_keys=[\"title\", \"url\", \"source\"],\n",
+88   "            )\n",
+89   "            nodes.append(node)\n",
 90   "    return nodes"
 91   ]
 92   },
 93   {
 94   "cell_type": "code",
+95   "execution_count": null,
 96   "metadata": {},
+97   "outputs": [],
 98   "source": [
+99   "filepath = \"../data/ai-tutor-csv-files/combined_data_lines.jsonl\"\n",
+100  "nodes = load_jsonl_create_nodes(filepath)\n",
+101  "\n",
+102  "print(f\"Loaded {len(nodes)} nodes/chunks from the JSONL file\\n \")\n",
 103  "\n",
 104  "node = nodes[0]\n",
 105  "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")\n",
 106  "\n",
+107  "print(\"\\n\")\n",
+108  "\n",
+109  "node = nodes[-10000]\n",
 110  "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")"
 111  ]
 112  },
 113  {
 114  "cell_type": "code",
+115  "execution_count": null,
 116  "metadata": {},
+117  "outputs": [],
 118  "source": [
 119  "from llama_index.embeddings.openai import OpenAIEmbedding\n",
 120  "from llama_index.core import VectorStoreIndex\n",
 121  "\n",
+122  "# embeds = OpenAIEmbedding(model=\"text-embedding-3-small\", mode=\"similarity\")\n",
+123  "# embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\")\n",
+124  "embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n",
+125  "# embeds = OpenAIEmbedding(model=\"text-embedding-ada-002\", mode=\"similarity\")\n",
+126  "\n",
 127  "# Build index / generate embeddings using OpenAI.\n",
+128  "index = VectorStoreIndex(nodes=nodes, show_progress=True, use_async=True, storage_context=storage_context, embed_model=embeds, insert_batch_size=3000,)"
 129  ]
 130  },
 131  {
 132  "cell_type": "code",
+133  "execution_count": null,
 134  "metadata": {},
 135  "outputs": [],
 136  "source": [
+137  "from llama_index.llms.openai import OpenAI\n",
+138  "\n",
+139  "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n",
+140  "query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)"
 141  ]
 142  },
 143  {
 144  "cell_type": "code",
+145  "execution_count": null,
 146  "metadata": {},
 147  "outputs": [],
 148  "source": [
+149  "res = query_engine.query(\"What is the LLama model?\")"
 150  ]
 151  },
 152  {
 153  "cell_type": "code",
+154  "execution_count": null,
 155  "metadata": {},
+156  "outputs": [],
 157  "source": [
 158  "res.response"
 159  ]
 160  },
 161  {
 162  "cell_type": "code",
+163  "execution_count": null,
 164  "metadata": {},
+165  "outputs": [],
 166  "source": [
 167  "for src in res.source_nodes:\n",
 168  "    print(\"Node ID\\t\", src.node_id)\n",
 169  "    print(\"Title\\t\", src.metadata['title'])\n",
 170  "    print(\"Text\\t\", src.text)\n",
 171  "    print(\"Score\\t\", src.score)\n",
+172  "    print(\"Metadata\\t\", src.metadata)\n",
 173  "    print(\"-_\"*20)"
 174  ]
 175  },
 182  },
 183  {
 184  "cell_type": "code",
+185  "execution_count": 1,
 186  "metadata": {},
 187  "outputs": [],
 188  "source": [
 189  "import chromadb\n",
 190  "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
 191  "# Create your index\n",
+192  "db2 = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n",
 193  "chroma_collection = db2.get_or_create_collection(\"ai-tutor-db\")\n",
 194  "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
 195  ]
 196  },
 197  {
 198  "cell_type": "code",
+199  "execution_count": 2,
 200  "metadata": {},
 201  "outputs": [],
 202  "source": [
 207  },
 208  {
 209  "cell_type": "code",
+210  "execution_count": 8,
 211  "metadata": {},
 212  "outputs": [],
 213  "source": [
+214  "from llama_index.embeddings.openai import OpenAIEmbedding\n",
+215  "from llama_index.llms.openai import OpenAI\n",
+216  "\n",
+217  "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n",
+218  "embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n",
+219  "query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)"
 220  ]
 221  },
 222  {
 223  "cell_type": "code",
+224  "execution_count": 11,
 225  "metadata": {},
 226  "outputs": [],
 227  "source": [
+228  "res = query_engine.query(\"What is the LLama model?\")"
 229  ]
 230  },
 231  {
 232  "cell_type": "code",
+233  "execution_count": 12,
 234  "metadata": {},
 235  "outputs": [
 236  {
 237  "data": {
 238  "text/plain": [
+239  "'The Llama model is a new family of pre-trained and finetuned models released by Meta in mid-July. It includes different sizes such as 7B, 13B, and 70B, with corresponding papers describing their characteristics and learning process. The models are based on the standard transformer architecture and utilize techniques like RMSNorm normalization, SwiGLU activation, and rotatory positional embedding. The 70B model specifically applies the grouped-query attention (GQA) technique to speed up inference.'"
 240  ]
 241  },
+242  "execution_count": 12,
 243  "metadata": {},
 244  "output_type": "execute_result"
 245  }
 250  },
 251  {
 252  "cell_type": "code",
+253  "execution_count": 13,
 254  "metadata": {},
 255  "outputs": [
 256  {
 257  "name": "stdout",
 258  "output_type": "stream",
 259  "text": [
+260  "Node ID\t 7307e8a4-c4bd-4992-a68c-5230340f01c7\n",
+261  "Source\t hf_transformers\n",
+262  "Title\t Train\n",
+263  "Text\t ged” arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole\n",
+264  "dataset. That’s going to make your array even bigger, and all those padding tokens will slow down training too! Loading data as a tf.data.Dataset If you want to avoid slowing down training, you can load your data as a tf.data.Dataset instead. Although you can write your own\n",
+265  "tf.data pipeline if you want, we have two convenience methods for doing this: prepare_tf_dataset(): This is the method we recommend in most cases. Because it is a method\n",
+266  "on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and\n",
+267  "discard the others to make a simpler, more performant dataset. to_tf_dataset: This method is more low-level, and is useful when you want to exactly control how\n",
+268  "your dataset is created, by specifying exactly which columns and label_cols to include. Before you can use prepare_tf_dataset(), you will need to add the\n",
+269  "Score\t 0.5175680124550022\n",
+270  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
+271  "Node ID\t 346a1018-8b33-4d83-b78f-2ba1b94f5e3b\n",
+272  "Source\t openai\n",
+273  "Title\t Researcher Access Program\n",
+274
"Text\t There are a number of research directions we are excited to explore with the OpenAI API. If you are interested in the opportunity for subsidized access, please provide us with details about your research use case on the Researcher Access Program application.In particular, we consider the following to be especially important directions, though you are free to craft your own direction:Alignment: How can we understand what objective, if any, a model is best understood as pursuing? How do we increase the extent to which that objective is aligned with human preferences, such as via prompt design or fine-tuning?Fairness and representation: How should performance criteria be established for fairness and representation in language models? How can language models be improved in order to effectively support the goals of fairness and representation in specific, deployed contexts?Interdisciplinary research: How can AI development draw on insights from other disciplines such as philosophy, cognitive science, and sociolinguistics?Interpretability and transparency: How do these models work, mechanistically? Can we identify what concepts they're using, or extract latent knowledge from the model, make inferences about the training procedure, or predict surprising future behavior?Misuse potential: How can systems like the API be misused? What sorts of 'red teaming' approaches can we develop to help us and other AI developers think about responsibly deploying technologies like this?Model exploration: Models like those served by the API have a variety of capabilities which we have yet to explore. We're excited by investigations in many areas including model limitations, linguistic properties, commonsense reasoning, and potential uses for many other problems.Robustness: Generative models have uneven capability surfaces, with the potential for surprisingly strong and surprisingly weak areas of capability. How robust are large generative models to 'natural' perturbations in the prompt, such as phrasing the same idea in different ways or with or without typos? Can we predict the kinds of domains and tasks for which large generative models are more likely to be robust (or not robust), and how does this relate to the training data? Are there techniques we can use to predict and mitigate worst-case behavior? How can robustness be measured in the context of few-shot learning (e.g., across variations in prompts)? Can we train models so that they satisfy safety properties with a very high level of reliability, even under adversarial inputs?Please note that due to a high volume of requests, it takes time for us to review these applications and not all research will be prioritized for subsidy. We will only be in touch if your application is selected for subsidy.\n",
+275  "Score\t 0.5129222370072439\n",
+276  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
+277  "Node ID\t ff0f2362-ddf7-4116-ac38-465dae37886a\n",
 278  "Source\t towards_ai\n",
+279  "Title\t Fine-Tuning a Llama-2 7B Model for Python Code Generation\n",
+280
"Text\t New Llama-2 model In mid-July, Meta released its new family of pre-trained and finetuned models called Llama-2, with an open source and commercial character to facilitate its use and expansion. The base model was released with a chat version and sizes 7B, 13B, and 70B. Together with the models, the corresponding papers were published describing their characteristics and relevant points of the learning process, which provide very interesting information on the subject. For pre-training, 40% more tokens were used, reaching 2T, the context length was doubled and the grouped-query attention (GQA) technique was applied to speed up inference on the heavier 70B model. On the standard transformer architecture, RMSNorm normalization, SwiGLU activation, and rotatory positional embedding are used, the context length reaches 4096 tokens, and an Adam optimizer is applied with a cosine learning rate schedule, a weight decay of 0.1 and gradient clipping. \n",
+281  "Score\t 0.49847282129286796\n",
+282  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
+283  "Node ID\t b0449da9-480c-48ea-80a3-35cfd84dbb48\n",
+284  "Source\t hf_transformers\n",
+285  "Title\t LayoutLMv2Tokenizer\n",
+286
"Text\t class transformers.LayoutLMv2Tokenizer < source > ( vocab_file do_lower_case = True do_basic_tokenize = True never_split = None unk_token = '[UNK]' sep_token = '[SEP]' pad_token = '[PAD]' cls_token = '[CLS]' mask_token = '[MASK]' cls_token_box = [0, 0, 0, 0] sep_token_box = [1000, 1000, 1000, 1000] pad_token_box = [0, 0, 0, 0] pad_token_label = -100 only_label_first_subword = True tokenize_chinese_chars = True strip_accents = None model_max_length: int = 512 additional_special_tokens: typing.Optional[typing.List[str]] = None **kwargs ) Construct a LayoutLMv2 tokenizer. Based on WordPiece. LayoutLMv2Tokenizer can be used to turn words, word-level\n",
+287  "bounding boxes and optional word labels to token-level input_ids, attention_mask, token_type_ids, bbox, and\n",
+288  "optional labels (for token classification). This tokenizer inherits from PreTrainedTokenizer which contains most of the main methods. Users should refer to\n",
+289  "this superclass for more information regarding those methods. LayoutLM\n",
+290  "Score\t 0.488783381968426\n",
 291  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
+292  "Node ID\t bdb45412-1d60-4c22-9de7-a3a469ac675a\n",
 293  "Source\t hf_transformers\n",
+294  "Title\t Train\n",
+295  "Text\t tokenizer outputs to your dataset as columns, as shown in\n",
+296  "the following code sample: Copied def tokenize_dataset(data):\n",
+297  " # Keys of the returned dictionary will be added to the dataset as columns\n",
+298  " return tokenizer(data[\"text\"])\n",
+299  "dataset = dataset.map(tokenize_dataset) Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! Once the\n",
+300  "columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly\n",
+301  "reduces the number of padding tokens compared to padding the entire dataset. Copied >>> tf_dataset = model.prepare_tf_dataset(dataset[\"train\"], batch_size=16, shuffle=True, tokenizer=tokenizer) Note that in the code sample above, you need to pass the tokenizer to prepare_tf_dataset so it can correctly pad batches as they’re loaded.\n",
+302  "If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument.\n",
+303  "If you need to do something mor\n",
+304  "Score\t 0.4819307254673903\n",
 305  "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
 306  ]
 307  }
 347  "name": "python",
 348  "nbconvert_exporter": "python",
 349  "pygments_lexer": "ipython3",
+350  "version": "3.11.8"
 351  }
 352  },
 353  "nbformat": 4,
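
Because the index is persisted to the local "ai-tutor-db" Chroma collection, it can be reloaded later without re-embedding, which is exactly what scripts/gradio-ui.py does below. The embed_model used at query time must match the model used at build time (here text-embedding-3-large in text_search mode). A minimal reload sketch, assuming the collection was populated by the notebook cells above:

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Open the existing Chroma collection and wrap it as a LlamaIndex vector store.
db2 = chromadb.PersistentClient(path="./ai-tutor-db")
chroma_collection = db2.get_or_create_collection("ai-tutor-db")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Same embedding model and LLM as at build time.
embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-0125", max_tokens=None)
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)
res = query_engine.query("What is the LLama model?")
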
scripts/gradio-ui.py
CHANGED
@@ -6,6 +6,8 @@ from datetime import datetime
  6   import chromadb
  7   from llama_index.vector_stores.chroma import ChromaVectorStore
  8   from llama_index.core import VectorStoreIndex
  9   import gradio as gr
 10   from gradio.themes.utils import (
 11       fonts,
@@ -51,12 +53,16 @@ mongo_db = (
 51       else logger.warning("No mongodb uri found, you will not be able to save data.")
 52   )
 53
-54   # Initialize
 55   db2 = chromadb.PersistentClient(path="scripts/ai-tutor-db")
 56   chroma_collection = db2.get_or_create_collection("ai-tutor-db")
 57   vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
 58   index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
-59
 60
 61
 62   AVAILABLE_SOURCES_UI = [
@@ -148,7 +154,7 @@ def format_sources(completion) -> str:
 148          "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
 149      )
 150      document_template: str = (
-151         "[🔗 {source}: {title}]({url}), relevance: {score:2.1f}
 152      )
 153
 154      documents = "\n".join(
@@ -157,7 +163,7 @@ def format_sources(completion) -> str:
 157                  title=src.metadata["title"],
 158                  score=src.score,
 159                  source=display_source_to_ui.get(
-160                      src.metadata["
 161                  ),
 162                  url=src.metadata["url"],
 163              )
  6   import chromadb
  7   from llama_index.vector_stores.chroma import ChromaVectorStore
  8   from llama_index.core import VectorStoreIndex
+ 9   from llama_index.embeddings.openai import OpenAIEmbedding
+10   from llama_index.llms.openai import OpenAI
 11   import gradio as gr
 12   from gradio.themes.utils import (
 13       fonts,
 53       else logger.warning("No mongodb uri found, you will not be able to save data.")
 54   )
 55
+56   # Initialize vector store and index
 57   db2 = chromadb.PersistentClient(path="scripts/ai-tutor-db")
 58   chroma_collection = db2.get_or_create_collection("ai-tutor-db")
 59   vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
 60   index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
+61
+62   # Initialize query engine
+63   llm = OpenAI(temperature=0, model="gpt-3.5-turbo-0125", max_tokens=None)
+64   embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
+65   query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)
 66
 67
 68   AVAILABLE_SOURCES_UI = [
 154          "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
 155      )
 156      document_template: str = (
+157         "[🔗 {source}: {title}]({url}), relevance: {score:2.1f}"  # Adjusted to include URL and format score as relevance
 158      )
 159
 160      documents = "\n".join(
 163                  title=src.metadata["title"],
 164                  score=src.score,
 165                  source=display_source_to_ui.get(
+166                      src.metadata["source"], src.metadata["source"]
 167                  ),
 168                  url=src.metadata["url"],
 169              )
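
For reference, the updated document_template renders each retrieved source as a Markdown link followed by its relevance score. A small illustrative example of how the template behaves (the URL value below is hypothetical, the other values are taken from the notebook output above):

document_template = "[🔗 {source}: {title}]({url}), relevance: {score:2.1f}"
line = document_template.format(
    source="towards_ai",
    title="Fine-Tuning a Llama-2 7B Model for Python Code Generation",
    url="https://towardsai.net/example",  # hypothetical URL for illustration
    score=0.498,
)
# line == "[🔗 towards_ai: Fine-Tuning a Llama-2 7B Model for Python Code Generation](https://towardsai.net/example), relevance: 0.5"
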