Omar Solano committed on
Commit · 0769f39
1 Parent(s): 7e6a905
update gradio-ui
Browse files
- README.md +1 -1
- requirements.txt +15 -204
- scripts/create_db.ipynb +723 -142
- scripts/gradio-ui.py +117 -127
- scripts/tutor_prompts.py +42 -19
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🧑🏻‍🏫
 colorFrom: gray
 colorTo: pink
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.38.1
 app_file: scripts/gradio-ui.py
 pinned: false
 ---
requirements.txt
CHANGED
@@ -1,204 +1,15 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-build==1.2.1
-cachetools==5.3.3
-certifi==2024.2.2
-charset-normalizer==3.3.2
-chroma-hnswlib==0.7.3
-chromadb==0.5.0
-click==8.1.7
-cohere==5.5.0
-coloredlogs==15.0.1
-comm==0.2.2
-contourpy==1.2.1
-cycler==0.12.1
-dataclasses-json==0.6.6
-debugpy==1.8.1
-decorator==5.1.1
-deprecated==1.2.14
-dirtyjson==1.0.8
-distro==1.9.0
-dnspython==2.6.1
-docstring-parser==0.16
-email-validator==2.1.1
-executing==2.0.1
-fastapi==0.111.0
-fastapi-cli==0.0.3
-fastavro==1.9.4
-ffmpy==0.3.2
-filelock==3.14.0
-flatbuffers==24.3.25
-fonttools==4.51.0
-frozenlist==1.4.1
-fsspec==2024.5.0
-google-auth==2.29.0
-googleapis-common-protos==1.63.0
-gradio==4.31.3
-gradio-client==0.16.3
-greenlet==3.0.3
-grpcio==1.63.0
-h11==0.14.0
-html2text==2024.2.26
-httpcore==1.0.5
-httptools==0.6.1
-httpx==0.27.0
-httpx-sse==0.4.0
-huggingface-hub==0.23.0
-humanfriendly==10.0
-idna==3.7
-importlib-metadata==7.0.0
-importlib-resources==6.4.0
-instructor==1.2.6
-ipykernel==6.29.4
-ipython==8.24.0
-jedi==0.19.1
-jinja2==3.1.4
-jmespath==1.0.1
-joblib==1.4.2
-jsonschema==4.22.0
-jsonschema-specifications==2023.12.1
-jupyter-client==8.6.1
-jupyter-core==5.7.2
-kaleido==0.2.1
-kiwisolver==1.4.5
-kubernetes==29.0.0
-llama-index==0.10.37
-llama-index-agent-openai==0.2.5
-llama-index-cli==0.1.12
-llama-index-core==0.10.36
-llama-index-embeddings-openai==0.1.9
-llama-index-indices-managed-llama-cloud==0.1.6
-llama-index-legacy==0.9.48
-llama-index-llms-openai==0.1.19
-llama-index-multi-modal-llms-openai==0.1.6
-llama-index-program-openai==0.1.6
-llama-index-question-gen-openai==0.1.3
-llama-index-readers-file==0.1.22
-llama-index-readers-llama-parse==0.1.4
-llama-index-vector-stores-chroma==0.1.8
-llama-parse==0.4.3
-llamaindex-py-client==0.1.19
-markdown-it-py==3.0.0
-markupsafe==2.1.5
-marshmallow==3.21.2
-matplotlib==3.9.0
-matplotlib-inline==0.1.7
-mdurl==0.1.2
-mmh3==4.1.0
-monotonic==1.6
-mpmath==1.3.0
-multidict==6.0.5
-mypy-extensions==1.0.0
-nest-asyncio==1.6.0
-networkx==3.3
-nltk==3.8.1
-numpy==1.26.4
-oauthlib==3.2.2
-onnxruntime==1.17.3
-openai==1.30.1
-opentelemetry-api==1.24.0
-opentelemetry-exporter-otlp-proto-common==1.24.0
-opentelemetry-exporter-otlp-proto-grpc==1.24.0
-opentelemetry-instrumentation==0.45b0
-opentelemetry-instrumentation-asgi==0.45b0
-opentelemetry-instrumentation-fastapi==0.45b0
-opentelemetry-proto==1.24.0
-opentelemetry-sdk==1.24.0
-opentelemetry-semantic-conventions==0.45b0
-opentelemetry-util-http==0.45b0
-orjson==3.10.3
-overrides==7.7.0
-packaging==24.0
-pandas==2.2.2
-parso==0.8.4
-pexpect==4.9.0
-pillow==10.3.0
-platformdirs==4.2.2
-posthog==3.5.0
-prompt-toolkit==3.0.43
-protobuf==4.25.3
-psutil==5.9.8
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pyarrow==16.1.0
-pyasn1==0.6.0
-pyasn1-modules==0.4.0
-pydantic==2.7.1
-pydantic-core==2.18.2
-pydub==0.25.1
-pygments==2.18.0
-pymongo==4.7.2
-pyparsing==3.1.2
-pypdf==4.2.0
-pypika==0.48.9
-pyproject-hooks==1.1.0
-python-dateutil==2.9.0.post0
-python-dotenv==1.0.1
-python-multipart==0.0.9
-pytz==2024.1
-pyyaml==6.0.1
-pyzmq==26.0.3
-referencing==0.35.1
-regex==2024.5.15
-requests==2.31.0
-requests-oauthlib==2.0.0
-rich==13.7.1
-rpds-py==0.18.1
-rsa==4.9
-ruff==0.4.4
-s3transfer==0.10.1
-safetensors==0.4.3
-scikit-learn==1.4.2
-scipy==1.13.0
-semantic-version==2.10.0
-sentence-transformers==2.7.0
-setuptools==69.5.1
-shellingham==1.5.4
-six==1.16.0
-sniffio==1.3.1
-soupsieve==2.5
-sqlalchemy==2.0.30
-stack-data==0.6.3
-starlette==0.37.2
-striprtf==0.0.26
-sympy==1.12
-tenacity==8.3.0
-threadpoolctl==3.5.0
-tiktoken==0.7.0
-tokenizers==0.19.1
-tomlkit==0.12.0
-toolz==0.12.1
-torch==2.3.0
-tornado==6.4
-tqdm==4.66.4
-traitlets==5.14.3
-transformers==4.40.2
-typer==0.12.3
-types-requests==2.31.0.20240406
-typing-extensions==4.11.0
-typing-inspect==0.9.0
-tzdata==2024.1
-ujson==5.10.0
-urllib3==2.2.1
-uvicorn==0.29.0
-uvloop==0.19.0
-watchfiles==0.21.0
-wcwidth==0.2.13
-websocket-client==1.8.0
-websockets==11.0.3
-wrapt==1.16.0
-yarl==1.9.4
-zipp==3.18.2
+openai
+chromadb
+tiktoken
+ipykernel
+google-cloud-aiplatform
+google-generativeai
+llama-index-vector-stores-chroma
+llama-index
+llama-index-llms-vertex
+llama-index-llms-gemini
+langchain
+langchain-chroma
+langchain_openai
+gradio
+instructor
scripts/create_db.ipynb
CHANGED
@@ -9,9 +9,20 @@
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
-
"execution_count":
|
13 |
"metadata": {},
|
14 |
-
"outputs": [
|
15 |
"source": [
|
16 |
"from dotenv import load_dotenv\n",
|
17 |
"\n",
|
@@ -20,7 +31,7 @@
|
|
20 |
},
|
21 |
{
|
22 |
"cell_type": "code",
|
23 |
-
"execution_count":
|
24 |
"metadata": {},
|
25 |
"outputs": [],
|
26 |
"source": [
|
@@ -33,16 +44,27 @@
|
|
33 |
"cell_type": "markdown",
|
34 |
"metadata": {},
|
35 |
"source": [
|
36 |
-
"### Clean data\n"
|
|
|
37 |
]
|
38 |
},
|
39 |
{
|
40 |
"cell_type": "code",
|
41 |
-
"execution_count":
|
42 |
"metadata": {},
|
43 |
-
"outputs": [
|
44 |
"source": [
|
45 |
"import json\n",
|
|
|
46 |
"import tiktoken\n",
|
47 |
"from collections import OrderedDict\n",
|
48 |
"\n",
|
@@ -72,7 +94,9 @@
|
|
72 |
" token_count == 92 and json_obj.get(\"name\") == \"Transformers\"\n",
|
73 |
" ):\n",
|
74 |
" # Create a new OrderedDict with 'tokens' as the first key\n",
|
75 |
-
" new_obj = OrderedDict(
|
|
|
|
|
76 |
" # Add the rest of the key-value pairs from the original object\n",
|
77 |
" new_obj.update(json_obj)\n",
|
78 |
" cleaned_data.append(new_obj)\n",
|
@@ -96,14 +120,14 @@
|
|
96 |
"cell_type": "markdown",
|
97 |
"metadata": {},
|
98 |
"source": [
|
99 |
-
"### Merges
|
100 |
"\n",
|
101 |
-
"
|
102 |
]
|
103 |
},
|
104 |
{
|
105 |
"cell_type": "code",
|
106 |
-
"execution_count":
|
107 |
"metadata": {},
|
108 |
"outputs": [],
|
109 |
"source": [
|
@@ -173,65 +197,29 @@
|
|
173 |
"cell_type": "markdown",
|
174 |
"metadata": {},
|
175 |
"source": [
|
176 |
-
"###
|
177 |
]
|
178 |
},
|
179 |
{
|
180 |
"cell_type": "code",
|
181 |
-
"execution_count":
|
182 |
"metadata": {},
|
183 |
-
"outputs": [
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
"\n",
|
200 |
-
"# def count_tokens(input_file):\n",
|
201 |
-
"\n",
|
202 |
-
"# # Read and process the input file\n",
|
203 |
-
"# with open(input_file, \"r\") as f:\n",
|
204 |
-
"# for i, line in enumerate(f):\n",
|
205 |
-
"# data = json.loads(line)\n",
|
206 |
-
"# content = data[\"content\"]\n",
|
207 |
-
"# nb_tokens = num_tokens_from_string(content, \"cl100k_base\")\n",
|
208 |
-
"# # print(i + 1, data[\"url\"], nb_tokens)\n",
|
209 |
-
"# if nb_tokens > 2000:\n",
|
210 |
-
"# print(i + 1, data[\"url\"], data[\"name\"], nb_tokens)\n",
|
211 |
-
"# # if nb_tokens < 8:\n",
|
212 |
-
"# # print(nb_tokens)\n",
|
213 |
-
"# # print(data[\"url\"])\n",
|
214 |
-
"# # print(data[\"content\"])\n",
|
215 |
-
"\n",
|
216 |
-
"\n",
|
217 |
-
"# # Usage\n",
|
218 |
-
"# input_file = \"../hf_transformers_v4_42_0_merged.jsonl\"\n",
|
219 |
-
"# # input_file = \"../hf_transformers_v4_42_0.jsonl\"\n",
|
220 |
-
"# count_tokens(input_file)"
|
221 |
-
]
|
222 |
-
},
|
223 |
-
{
|
224 |
-
"cell_type": "markdown",
|
225 |
-
"metadata": {},
|
226 |
-
"source": [
|
227 |
-
"### Create a set of llama-index Documents\n"
|
228 |
-
]
|
229 |
-
},
|
230 |
-
{
|
231 |
-
"cell_type": "code",
|
232 |
-
"execution_count": null,
|
233 |
-
"metadata": {},
|
234 |
-
"outputs": [],
|
235 |
"source": [
|
236 |
"from llama_index.core import Document\n",
|
237 |
"from llama_index.core.schema import MetadataMode\n",
|
@@ -245,24 +233,28 @@
|
|
245 |
" data = json.loads(line)\n",
|
246 |
" documents.append(\n",
|
247 |
" Document(\n",
|
|
|
248 |
" text=data[\"content\"],\n",
|
249 |
" metadata={\n",
|
250 |
" \"url\": data[\"url\"],\n",
|
251 |
" \"title\": data[\"name\"],\n",
|
252 |
" \"tokens\": data[\"tokens\"],\n",
|
253 |
" \"retrieve_doc\": data[\"retrieve_doc\"],\n",
|
|
|
254 |
" },\n",
|
255 |
" excluded_llm_metadata_keys=[\n",
|
256 |
" \"url\",\n",
|
257 |
" \"title\",\n",
|
258 |
" \"tokens\",\n",
|
259 |
" \"retrieve_doc\",\n",
|
|
|
260 |
" ],\n",
|
261 |
" excluded_embed_metadata_keys=[\n",
|
262 |
" \"url\",\n",
|
263 |
" \"title\",\n",
|
264 |
" \"tokens\",\n",
|
265 |
" \"retrieve_doc\",\n",
|
|
|
266 |
" ],\n",
|
267 |
" )\n",
|
268 |
" )\n",
|
@@ -273,35 +265,81 @@
|
|
273 |
"print(documents[0])\n",
|
274 |
"print(documents[0].metadata)\n",
|
275 |
"\n",
|
276 |
-
"document_dict = {doc.doc_id: doc for doc in documents}"
|
|
277 |
]
|
278 |
},
|
279 |
{
|
280 |
"cell_type": "code",
|
281 |
-
"execution_count":
|
282 |
"metadata": {},
|
283 |
"outputs": [],
|
284 |
"source": [
|
285 |
-
"
|
286 |
"\n",
|
287 |
-
"#
|
288 |
-
"#
|
289 |
-
"
|
290 |
-
"
|
291 |
"\n",
|
292 |
-
"
|
293 |
-
"
|
294 |
"\n",
|
295 |
-
"#
|
296 |
-
"
|
297 |
-
"
|
298 |
]
|
299 |
},
|
300 |
{
|
301 |
"cell_type": "code",
|
302 |
-
"execution_count":
|
303 |
"metadata": {},
|
304 |
-
"outputs": [
|
305 |
"source": [
|
306 |
"from llama_index.core import VectorStoreIndex\n",
|
307 |
"from llama_index.core.node_parser import SentenceSplitter\n",
|
@@ -315,25 +353,16 @@
|
|
315 |
" transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],\n",
|
316 |
" show_progress=True,\n",
|
317 |
" use_async=True,\n",
|
318 |
-
"
|
319 |
")"
|
320 |
]
|
321 |
},
|
322 |
{
|
323 |
"cell_type": "code",
|
324 |
-
"execution_count":
|
325 |
"metadata": {},
|
326 |
"outputs": [],
|
327 |
"source": [
|
328 |
-
"# from llama_index.llms.openai import OpenAI\n",
|
329 |
-
"\n",
|
330 |
-
"# llm = OpenAI(temperature=1, model=\"gpt-3.5-turbo\", max_tokens=None)\n",
|
331 |
-
"# query_engine = index.as_query_engine(\n",
|
332 |
-
"# llm=llm,\n",
|
333 |
-
"# similarity_top_k=5,\n",
|
334 |
-
"# embed_model=OpenAIEmbedding(model=\"text-embedding-3-small\"),\n",
|
335 |
-
"# use_async=True,\n",
|
336 |
-
"# )\n",
|
337 |
"retriever = index.as_retriever(\n",
|
338 |
" similarity_top_k=10,\n",
|
339 |
" use_async=True,\n",
|
@@ -344,15 +373,499 @@
|
|
344 |
},
|
345 |
{
|
346 |
"cell_type": "code",
|
347 |
-
"execution_count":
|
348 |
"metadata": {},
|
349 |
-
"outputs": [
|
350 |
"source": [
|
351 |
"from llama_index.core.data_structs import Node\n",
|
352 |
-
"from llama_index.core.schema import NodeWithScore\n",
|
353 |
"\n",
|
354 |
-
"# res = query_engine.query(\"What is the LLaMa model?\")\n",
|
355 |
-
"# res.response\n",
|
356 |
"\n",
|
357 |
"# query = \"fine-tune a pretrained model\"\n",
|
358 |
"# query = \"fine-tune an llm\"\n",
|
@@ -362,17 +875,18 @@
|
|
362 |
"nodes = retriever.retrieve(query)\n",
|
363 |
"\n",
|
364 |
"\n",
|
365 |
-
"#
|
366 |
-
"
|
367 |
-
"
|
368 |
-
"
|
369 |
-
"
|
370 |
-
"
|
371 |
-
"
|
372 |
-
"
|
373 |
"\n",
|
374 |
"\n",
|
375 |
-
"
|
|
|
376 |
"\n",
|
377 |
"for node in nodes:\n",
|
378 |
" print(\"Node ID\\t\", node.node_id)\n",
|
@@ -385,73 +899,140 @@
|
|
385 |
" print(\"This node will be replaced by the document\")\n",
|
386 |
" doc = document_dict[node.node.ref_doc_id]\n",
|
387 |
" # print(doc.text)\n",
|
388 |
-
" new_node = (\n",
|
389 |
-
"
|
390 |
-
" node=Node(text=doc.text, metadata=node.metadata), score=node.score\n",
|
391 |
-
" ),\n",
|
392 |
" )\n",
|
|
|
393 |
" nodes_context.append(new_node)\n",
|
394 |
" else:\n",
|
395 |
-
" nodes_context.append(node)"
|
|
|
|
|
396 |
]
|
397 |
},
|
398 |
{
|
399 |
"cell_type": "code",
|
400 |
-
"execution_count":
|
401 |
"metadata": {},
|
402 |
"outputs": [],
|
403 |
"source": [
|
404 |
-
"
|
405 |
"\n",
|
406 |
-
"
|
407 |
-
"
|
408 |
-
"
|
409 |
-
"
|
410 |
-
"
|
411 |
-
"
|
412 |
-
"
|
413 |
-
"
|
414 |
-
"
|
415 |
-
"# break"
|
416 |
]
|
417 |
},
|
418 |
{
|
419 |
"cell_type": "code",
|
420 |
-
"execution_count":
|
421 |
"metadata": {},
|
422 |
-
"outputs": [
|
423 |
"source": [
|
|
|
424 |
"from llama_index.core.data_structs import Node\n",
|
425 |
"from llama_index.core.schema import NodeWithScore\n",
|
426 |
"from llama_index.core import get_response_synthesizer\n",
|
427 |
"from llama_index.llms.gemini import Gemini\n",
|
428 |
"from llama_index.llms.openai import OpenAI\n",
|
429 |
"\n",
|
430 |
-
"from tutor_prompts import (\n",
|
431 |
-
" TEXT_QA_TEMPLATE,\n",
|
432 |
-
")\n",
|
433 |
-
"\n",
|
434 |
-
"\n",
|
435 |
"# llm = Gemini(model=\"models/gemini-1.5-flash\", temperature=1, max_tokens=None)\n",
|
436 |
-
"
|
437 |
"# llm = OpenAI(temperature=1, model=\"gpt-3.5-turbo\", max_tokens=None)\n",
|
438 |
-
"llm = OpenAI(temperature=1, model=\"gpt-4o\", max_tokens=None)\n",
|
439 |
"\n",
|
440 |
"response_synthesizer = get_response_synthesizer(\n",
|
441 |
" llm=llm, response_mode=\"simple_summarize\", text_qa_template=TEXT_QA_TEMPLATE\n",
|
442 |
")\n",
|
443 |
"\n",
|
444 |
-
"response = response_synthesizer.synthesize(\n",
|
445 |
-
"
|
446 |
-
"
|
447 |
-
"
|
448 |
-
" # NodeWithScore(\n",
|
449 |
-
" # node=Node(text=\"LLama2 model has a total of 2B parameters.\"), score=1.0\n",
|
450 |
-
" # ),\n",
|
451 |
-
" # ],\n",
|
452 |
-
" # text_chunks=[\"text1\", \"text2\", \"text3\"],\n",
|
453 |
-
")\n",
|
454 |
-
"print(response.response)\n",
|
455 |
"# for src in response.source_nodes:\n",
|
456 |
"# print(src.node.ref_doc_id)\n",
|
457 |
"# print(\"Node ID\\t\", src.node_id)\n",
|
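The hunk above trims the commented-out example arguments around the response-synthesis call. For readability, here is a minimal sketch of how the pieces shown in this cell fit together, assuming `query` and `nodes_context` come from the retrieval cells earlier in the notebook and that `TEXT_QA_TEMPLATE` is imported from scripts/tutor_prompts.py; the exact body of the updated cell is not fully visible in this diff.

```python
from llama_index.core import get_response_synthesizer
from llama_index.llms.openai import OpenAI
from tutor_prompts import TEXT_QA_TEMPLATE

# LLM that writes the final answer (gpt-4o here, matching the old cell body;
# the model actually used after this commit is not visible in the diff).
llm = OpenAI(temperature=1, model="gpt-4o", max_tokens=None)

response_synthesizer = get_response_synthesizer(
    llm=llm, response_mode="simple_summarize", text_qa_template=TEXT_QA_TEMPLATE
)

# `nodes_context` is the list of NodeWithScore built by the retrieval cell.
response = response_synthesizer.synthesize(query, nodes=nodes_context)
print(response.response)

# Inspect which source documents the answer was grounded in.
for src in response.source_nodes:
    print(src.node.ref_doc_id, src.score)
```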
@@ -658,8 +1239,8 @@
|
|
658 |
"# from llama_index.vector_stores.chroma import ChromaVectorStore\n",
|
659 |
"\n",
|
660 |
"# # Create your index\n",
|
661 |
-
"# db2 = chromadb.PersistentClient(path=\"./ai-tutor-
|
662 |
-
"# chroma_collection = db2.get_or_create_collection(\"ai-tutor-
|
663 |
"# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
|
664 |
]
|
665 |
},
|
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
+
"execution_count": 1,
|
13 |
"metadata": {},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"data": {
|
17 |
+
"text/plain": [
|
18 |
+
"True"
|
19 |
+
]
|
20 |
+
},
|
21 |
+
"execution_count": 1,
|
22 |
+
"metadata": {},
|
23 |
+
"output_type": "execute_result"
|
24 |
+
}
|
25 |
+
],
|
26 |
"source": [
|
27 |
"from dotenv import load_dotenv\n",
|
28 |
"\n",
|
|
|
31 |
},
|
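For reference, this first cell only loads API keys from a local .env file; load_dotenv() returning True (the `True` output above) means a file was found and parsed. A minimal sketch, where the OPENAI_API_KEY name is an assumption rather than something shown in this diff:

```python
import os
from dotenv import load_dotenv

# True when a .env file was found and loaded, matching the cell output above.
load_dotenv()

# Assumption: the embedding/LLM calls later in the notebook expect OPENAI_API_KEY.
print(bool(os.getenv("OPENAI_API_KEY")))
```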
32 |
{
|
33 |
"cell_type": "code",
|
34 |
+
"execution_count": 2,
|
35 |
"metadata": {},
|
36 |
"outputs": [],
|
37 |
"source": [
|
|
|
44 |
"cell_type": "markdown",
|
45 |
"metadata": {},
|
46 |
"source": [
|
47 |
+
"### Clean scraped data\n",
|
48 |
+
"- Removes sections with <7 tokens and sections titled \"Transformers\""
|
49 |
]
|
50 |
},
|
51 |
{
|
52 |
"cell_type": "code",
|
53 |
+
"execution_count": 3,
|
54 |
"metadata": {},
|
55 |
+
"outputs": [
|
56 |
+
{
|
57 |
+
"name": "stdout",
|
58 |
+
"output_type": "stream",
|
59 |
+
"text": [
|
60 |
+
"Original number of lines: 10413\n",
|
61 |
+
"Cleaned number of lines: 4123\n"
|
62 |
+
]
|
63 |
+
}
|
64 |
+
],
|
65 |
"source": [
|
66 |
"import json\n",
|
67 |
+
"import uuid\n",
|
68 |
"import tiktoken\n",
|
69 |
"from collections import OrderedDict\n",
|
70 |
"\n",
|
|
|
94 |
" token_count == 92 and json_obj.get(\"name\") == \"Transformers\"\n",
|
95 |
" ):\n",
|
96 |
" # Create a new OrderedDict with 'tokens' as the first key\n",
|
97 |
+
" new_obj = OrderedDict(\n",
|
98 |
+
" [(\"tokens\", token_count), (\"doc_id\", str(uuid.uuid4()))]\n",
|
99 |
+
" )\n",
|
100 |
" # Add the rest of the key-value pairs from the original object\n",
|
101 |
" new_obj.update(json_obj)\n",
|
102 |
" cleaned_data.append(new_obj)\n",
|
|
|
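Putting the visible fragments of this cell together: each kept section gets a token count and a fresh doc_id placed before its original keys. The input path and the exact filter expression are hidden by the diff context, so both are assumptions in the sketch below (the filter follows the markdown note: drop sections under 7 tokens and the repeated 92-token "Transformers" section), and num_tokens_from_string mirrors the helper referenced in the commented-out count_tokens cell.

```python
import json
import uuid
from collections import OrderedDict

import tiktoken


def num_tokens_from_string(text: str, encoding_name: str = "cl100k_base") -> int:
    # Count tokens the same way the notebook's helper does.
    return len(tiktoken.get_encoding(encoding_name).encode(text))


cleaned_data = []
with open("../hf_transformers_v4_42_0.jsonl") as f:  # assumed input path
    for line in f:
        json_obj = json.loads(line)
        token_count = num_tokens_from_string(json_obj["content"])
        # Assumed filter: skip tiny sections and the boilerplate "Transformers" section.
        if token_count < 7 or (
            token_count == 92 and json_obj.get("name") == "Transformers"
        ):
            continue
        # Put 'tokens' and a fresh 'doc_id' first, then the original key/value pairs.
        new_obj = OrderedDict([("tokens", token_count), ("doc_id", str(uuid.uuid4()))])
        new_obj.update(json_obj)
        cleaned_data.append(new_obj)

print("Cleaned number of lines:", len(cleaned_data))
```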
120 |
"cell_type": "markdown",
|
121 |
"metadata": {},
|
122 |
"source": [
|
123 |
+
"### Merges sections by 'URL'\n",
|
124 |
"\n",
|
125 |
+
"- Excluding sections like \"model_doc\", \"internal\", \"main_classes\"\n"
|
126 |
]
|
127 |
},
|
128 |
{
|
129 |
"cell_type": "code",
|
130 |
+
"execution_count": 4,
|
131 |
"metadata": {},
|
132 |
"outputs": [],
|
133 |
"source": [
|
|
|
197 |
"cell_type": "markdown",
|
198 |
"metadata": {},
|
199 |
"source": [
|
200 |
+
"### Create a set of Llama-index Documents with each section in the jsonl file\n"
|
201 |
]
|
202 |
},
|
203 |
{
|
204 |
"cell_type": "code",
|
205 |
+
"execution_count": 3,
|
206 |
"metadata": {},
|
207 |
+
"outputs": [
|
208 |
+
{
|
209 |
+
"name": "stdout",
|
210 |
+
"output_type": "stream",
|
211 |
+
"text": [
|
212 |
+
"Doc ID: 85b2c5b6-ce24-4e4e-8a2d-d6557c917012\n",
|
213 |
+
"Text: DeepSpeed is a PyTorch optimization library that makes\n",
|
214 |
+
"distributed training memory-efficient and fast. At itβs core is the\n",
|
215 |
+
"Zero Redundancy Optimizer (ZeRO) which enables training large models\n",
|
216 |
+
"at scale. ZeRO works in several stages: ZeRO-1, optimizer state\n",
|
217 |
+
"partioning across GPUs ZeRO-2, gradient partitioning across GPUs\n",
|
218 |
+
"ZeRO-3, parameteter partit...\n",
|
219 |
+
"{'url': 'https://huggingface.co/docs/transformers/deepspeed', 'title': 'DeepSpeed', 'tokens': 8483, 'retrieve_doc': True, 'source': 'HF_Transformers'}\n"
|
220 |
+
]
|
221 |
+
}
|
222 |
+
],
|
223 |
"source": [
|
224 |
"from llama_index.core import Document\n",
|
225 |
"from llama_index.core.schema import MetadataMode\n",
|
|
|
233 |
" data = json.loads(line)\n",
|
234 |
" documents.append(\n",
|
235 |
" Document(\n",
|
236 |
+
" doc_id=data[\"doc_id\"],\n",
|
237 |
" text=data[\"content\"],\n",
|
238 |
" metadata={\n",
|
239 |
" \"url\": data[\"url\"],\n",
|
240 |
" \"title\": data[\"name\"],\n",
|
241 |
" \"tokens\": data[\"tokens\"],\n",
|
242 |
" \"retrieve_doc\": data[\"retrieve_doc\"],\n",
|
243 |
+
" \"source\": \"HF_Transformers\",\n",
|
244 |
" },\n",
|
245 |
" excluded_llm_metadata_keys=[\n",
|
246 |
" \"url\",\n",
|
247 |
" \"title\",\n",
|
248 |
" \"tokens\",\n",
|
249 |
" \"retrieve_doc\",\n",
|
250 |
+
" \"source\",\n",
|
251 |
" ],\n",
|
252 |
" excluded_embed_metadata_keys=[\n",
|
253 |
" \"url\",\n",
|
254 |
" \"title\",\n",
|
255 |
" \"tokens\",\n",
|
256 |
" \"retrieve_doc\",\n",
|
257 |
+
" \"source\",\n",
|
258 |
" ],\n",
|
259 |
" )\n",
|
260 |
" )\n",
|
|
|
265 |
"print(documents[0])\n",
|
266 |
"print(documents[0].metadata)\n",
|
267 |
"\n",
|
268 |
+
"document_dict = {doc.doc_id: doc for doc in documents}\n",
|
269 |
+
"# save dict to disk, as .pkl file\n",
|
270 |
+
"import pickle\n",
|
271 |
+
"\n",
|
272 |
+
"with open(\"document_dict.pkl\", \"wb\") as f:\n",
|
273 |
+
" pickle.dump(document_dict, f)\n",
|
274 |
+
"\n",
|
275 |
+
"# load dict from disk\n",
|
276 |
+
"with open(\"document_dict.pkl\", \"rb\") as f:\n",
|
277 |
+
" document_dict = pickle.load(f)"
|
278 |
]
|
279 |
},
|
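Condensed, the cell above builds one llama-index Document per cleaned JSONL section, keeps the metadata out of both the LLM prompt and the embeddings, and pickles a doc_id → Document map for later lookups. A sketch of the same flow; the input filename is borrowed from the commented-out count_tokens cell and may differ from the one actually used:

```python
import json
import pickle

from llama_index.core import Document

documents = []
with open("../hf_transformers_v4_42_0_merged.jsonl") as f:  # assumed path
    for line in f:
        data = json.loads(line)
        documents.append(
            Document(
                doc_id=data["doc_id"],
                text=data["content"],
                metadata={
                    "url": data["url"],
                    "title": data["name"],
                    "tokens": data["tokens"],
                    "retrieve_doc": data["retrieve_doc"],
                    "source": "HF_Transformers",
                },
                # Hide metadata from both the LLM and the embedding model.
                excluded_llm_metadata_keys=["url", "title", "tokens", "retrieve_doc", "source"],
                excluded_embed_metadata_keys=["url", "title", "tokens", "retrieve_doc", "source"],
            )
        )

# doc_id -> Document map, persisted so other cells/scripts can swap chunks for full docs.
document_dict = {doc.doc_id: doc for doc in documents}
with open("document_dict.pkl", "wb") as f:
    pickle.dump(document_dict, f)
```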
280 |
{
|
281 |
"cell_type": "code",
|
282 |
+
"execution_count": 4,
|
283 |
"metadata": {},
|
284 |
"outputs": [],
|
285 |
"source": [
|
286 |
+
"import chromadb\n",
|
287 |
"\n",
|
288 |
+
"# create client and a new collection\n",
|
289 |
+
"# chromadb.EphemeralClient saves data in-memory.\n",
|
290 |
+
"chroma_client = chromadb.PersistentClient(path=\"./ai-tutor-vector-db\")\n",
|
291 |
+
"chroma_collection = chroma_client.create_collection(\"ai-tutor-vector-db\")\n",
|
292 |
"\n",
|
293 |
+
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
|
294 |
+
"from llama_index.core import StorageContext\n",
|
295 |
"\n",
|
296 |
+
"# Define a storage context object using the created vector database.\n",
|
297 |
+
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
|
298 |
+
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
|
299 |
]
|
300 |
},
|
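The cell above persists the collection to ./ai-tutor-vector-db, and the commented-out lines at the end of the old notebook suggest the same collection is later reopened (e.g. from scripts/gradio-ui.py). A hedged sketch of that loading path; VectorStoreIndex.from_vector_store is one standard way to rebuild an index over existing vectors and is an assumption here, not something shown in this diff:

```python
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# Reopen the persisted collection created above.
db = chromadb.PersistentClient(path="./ai-tutor-vector-db")
chroma_collection = db.get_or_create_collection("ai-tutor-vector-db")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Rebuild an index on top of the stored embeddings instead of re-embedding everything.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
retriever = index.as_retriever(similarity_top_k=10, use_async=True)
```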
301 |
{
|
302 |
"cell_type": "code",
|
303 |
+
"execution_count": 5,
|
304 |
"metadata": {},
|
305 |
+
"outputs": [
|
306 |
+
{
|
307 |
+
"name": "stderr",
|
308 |
+
"output_type": "stream",
|
309 |
+
"text": [
|
310 |
+
"/Users/omar/Documents/ai_repos/ai-tutor-rag-system/env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
311 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
312 |
+
"Parsing nodes: 100%|ββββββββββ| 3374/3374 [00:36<00:00, 93.13it/s] \n",
|
313 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.60it/s]\n",
|
314 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:04<00:00, 5.00it/s]\n",
|
315 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:04<00:00, 4.78it/s]\n",
|
316 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:22<00:00, 1.09s/it]\n",
|
317 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:05<00:00, 3.63it/s]\n",
|
318 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 6.82it/s]\n",
|
319 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:04<00:00, 5.18it/s]\n",
|
320 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.58it/s]\n",
|
321 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 6.36it/s]\n",
|
322 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 6.34it/s]\n",
|
323 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 6.80it/s]\n",
|
324 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.59it/s]\n",
|
325 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 6.64it/s]\n",
|
326 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 6.07it/s]\n",
|
327 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.82it/s]\n",
|
328 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:08<00:00, 2.39it/s]\n",
|
329 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.81it/s]\n",
|
330 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:04<00:00, 4.26it/s]\n",
|
331 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 6.69it/s]\n",
|
332 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.97it/s]\n",
|
333 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.61it/s]\n",
|
334 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:02<00:00, 7.71it/s]\n",
|
335 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:04<00:00, 4.51it/s]\n",
|
336 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.81it/s]\n",
|
337 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.55it/s]\n",
|
338 |
+
"Generating embeddings: 100%|ββββββββββ| 21/21 [00:03<00:00, 5.69it/s]\n",
|
339 |
+
"Generating embeddings: 100%|ββββββββββ| 12/12 [00:04<00:00, 2.57it/s]\n"
|
340 |
+
]
|
341 |
+
}
|
342 |
+
],
|
343 |
"source": [
|
344 |
"from llama_index.core import VectorStoreIndex\n",
|
345 |
"from llama_index.core.node_parser import SentenceSplitter\n",
|
|
|
353 |
" transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],\n",
|
354 |
" show_progress=True,\n",
|
355 |
" use_async=True,\n",
|
356 |
+
" storage_context=storage_context,\n",
|
357 |
")"
|
358 |
]
|
359 |
},
|
360 |
{
|
361 |
"cell_type": "code",
|
362 |
+
"execution_count": 6,
|
363 |
"metadata": {},
|
364 |
"outputs": [],
|
365 |
"source": [
|
|
366 |
"retriever = index.as_retriever(\n",
|
367 |
" similarity_top_k=10,\n",
|
368 |
" use_async=True,\n",
|
|
|
373 |
},
|
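The retrieval cell that follows (its removed lines appear higher up in this diff) swaps any retrieved chunk whose source document is flagged retrieve_doc=True for the full document from document_dict. A sketch assembled from those visible lines; the if-condition itself is an assumption based on the retrieve_doc metadata flag and the "This node will be replaced by the document" message:

```python
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore

query = "fine-tune a pretrained model"  # one of the example queries in the notebook comments
nodes = retriever.retrieve(query)

nodes_context = []
for node in nodes:
    print("Node ID\t", node.node_id)
    print("Score\t", node.score)
    if node.metadata["retrieve_doc"]:
        # Replace the 800-token chunk with the whole source document.
        doc = document_dict[node.node.ref_doc_id]
        new_node = NodeWithScore(
            node=Node(text=doc.text, metadata=node.metadata), score=node.score
        )
        nodes_context.append(new_node)
    else:
        nodes_context.append(node)
```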
374 |
{
|
375 |
"cell_type": "code",
|
376 |
+
"execution_count": 7,
|
377 |
"metadata": {},
|
378 |
+
"outputs": [
|
379 |
+
{
|
380 |
+
"name": "stdout",
|
381 |
+
"output_type": "stream",
|
382 |
+
"text": [
|
383 |
+
"9\n",
|
384 |
+
"Node ID\t df8090eb-b13b-4f61-b94b-5489a43acfad\n",
|
385 |
+
"Title\t Generation with LLMs\n",
|
386 |
+
"Text\t That is why we have a GenerationConfig file associated with each model, which contains a good default generative parameterization and is loaded alongside your model.\n",
|
387 |
+
"Letβs talk code!\n",
|
388 |
+
"If youβre interested in basic LLM usage, our high-level Pipeline interface is a great starting point. However, LLMs often require advanced features like quantization and fine control of the token selection step, which is best done through generate() . Autoregressive generation with LLMs is also resource-intensive and should be executed on a GPU for adequate throughput.\n",
|
389 |
+
"First, you need to load the model.\n",
|
390 |
+
"Copied >>> from transformers import AutoModelForCausalLM >>> model = AutoModelForCausalLM.from_pretrained( ... \"mistralai/Mistral-7B-v0.1\" , device_map= \"auto\" , load_in_4bit= True ... )\n",
|
391 |
+
"Youβll notice two flags in the from_pretrained call:\n",
|
392 |
+
"device_map ensures the model is moved to your GPU(s) load_in_4bit applies 4-bit dynamic quantization to massively reduce the resource requirements\n",
|
393 |
+
"There are other ways to initialize a model, but this is a good baseline to begin with an LLM.\n",
|
394 |
+
"Next, you need to preprocess your text input with a tokenizer .\n",
|
395 |
+
"Copied >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained( \"mistralai/Mistral-7B-v0.1\" , padding_side= \"left\" ) >>> model_inputs = tokenizer([ \"A list of colors: red, blue\" ], return_tensors= \"pt\" ).to( \"cuda\" )\n",
|
396 |
+
"The model_inputs variable holds the tokenized text input, as well as the attention mask. While generate() does its best effort to infer the attention mask when it is not passed, we recommend passing it whenever possible for optimal results.\n",
|
397 |
+
"After tokenizing the inputs, you can call the generate() method to returns the generated tokens. The generated tokens then should be converted to text before printing.\n",
|
398 |
+
"Copied >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] 'A list of colors: red, blue, green, yellow, orange, purple, pink,'\n",
|
399 |
+
"Finally, you donβt need to do it one sequence at a time! You can batch your inputs, which will greatly improve the throughput at a small latency and memory cost. All you need to do is to make sure you pad your inputs properly (more on that below).\n",
|
400 |
+
"Copied >>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default >>> model_inputs = tokenizer( ... [ \"A list of colors: red, blue\" , \"Portugal is\" ], return_tensors= \"pt\" , padding= True ... ).to( \"cuda\" ) >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )\n",
|
401 |
+
"[ 'A list of colors: red, blue, green, yellow, orange, purple, pink,' , 'Portugal is a country in southwestern Europe, on the Iber' ]\n",
|
402 |
+
"And thatβs it! In a few lines of code, you can harness the power of an LLM.\n",
|
403 |
+
"\n",
|
404 |
+
"There are many generation strategies , and sometimes the default values may not be appropriate for your use case. If your outputs arenβt aligned with what youβre expecting, weβve created a list of the most common pitfalls and how to avoid them.\n",
|
405 |
+
"Score\t 0.33395801169351413\n",
|
406 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/llm_tutorial', 'title': 'Generation with LLMs', 'tokens': 2901, 'retrieve_doc': True, 'source': 'HF_Transformers'}\n",
|
407 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
408 |
+
"This node will be replaced by the document\n",
|
409 |
+
"LLMs, or Large Language Models, are the key component behind text generation. In a nutshell, they consist of large pretrained transformer models trained to predict the next word (or, more precisely, token) given some input text. Since they predict one token at a time, you need to do something more elaborate to generate new sentences other than just calling the model β you need to do autoregressive generation.\n",
|
410 |
+
"Autoregressive generation is the inference-time procedure of iteratively calling a model with its own generated outputs, given a few initial inputs. In π€ Transformers, this is handled by the generate() method, which is available to all models with generative capabilities.\n",
|
411 |
+
"This tutorial will show you how to:\n",
|
412 |
+
"Generate text with an LLM Avoid common pitfalls Next steps to help you get the most out of your LLM\n",
|
413 |
+
"Before you begin, make sure you have all the necessary libraries installed:\n",
|
414 |
+
"Copied pip install transformers bitsandbytes>=0.39.0 -q\n",
|
415 |
+
"\n",
|
416 |
+
"A language model trained for causal language modeling takes a sequence of text tokens as input and returns the probability distribution for the next token.\n",
|
417 |
+
"\"Forward pass of an LLM\"\n",
|
418 |
+
"A critical aspect of autoregressive generation with LLMs is how to select the next token from this probability distribution. Anything goes in this step as long as you end up with a token for the next iteration. This means it can be as simple as selecting the most likely token from the probability distribution or as complex as applying a dozen transformations before sampling from the resulting distribution.\n",
|
419 |
+
"\"Autoregressive generation iteratively selects the next token from a probability distribution to generate text\"\n",
|
420 |
+
"The process depicted above is repeated iteratively until some stopping condition is reached. Ideally, the stopping condition is dictated by the model, which should learn when to output an end-of-sequence ( EOS ) token. If this is not the case, generation stops when some predefined maximum length is reached.\n",
|
421 |
+
"Properly setting up the token selection step and the stopping condition is essential to make your model behave as youβd expect on your task. That is why we have a GenerationConfig file associated with each model, which contains a good default generative parameterization and is loaded alongside your model.\n",
|
422 |
+
"Letβs talk code!\n",
|
423 |
+
"If youβre interested in basic LLM usage, our high-level Pipeline interface is a great starting point. However, LLMs often require advanced features like quantization and fine control of the token selection step, which is best done through generate() . Autoregressive generation with LLMs is also resource-intensive and should be executed on a GPU for adequate throughput.\n",
|
424 |
+
"First, you need to load the model.\n",
|
425 |
+
"Copied >>> from transformers import AutoModelForCausalLM >>> model = AutoModelForCausalLM.from_pretrained( ... \"mistralai/Mistral-7B-v0.1\" , device_map= \"auto\" , load_in_4bit= True ... )\n",
|
426 |
+
"Youβll notice two flags in the from_pretrained call:\n",
|
427 |
+
"device_map ensures the model is moved to your GPU(s) load_in_4bit applies 4-bit dynamic quantization to massively reduce the resource requirements\n",
|
428 |
+
"There are other ways to initialize a model, but this is a good baseline to begin with an LLM.\n",
|
429 |
+
"Next, you need to preprocess your text input with a tokenizer .\n",
|
430 |
+
"Copied >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained( \"mistralai/Mistral-7B-v0.1\" , padding_side= \"left\" ) >>> model_inputs = tokenizer([ \"A list of colors: red, blue\" ], return_tensors= \"pt\" ).to( \"cuda\" )\n",
|
431 |
+
"The model_inputs variable holds the tokenized text input, as well as the attention mask. While generate() does its best effort to infer the attention mask when it is not passed, we recommend passing it whenever possible for optimal results.\n",
|
432 |
+
"After tokenizing the inputs, you can call the generate() method to returns the generated tokens. The generated tokens then should be converted to text before printing.\n",
|
433 |
+
"Copied >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] 'A list of colors: red, blue, green, yellow, orange, purple, pink,'\n",
|
434 |
+
"Finally, you donβt need to do it one sequence at a time! You can batch your inputs, which will greatly improve the throughput at a small latency and memory cost. All you need to do is to make sure you pad your inputs properly (more on that below).\n",
|
435 |
+
"Copied >>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default >>> model_inputs = tokenizer( ... [ \"A list of colors: red, blue\" , \"Portugal is\" ], return_tensors= \"pt\" , padding= True ... ).to( \"cuda\" ) >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )\n",
|
436 |
+
"[ 'A list of colors: red, blue, green, yellow, orange, purple, pink,' , 'Portugal is a country in southwestern Europe, on the Iber' ]\n",
|
437 |
+
"And thatβs it! In a few lines of code, you can harness the power of an LLM.\n",
|
438 |
+
"\n",
|
439 |
+
"There are many generation strategies , and sometimes the default values may not be appropriate for your use case. If your outputs arenβt aligned with what youβre expecting, weβve created a list of the most common pitfalls and how to avoid them.\n",
|
440 |
+
"Copied >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained( \"mistralai/Mistral-7B-v0.1\" ) >>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default >>> model = AutoModelForCausalLM.from_pretrained( ... \"mistralai/Mistral-7B-v0.1\" , device_map= \"auto\" , load_in_4bit= True ... )\n",
|
441 |
+
"\n",
|
442 |
+
"If not specified in the GenerationConfig file, generate returns up to 20 tokens by default. We highly recommend manually setting max_new_tokens in your generate call to control the maximum number of new tokens it can return. Keep in mind LLMs (more precisely, decoder-only models ) also return the input prompt as part of the output.\n",
|
443 |
+
"Copied >>> model_inputs = tokenizer([ \"A sequence of numbers: 1, 2\" ], return_tensors= \"pt\" ).to( \"cuda\" ) >>> # By default, the output will contain up to 20 tokens >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] 'A sequence of numbers: 1, 2, 3, 4, 5' >>> # Setting `max_new_tokens` allows you to control the maximum length >>> generated_ids = model.generate(**model_inputs, max_new_tokens= 50 ) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] 'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,'\n",
|
444 |
+
"\n",
|
445 |
+
"By default, and unless specified in the GenerationConfig file, generate selects the most likely token at each iteration (greedy decoding). Depending on your task, this may be undesirable; creative tasks like chatbots or writing an essay benefit from sampling. On the other hand, input-grounded tasks like audio transcription or translation benefit from greedy decoding. Enable sampling with do_sample=True , and you can learn more about this topic in this blog post .\n",
|
446 |
+
"Copied >>> # Set seed or reproducibility -- you don't need this unless you want full reproducibility >>> from transformers import set_seed >>> set_seed( 42 ) >>> model_inputs = tokenizer([ \"I am a cat.\" ], return_tensors= \"pt\" ).to( \"cuda\" ) >>> # LLM + greedy decoding = repetitive, boring output >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] 'I am a cat. I am a cat. I am a cat. I am a cat' >>> # With sampling, the output becomes more creative! >>> generated_ids = model.generate(**model_inputs, do_sample= True ) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] 'I am a cat. Specifically, I am an indoor-only cat. I'\n",
|
447 |
+
"\n",
|
448 |
+
"LLMs are decoder-only architectures, meaning they continue to iterate on your input prompt. If your inputs do not have the same length, they need to be padded. Since LLMs are not trained to continue from pad tokens, your input needs to be left-padded. Make sure you also donβt forget to pass the attention mask to generate!\n",
|
449 |
+
"Copied >>> # The tokenizer initialized above has right-padding active by default: the 1st sequence, >>> # which is shorter, has padding on the right side. Generation fails to capture the logic. >>> model_inputs = tokenizer( ... [ \"1, 2, 3\" , \"A, B, C, D, E\" ], padding= True , return_tensors= \"pt\" ... ).to( \"cuda\" ) >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] '1, 2, 33333333333' >>> # With left-padding, it works as expected! >>> tokenizer = AutoTokenizer.from_pretrained( \"mistralai/Mistral-7B-v0.1\" , padding_side= \"left\" ) >>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default >>> model_inputs = tokenizer( ... [ \"1, 2, 3\" , \"A, B, C, D, E\" ], padding= True , return_tensors= \"pt\" ... ).to( \"cuda\" ) >>> generated_ids = model.generate(**model_inputs) >>> tokenizer.batch_decode(generated_ids, skip_special_tokens= True )[ 0 ] '1, 2, 3, 4, 5, 6,'\n",
|
450 |
+
"\n",
|
451 |
+
"Some models and tasks expect a certain input prompt format to work properly. When this format is not applied, you will get a silent performance degradation: the model kinda works, but not as well as if you were following the expected prompt. More information about prompting, including which models and tasks need to be careful, is available in this guide . Letβs see an example with a chat LLM, which makes use of chat templating :\n",
|
452 |
+
"Copied >>> tokenizer = AutoTokenizer.from_pretrained( \"HuggingFaceH4/zephyr-7b-alpha\" ) >>> model = AutoModelForCausalLM.from_pretrained( ... \"HuggingFaceH4/zephyr-7b-alpha\" , device_map= \"auto\" , load_in_4bit= True ... ) >>> set_seed( 0 ) >>> prompt = \"\"\"How many helicopters can a human eat in one sitting? Reply as a thug.\"\"\" >>> model_inputs = tokenizer([prompt], return_tensors= \"pt\" ).to( \"cuda\" ) >>> input_length = model_inputs.input_ids.shape[ 1 ] >>> generated_ids = model.generate(**model_inputs, max_new_tokens= 20 ) >>> print (tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens= True )[ 0 ]) \"I'm not a thug, but i can tell you that a human cannot eat\" >>> # Oh no, it did not follow our instruction to reply as a thug! Let's see what happens when we write >>> # a better prompt and use the right template for this model (through `tokenizer.apply_chat_template`) >>> set_seed( 0 ) >>> messages = [ ... { ... \"role\" : \"system\" , ... \"content\" : \"You are a friendly chatbot who always responds in the style of a thug\" , ... }, ... { \"role\" : \"user\" , \"content\" : \"How many helicopters can a human eat in one sitting?\" }, ... ] >>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt= True , return_tensors= \"pt\" ).to( \"cuda\" ) >>> input_length = model_inputs.shape[ 1 ] >>> generated_ids = model.generate(model_inputs, do_sample= True , max_new_tokens= 20 ) >>> print (tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens= True )[ 0 ]) 'None, you thug. How bout you try to focus on more useful questions?' >>> # As we can see, it followed a proper thug style π\n",
|
453 |
+
"\n",
|
454 |
+
"While the autoregressive generation process is relatively straightforward, making the most out of your LLM can be a challenging endeavor because there are many moving parts. For your next steps to help you dive deeper into LLM usage and understanding:\n",
|
455 |
+
"\n",
|
456 |
+
"Guide on how to control different generation methods , how to set up the generation configuration file, and how to stream the output; Accelerating text generation ; Prompt templates for chat LLMs ; Prompt design guide ; API reference on GenerationConfig , generate() , and generate-related classes . Most of the classes, including the logits processors, have usage examples!\n",
|
457 |
+
"\n",
|
458 |
+
"Open LLM Leaderboard , which focuses on the quality of the open-source models; Open LLM-Perf Leaderboard , which focuses on LLM throughput.\n",
|
459 |
+
"\n",
|
460 |
+
"Guide on how to optimize LLMs for speed and memory ; Guide on quantization such as bitsandbytes and autogptq, which shows you how to drastically reduce your memory requirements.\n",
|
461 |
+
"\n",
|
462 |
+
"optimum , an extension of π€ Transformers that optimizes for specific hardware devices. outlines , a library where you can constrain text generation (e.g. to generate JSON files); text-generation-inference , a production-ready server for LLMs; text-generation-webui , a UI for text generation;\n",
|
463 |
+
"< > Update on GitHub\n",
|
464 |
+
"HTML_TAG_END\n",
|
465 |
+
"Node ID\t 038a63a0-4559-4e68-8d00-38dfb08f3e46\n",
|
466 |
+
"Title\t Trainer\n",
|
467 |
+
"Text\t Please see this appropriate section for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please raise an issue on GitHub if you encounter such issue.\n",
|
468 |
+
"\n",
|
469 |
+
"The LOMO optimizers have been introduced in Full Parameter Fine-Tuning for Large Language Models with Limited Resources and AdaLomo: Low-memory Optimization with Adaptive Learning Rate .\n",
|
470 |
+
"They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are \"lomo\" and \"adalomo\" . First either install LOMO from pypi pip install lomo-optim or install it from source with pip install git+https://github.com/OpenLMLab/LOMO.git .\n",
|
471 |
+
"According to the authors, it is recommended to use AdaLomo without grad_norm to get better performance and higher throughput.\n",
|
472 |
+
"Below is a simple script to demonstrate how to fine-tune google/gemma-2b on IMDB dataset in full precision:\n",
|
473 |
+
"Copied import torch import datasets from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM import trl\n",
|
474 |
+
"\n",
|
475 |
+
"train_dataset = datasets.load_dataset( 'imdb' , split= 'train' )\n",
|
476 |
+
"\n",
|
477 |
+
"args = TrainingArguments(\n",
|
478 |
+
" output_dir= \"./test-lomo\" ,\n",
|
479 |
+
" max_steps= 1000 ,\n",
|
480 |
+
" per_device_train_batch_size= 4 ,\n",
|
481 |
+
" optim= \"adalomo\" ,\n",
|
482 |
+
" gradient_checkpointing= True ,\n",
|
483 |
+
" logging_strategy= \"steps\" ,\n",
|
484 |
+
" logging_steps= 1 ,\n",
|
485 |
+
" learning_rate= 2e-6 ,\n",
|
486 |
+
" save_strategy= \"no\" ,\n",
|
487 |
+
" run_name= \"lomo-imdb\" ,\n",
|
488 |
+
")\n",
|
489 |
+
"\n",
|
490 |
+
"model_id = \"google/gemma-2b\" tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
491 |
+
"model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage= True ).to( 0 )\n",
|
492 |
+
"\n",
|
493 |
+
"trainer = trl.SFTTrainer(\n",
|
494 |
+
" model=model, \n",
|
495 |
+
" args=args,\n",
|
496 |
+
" train_dataset=train_dataset,\n",
|
497 |
+
" dataset_text_field= 'text' ,\n",
|
498 |
+
" max_seq_length= 1024 ,\n",
|
499 |
+
")\n",
|
500 |
+
"\n",
|
501 |
+
"trainer.train()\n",
|
502 |
+
"\n",
|
503 |
+
"The Trainer class is powered by Accelerate , a library for easily training PyTorch models in distributed environments with support for integrations such as FullyShardedDataParallel (FSDP) and DeepSpeed .\n",
|
504 |
+
"Learn more about FSDP sharding strategies, CPU offloading, and more with the Trainer in the Fully Sharded Data Parallel guide.\n",
|
505 |
+
"To use Accelerate with Trainer , run the accelerate.config command to set up training for your training environment. This command creates a config_file.yaml thatβll be used when you launch your training script. For example, some example configurations you can setup are:\n",
|
506 |
+
"DistributedDataParallel FSDP DeepSpeed DeepSpeed with Accelerate plugin\n",
|
507 |
+
"Copied compute_environment: LOCAL_MACHINE distributed_type: MULTI_GPU downcast_bf16: 'no' gpu_ids: all machine_rank: 0 #change rank as per the node main_process_ip: 192.168 .20 .1 main_process_port: 9898 main_training_function: main mixed_precision: fp16 num_machines: 2 num_processes: 8 rdzv_backend: static same_network: true tpu_env: [] tpu_use_cluster: false tpu_use_sudo: false use_cpu: false\n",
|
508 |
+
"The accelerate_launch command is the recommended way to launch your training script on a distributed system with Accelerate and Trainer with the parameters specified in config_file.yaml . This file is saved to the Accelerate cache folder and automatically loaded when you run accelerate_launch .\n",
|
509 |
+
"Score\t 0.333921268256774\n",
|
510 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/trainer', 'title': 'Trainer', 'tokens': 4064, 'retrieve_doc': True, 'source': 'HF_Transformers'}\n",
|
511 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
512 |
+
"This node will be replaced by the document\n",
|
513 |
+
"The Trainer is a complete training and evaluation loop for PyTorch models implemented in the Transformers library. You only need to pass it the necessary pieces for training (model, tokenizer, dataset, evaluation function, training hyperparameters, etc.), and the Trainer class takes care of the rest. This makes it easier to start training faster without manually writing your own training loop. But at the same time, Trainer is very customizable and offers a ton of training options so you can tailor it to your exact training needs.\n",
|
514 |
+
"In addition to the Trainer class, Transformers also provides a Seq2SeqTrainer class for sequence-to-sequence tasks like translation or summarization. There is also the SFTTrainer class from the TRL library which wraps the Trainer class and is optimized for training language models like Llama-2 and Mistral with autoregressive techniques. SFTTrainer also supports features like sequence packing, LoRA, quantization, and DeepSpeed for efficiently scaling to any model size. Feel free to check out the API reference for these other Trainer -type classes to learn more about when to use which one. In general, Trainer is the most versatile option and is appropriate for a broad spectrum of tasks. Seq2SeqTrainer is designed for sequence-to-sequence tasks and SFTTrainer is designed for training language models.\n",
|
515 |
+
"Before you start, make sure Accelerate - a library for enabling and running PyTorch training across distributed environments - is installed.\n",
|
516 |
+
"Copied pip install accelerate # upgrade pip install accelerate --upgrade\n",
|
517 |
+
"This guide provides an overview of the Trainer class.\n",
|
518 |
+
"\n",
|
519 |
+
"Trainer includes all the code youβll find in a basic training loop:\n",
|
520 |
+
"perform a training step to calculate the loss calculate the gradients with the backward method update the weights based on the gradients repeat this process until youβve reached a predetermined number of epochs\n",
|
521 |
+
"The Trainer class abstracts all of this code away so you donβt have to worry about manually writing a training loop every time or if youβre just getting started with PyTorch and training. You only need to provide the essential components required for training, such as a model and a dataset, and the Trainer class handles everything else.\n",
|
522 |
+
"If you want to specify any training options or hyperparameters, you can find them in the TrainingArguments class. For example, letβs define where to save the model in output_dir and push the model to the Hub after training with push_to_hub=True .\n",
|
523 |
+
"Copied from transformers import TrainingArguments\n",
|
524 |
+
"\n",
|
525 |
+
"training_args = TrainingArguments(\n",
|
526 |
+
" output_dir= \"your-model\" ,\n",
|
527 |
+
" learning_rate= 2e-5 ,\n",
|
528 |
+
" per_device_train_batch_size= 16 ,\n",
|
529 |
+
" per_device_eval_batch_size= 16 ,\n",
|
530 |
+
" num_train_epochs= 2 ,\n",
|
531 |
+
" weight_decay= 0.01 ,\n",
|
532 |
+
" eval_strategy= \"epoch\" ,\n",
|
533 |
+
" save_strategy= \"epoch\" ,\n",
|
534 |
+
" load_best_model_at_end= True ,\n",
|
535 |
+
" push_to_hub= True ,\n",
|
536 |
+
")\n",
|
537 |
+
"Pass training_args to the Trainer along with a model, dataset, something to preprocess the dataset with (depending on your data type it could be a tokenizer, feature extractor or image processor), a data collator, and a function to compute the metrics you want to track during training.\n",
|
538 |
+
"Finally, call train() to start training!\n",
|
539 |
+
"Copied from transformers import Trainer\n",
|
540 |
+
"\n",
|
541 |
+
"trainer = Trainer(\n",
|
542 |
+
" model=model,\n",
|
543 |
+
" args=training_args,\n",
|
544 |
+
" train_dataset=dataset[ \"train\" ],\n",
|
545 |
+
" eval_dataset=dataset[ \"test\" ],\n",
|
546 |
+
" tokenizer=tokenizer,\n",
|
547 |
+
" data_collator=data_collator,\n",
|
548 |
+
" compute_metrics=compute_metrics,\n",
|
549 |
+
")\n",
|
550 |
+
"\n",
|
551 |
+
"trainer.train()\n",
|
552 |
+
"\n",
|
553 |
+
"The Trainer class saves your model checkpoints to the directory specified in the output_dir parameter of TrainingArguments . Youβll find the checkpoints saved in a checkpoint-000 subfolder where the numbers at the end correspond to the training step. Saving checkpoints are useful for resuming training later.\n",
|
554 |
+
"Copied # resume from latest checkpoint trainer.train(resume_from_checkpoint= True ) # resume from specific checkpoint saved in output directory trainer.train(resume_from_checkpoint= \"your-model/checkpoint-1000\" )\n",
|
555 |
+
"You can save your checkpoints (the optimizer state is not saved by default) to the Hub by setting push_to_hub=True in TrainingArguments to commit and push them. Other options for deciding how your checkpoints are saved are set up in the hub_strategy parameter:\n",
|
556 |
+
"hub_strategy=\"checkpoint\" pushes the latest checkpoint to a subfolder named βlast-checkpointβ from which you can resume training hub_strategy=\"all_checkpoints\" pushes all checkpoints to the directory defined in output_dir (youβll see one checkpoint per folder in your model repository)\n",
|
557 |
+
"When you resume training from a checkpoint, the Trainer tries to keep the Python, NumPy, and PyTorch RNG states the same as they were when the checkpoint was saved. But because PyTorch has various non-deterministic default settings, the RNG states arenβt guaranteed to be the same. If you want to enable full determinism, take a look at the Controlling sources of randomness guide to learn what you can enable to make your training fully deterministic. Keep in mind though that by making certain settings deterministic, training may be slower.\n",
|
558 |
+
"\n",
|
559 |
+
"While the Trainer class is designed to be accessible and easy-to-use, it also offers a lot of customizability for more adventurous users. Many of the Trainer βs method can be subclassed and overridden to support the functionality you want, without having to rewrite the entire training loop from scratch to accommodate it. These methods include:\n",
|
560 |
+
"get_train_dataloader() creates a training DataLoader get_eval_dataloader() creates an evaluation DataLoader get_test_dataloader() creates a test DataLoader log() logs information on the various objects that watch training create_optimizer_and_scheduler() creates an optimizer and learning rate scheduler if they werenβt passed in the __init__ ; these can also be separately customized with create_optimizer() and create_scheduler() respectively compute_loss() computes the loss on a batch of training inputs training_step() performs the training step prediction_step() performs the prediction and test step evaluate() evaluates the model and returns the evaluation metrics predict() makes predictions (with metrics if labels are available) on the test set\n",
|
561 |
+
"For example, if you want to customize the compute_loss() method to use a weighted loss instead.\n",
|
562 |
+
"Copied from torch import nn from transformers import Trainer class CustomTrainer ( Trainer ): def compute_loss ( self, model, inputs, return_outputs= False ):\n",
|
563 |
+
" labels = inputs.pop( \"labels\" ) # forward pass outputs = model(**inputs)\n",
|
564 |
+
" logits = outputs.get( \"logits\" ) # compute custom loss for 3 labels with different weights loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([ 1.0 , 2.0 , 3.0 ], device=model.device))\n",
|
565 |
+
" loss = loss_fct(logits.view(- 1 , self.model.config.num_labels), labels.view(- 1 )) return (loss, outputs) if return_outputs else loss\n",
|
566 |
+
"\n",
|
567 |
+
"Another option for customizing the Trainer is to use callbacks . Callbacks donβt change anything in the training loop. They inspect the training loop state and then execute some action (early stopping, logging results, etc.) depending on the state. In other words, a callback canβt be used to implement something like a custom loss function and youβll need to subclass and override the compute_loss() method for that.\n",
|
568 |
+
"For example, if you want to add an early stopping callback to the training loop after 10 steps.\n",
|
569 |
+
"Copied from transformers import TrainerCallback class EarlyStoppingCallback ( TrainerCallback ): def __init__ ( self, num_steps= 10 ):\n",
|
570 |
+
" self.num_steps = num_steps def on_step_end ( self, args, state, control, **kwargs ): if state.global_step >= self.num_steps: return { \"should_training_stop\" : True } else : return {}\n",
|
571 |
+
"Then pass it to the Trainer βs callback parameter.\n",
|
572 |
+
"Copied from transformers import Trainer\n",
|
573 |
+
"\n",
|
574 |
+
"trainer = Trainer(\n",
|
575 |
+
" model=model,\n",
|
576 |
+
" args=training_args,\n",
|
577 |
+
" train_dataset=dataset[ \"train\" ],\n",
|
578 |
+
" eval_dataset=dataset[ \"test\" ],\n",
|
579 |
+
" tokenizer=tokenizer,\n",
|
580 |
+
" data_collator=data_collator,\n",
|
581 |
+
" compute_metrics=compute_metrics,\n",
|
582 |
+
" callback=[EarlyStoppingCallback()],\n",
|
583 |
+
")\n",
|
584 |
+
"\n",
|
585 |
+
"Check out the logging API reference for more information about the different logging levels.\n",
|
586 |
+
"The Trainer is set to logging.INFO by default which reports errors, warnings, and other basic information. A Trainer replica - in distributed environments - is set to logging.WARNING which only reports errors and warnings. You can change the logging level with the log_level and log_level_replica parameters in TrainingArguments .\n",
|
587 |
+
"To configure the log level setting for each node, use the log_on_each_node parameter to determine whether to use the log level on each node or only on the main node.\n",
|
588 |
+
"Trainer sets the log level separately for each node in the Trainer.__init__() method, so you may want to consider setting this sooner if youβre using other Transformers functionalities before creating the Trainer object.\n",
|
589 |
+
"For example, to set your main code and modules to use the same log level according to each node:\n",
|
590 |
+
"Copied logger = logging.getLogger(__name__)\n",
|
591 |
+
"\n",
|
592 |
+
"logging.basicConfig( format = \"%(asctime)s - %(levelname)s - %(name)s - %(message)s\" ,\n",
|
593 |
+
" datefmt= \"%m/%d/%Y %H:%M:%S\" ,\n",
|
594 |
+
" handlers=[logging.StreamHandler(sys.stdout)],\n",
|
595 |
+
")\n",
|
596 |
+
"\n",
|
597 |
+
"log_level = training_args.get_process_log_level()\n",
|
598 |
+
"logger.setLevel(log_level)\n",
|
599 |
+
"datasets.utils.logging.set_verbosity(log_level)\n",
|
600 |
+
"transformers.utils.logging.set_verbosity(log_level)\n",
|
601 |
+
"\n",
|
602 |
+
"trainer = Trainer(...)\n",
|
603 |
+
"Use different combinations of log_level and log_level_replica to configure what gets logged on each of the nodes.\n",
|
604 |
+
"single node multi-node\n",
|
605 |
+
"Copied my_app.py ... --log_level warning --log_level_replica error\n",
|
606 |
+
"\n",
|
607 |
+
"NEFTune is a technique that can improve performance by adding noise to the embedding vectors during training. To enable it in Trainer , set the neftune_noise_alpha parameter in TrainingArguments to control how much noise is added.\n",
|
608 |
+
"Copied from transformers import TrainingArguments, Trainer\n",
|
609 |
+
"\n",
|
610 |
+
"training_args = TrainingArguments(..., neftune_noise_alpha= 0.1 )\n",
|
611 |
+
"trainer = Trainer(..., args=training_args)\n",
|
612 |
+
"NEFTune is disabled after training to restore the original embedding layer to avoid any unexpected behavior.\n",
|
613 |
+
"\n",
|
614 |
+
"Gradient Low-Rank Projection (GaLore) is a memory-efficient low-rank training strategy that allows full-parameter learning but is more memory-efficient than common low-rank adaptation methods, such as LoRA.\n",
|
615 |
+
"First make sure to install GaLore official repository:\n",
|
616 |
+
"Copied pip install galore-torch\n",
|
617 |
+
"Then simply add one of [\"galore_adamw\", \"galore_adafactor\", \"galore_adamw_8bit\"] in optim together with optim_target_modules , which can be a list of strings, regex or full path corresponding to the target module names you want to adapt. Below is an end-to-end example script (make sure to pip install trl datasets ):\n",
|
618 |
+
"Copied import torch import datasets import trl from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM\n",
|
619 |
+
"\n",
|
620 |
+
"train_dataset = datasets.load_dataset( 'imdb' , split= 'train' )\n",
|
621 |
+
"\n",
|
622 |
+
"args = TrainingArguments(\n",
|
623 |
+
" output_dir= \"./test-galore\" ,\n",
|
624 |
+
" max_steps= 100 ,\n",
|
625 |
+
" per_device_train_batch_size= 2 ,\n",
|
626 |
+
" optim= \"galore_adamw\" ,\n",
|
627 |
+
" optim_target_modules=[ \"attn\" , \"mlp\" ]\n",
|
628 |
+
")\n",
|
629 |
+
"\n",
|
630 |
+
"model_id = \"google/gemma-2b\" config = AutoConfig.from_pretrained(model_id)\n",
|
631 |
+
"\n",
|
632 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
633 |
+
"model = AutoModelForCausalLM.from_config(config).to( 0 )\n",
|
634 |
+
"\n",
|
635 |
+
"trainer = trl.SFTTrainer(\n",
|
636 |
+
" model=model, \n",
|
637 |
+
" args=args,\n",
|
638 |
+
" train_dataset=train_dataset,\n",
|
639 |
+
" dataset_text_field= 'text' ,\n",
|
640 |
+
" max_seq_length= 512 ,\n",
|
641 |
+
")\n",
|
642 |
+
"\n",
|
643 |
+
"trainer.train()\n",
|
644 |
+
"To pass extra arguments supports by GaLore, you should pass correctly optim_args , for example:\n",
|
645 |
+
"Copied import torch import datasets import trl from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM\n",
|
646 |
+
"\n",
|
647 |
+
"train_dataset = datasets.load_dataset( 'imdb' , split= 'train' )\n",
|
648 |
+
"\n",
|
649 |
+
"args = TrainingArguments(\n",
|
650 |
+
" output_dir= \"./test-galore\" ,\n",
|
651 |
+
" max_steps= 100 ,\n",
|
652 |
+
" per_device_train_batch_size= 2 ,\n",
|
653 |
+
" optim= \"galore_adamw\" ,\n",
|
654 |
+
" optim_target_modules=[ \"attn\" , \"mlp\" ],\n",
|
655 |
+
" optim_args= \"rank=64, update_proj_gap=100, scale=0.10\" ,\n",
|
656 |
+
")\n",
|
657 |
+
"\n",
|
658 |
+
"model_id = \"google/gemma-2b\" config = AutoConfig.from_pretrained(model_id)\n",
|
659 |
+
"\n",
|
660 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
661 |
+
"model = AutoModelForCausalLM.from_config(config).to( 0 )\n",
|
662 |
+
"\n",
|
663 |
+
"trainer = trl.SFTTrainer(\n",
|
664 |
+
" model=model, \n",
|
665 |
+
" args=args,\n",
|
666 |
+
" train_dataset=train_dataset,\n",
|
667 |
+
" dataset_text_field= 'text' ,\n",
|
668 |
+
" max_seq_length= 512 ,\n",
|
669 |
+
")\n",
|
670 |
+
"\n",
|
671 |
+
"trainer.train()\n",
|
672 |
+
"You can read more about the method in the original repository or the paper .\n",
|
673 |
+
"Currently you can only train Linear layers that are considered as GaLore layers and will use low-rank decomposition to be trained while remaining layers will be optimized in the conventional manner.\n",
|
674 |
+
"Note it will take a bit of time before starting the training (~3 minutes for a 2B model on a NVIDIA A100), but training should go smoothly afterwards.\n",
|
675 |
+
"You can also perform layer-wise optimization by post-pending the optimizer name with layerwise like below:\n",
|
676 |
+
"Copied import torch import datasets import trl from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM\n",
|
677 |
+
"\n",
|
678 |
+
"train_dataset = datasets.load_dataset( 'imdb' , split= 'train' )\n",
|
679 |
+
"\n",
|
680 |
+
"args = TrainingArguments(\n",
|
681 |
+
" output_dir= \"./test-galore\" ,\n",
|
682 |
+
" max_steps= 100 ,\n",
|
683 |
+
" per_device_train_batch_size= 2 ,\n",
|
684 |
+
" optim= \"galore_adamw_layerwise\" ,\n",
|
685 |
+
" optim_target_modules=[ \"attn\" , \"mlp\" ]\n",
|
686 |
+
")\n",
|
687 |
+
"\n",
|
688 |
+
"model_id = \"google/gemma-2b\" config = AutoConfig.from_pretrained(model_id)\n",
|
689 |
+
"\n",
|
690 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
691 |
+
"model = AutoModelForCausalLM.from_config(config).to( 0 )\n",
|
692 |
+
"\n",
|
693 |
+
"trainer = trl.SFTTrainer(\n",
|
694 |
+
" model=model, \n",
|
695 |
+
" args=args,\n",
|
696 |
+
" train_dataset=train_dataset,\n",
|
697 |
+
" dataset_text_field= 'text' ,\n",
|
698 |
+
" max_seq_length= 512 ,\n",
|
699 |
+
")\n",
|
700 |
+
"\n",
|
701 |
+
"trainer.train()\n",
|
702 |
+
"Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see this appropriate section for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please raise an issue on GitHub if you encounter such issue.\n",
|
703 |
+
"\n",
|
704 |
+
"The LOMO optimizers have been introduced in Full Parameter Fine-Tuning for Large Language Models with Limited Resources and AdaLomo: Low-memory Optimization with Adaptive Learning Rate .\n",
|
705 |
+
"They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are \"lomo\" and \"adalomo\" . First either install LOMO from pypi pip install lomo-optim or install it from source with pip install git+https://github.com/OpenLMLab/LOMO.git .\n",
|
706 |
+
"According to the authors, it is recommended to use AdaLomo without grad_norm to get better performance and higher throughput.\n",
|
707 |
+
"Below is a simple script to demonstrate how to fine-tune google/gemma-2b on IMDB dataset in full precision:\n",
|
708 |
+
"Copied import torch import datasets from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM import trl\n",
|
709 |
+
"\n",
|
710 |
+
"train_dataset = datasets.load_dataset( 'imdb' , split= 'train' )\n",
|
711 |
+
"\n",
|
712 |
+
"args = TrainingArguments(\n",
|
713 |
+
" output_dir= \"./test-lomo\" ,\n",
|
714 |
+
" max_steps= 1000 ,\n",
|
715 |
+
" per_device_train_batch_size= 4 ,\n",
|
716 |
+
" optim= \"adalomo\" ,\n",
|
717 |
+
" gradient_checkpointing= True ,\n",
|
718 |
+
" logging_strategy= \"steps\" ,\n",
|
719 |
+
" logging_steps= 1 ,\n",
|
720 |
+
" learning_rate= 2e-6 ,\n",
|
721 |
+
" save_strategy= \"no\" ,\n",
|
722 |
+
" run_name= \"lomo-imdb\" ,\n",
|
723 |
+
")\n",
|
724 |
+
"\n",
|
725 |
+
"model_id = \"google/gemma-2b\" tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
726 |
+
"model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage= True ).to( 0 )\n",
|
727 |
+
"\n",
|
728 |
+
"trainer = trl.SFTTrainer(\n",
|
729 |
+
" model=model, \n",
|
730 |
+
" args=args,\n",
|
731 |
+
" train_dataset=train_dataset,\n",
|
732 |
+
" dataset_text_field= 'text' ,\n",
|
733 |
+
" max_seq_length= 1024 ,\n",
|
734 |
+
")\n",
|
735 |
+
"\n",
|
736 |
+
"trainer.train()\n",
|
737 |
+
"\n",
|
738 |
+
"The Trainer class is powered by Accelerate , a library for easily training PyTorch models in distributed environments with support for integrations such as FullyShardedDataParallel (FSDP) and DeepSpeed .\n",
|
739 |
+
"Learn more about FSDP sharding strategies, CPU offloading, and more with the Trainer in the Fully Sharded Data Parallel guide.\n",
|
740 |
+
"To use Accelerate with Trainer , run the accelerate.config command to set up training for your training environment. This command creates a config_file.yaml thatβll be used when you launch your training script. For example, some example configurations you can setup are:\n",
|
741 |
+
"DistributedDataParallel FSDP DeepSpeed DeepSpeed with Accelerate plugin\n",
|
742 |
+
"Copied compute_environment: LOCAL_MACHINE distributed_type: MULTI_GPU downcast_bf16: 'no' gpu_ids: all machine_rank: 0 #change rank as per the node main_process_ip: 192.168 .20 .1 main_process_port: 9898 main_training_function: main mixed_precision: fp16 num_machines: 2 num_processes: 8 rdzv_backend: static same_network: true tpu_env: [] tpu_use_cluster: false tpu_use_sudo: false use_cpu: false\n",
|
743 |
+
"The accelerate_launch command is the recommended way to launch your training script on a distributed system with Accelerate and Trainer with the parameters specified in config_file.yaml . This file is saved to the Accelerate cache folder and automatically loaded when you run accelerate_launch .\n",
|
744 |
+
"For example, to run the run_glue.py training script with the FSDP configuration:\n",
|
745 |
+
"Copied accelerate launch \\\n",
|
746 |
+
" ./examples/pytorch/text-classification/run_glue.py \\\n",
|
747 |
+
" --model_name_or_path google-bert/bert-base-cased \\\n",
|
748 |
+
" --task_name $TASK_NAME \\\n",
|
749 |
+
" --do_train \\\n",
|
750 |
+
" --do_eval \\\n",
|
751 |
+
" --max_seq_length 128 \\\n",
|
752 |
+
" --per_device_train_batch_size 16 \\\n",
|
753 |
+
" --learning_rate 5e-5 \\\n",
|
754 |
+
" --num_train_epochs 3 \\\n",
|
755 |
+
" --output_dir /tmp/ $TASK_NAME / \\\n",
|
756 |
+
" --overwrite_output_dir\n",
|
757 |
+
"You could also specify the parameters from the config_file.yaml file directly in the command line:\n",
|
758 |
+
"Copied accelerate launch --num_processes=2 \\\n",
|
759 |
+
" --use_fsdp \\\n",
|
760 |
+
" --mixed_precision=bf16 \\\n",
|
761 |
+
" --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \\\n",
|
762 |
+
" --fsdp_transformer_layer_cls_to_wrap= \"BertLayer\" \\\n",
|
763 |
+
" --fsdp_sharding_strategy=1 \\\n",
|
764 |
+
" --fsdp_state_dict_type=FULL_STATE_DICT \\\n",
|
765 |
+
" ./examples/pytorch/text-classification/run_glue.py\n",
|
766 |
+
" --model_name_or_path google-bert/bert-base-cased \\\n",
|
767 |
+
" --task_name $TASK_NAME \\\n",
|
768 |
+
" --do_train \\\n",
|
769 |
+
" --do_eval \\\n",
|
770 |
+
" --max_seq_length 128 \\\n",
|
771 |
+
" --per_device_train_batch_size 16 \\\n",
|
772 |
+
" --learning_rate 5e-5 \\\n",
|
773 |
+
" --num_train_epochs 3 \\\n",
|
774 |
+
" --output_dir /tmp/ $TASK_NAME / \\\n",
|
775 |
+
" --overwrite_output_dir\n",
|
776 |
+
"Check out the Launching your Accelerate scripts tutorial to learn more about accelerate_launch and custom configurations.\n",
|
777 |
+
"< > Update on GitHub\n",
|
778 |
+
"HTML_TAG_END\n",
|
779 |
+
"Node ID\t 9f229371-68e3-4388-bd8e-a95f3d956010\n",
|
780 |
+
"Title\t Resources\n",
|
781 |
+
"Text\t A list of official Hugging Face and community (indicated by π) resources to help you get started with LLaMA2. If youβre interested in submitting a resource to be included here, please feel free to open a Pull Request and weβll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.\n",
|
782 |
+
"Llama 2 is here - get it on Hugging Face , a blog post about Llama 2 and how to use it with π€ Transformers and π€ PEFT. LLaMA 2 - Every Resource you need , a compilation of relevant resources to learn about LLaMA 2 and how to get started quickly.\n",
|
783 |
+
"Text Generation\n",
|
784 |
+
"A notebook on how to fine-tune Llama 2 in Google Colab using QLoRA and 4-bit precision. π A notebook on how to fine-tune the βLlama-v2-7b-guanacoβ model with 4-bit QLoRA and generate Q&A datasets from PDFs. π\n",
|
785 |
+
"Text Classification\n",
|
786 |
+
"A notebook on how to fine-tune the Llama 2 model with QLoRa, TRL, and Korean text classification dataset. ππ°π·\n",
|
787 |
+
"βοΈ Optimization\n",
|
788 |
+
"Fine-tune Llama 2 with DPO , a guide to using the TRL libraryβs DPO method to fine tune Llama 2 on a specific dataset. Extended Guide: Instruction-tune Llama 2 , a guide to training Llama 2 to generate instructions from inputs, transforming the model from instruction-following to instruction-giving. A notebook on how to fine-tune the Llama 2 model on a personal computer using QLoRa and TRL. π\n",
|
789 |
+
"β‘οΈ Inference\n",
|
790 |
+
"A notebook on how to quantize the Llama 2 model using GPTQ from the AutoGPTQ library. π A notebook on how to run the Llama 2 Chat Model with 4-bit quantization on a local computer or Google Colab. π\n",
|
791 |
+
"π Deploy\n",
|
792 |
+
"Fine-tune LLaMA 2 (7-70B) on Amazon SageMaker , a complete guide from setup to QLoRA fine-tuning and deployment on Amazon SageMaker. Deploy Llama 2 7B/13B/70B on Amazon SageMaker , a guide on using Hugging Faceβs LLM DLC container for secure and scalable deployment.\n",
|
793 |
+
"Score\t 0.3272136875241064\n",
|
794 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/model_doc/llama2', 'title': 'Resources', 'tokens': 511, 'retrieve_doc': False, 'source': 'HF_Transformers'}\n",
|
795 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
796 |
+
"Node ID\t 17a1946d-1195-4eca-b37b-da849a5810a4\n",
|
797 |
+
"Title\t Overview\n",
|
798 |
+
"Text\t The Open-Llama model was proposed in the open source Open-Llama project by community developer s-JoL.\n",
|
799 |
+
"The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM.\n",
|
800 |
+
"And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks.\n",
|
801 |
+
"This model was contributed by s-JoL .\n",
|
802 |
+
"The original code was released on GitHub by s-JoL , but is now removed.\n",
|
803 |
+
"Score\t 0.32692360476331983\n",
|
804 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/model_doc/open-llama', 'title': 'Overview', 'tokens': 109, 'retrieve_doc': False, 'source': 'HF_Transformers'}\n",
|
805 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
806 |
+
"Node ID\t aa2270f6-685e-400c-a70d-d109c9615f5b\n",
|
807 |
+
"Title\t Overview\n",
|
808 |
+
"Text\t The Llama3 model was proposed in Introducing Meta Llama 3: The most capable openly available LLM to date by the meta AI team.\n",
|
809 |
+
"The abstract from the blogpost is the following:\n",
|
810 |
+
"Today, weβre excited to share the first two models of the next generation of Llama, Meta Llama 3, available for broad use. This release features pretrained and instruction-fine-tuned language models with 8B and 70B parameters that can support a broad range of use cases. This next generation of Llama demonstrates state-of-the-art performance on a wide range of industry benchmarks and offers new capabilities, including improved reasoning. We believe these are the best open source models of their class, period. In support of our longstanding open approach, weβre putting Llama 3 in the hands of the community. We want to kickstart the next wave of innovation in AI across the stackβfrom applications to developer tools to evals to inference optimizations and more. We canβt wait to see what you build and look forward to your feedback.\n",
|
811 |
+
"Checkout all Llama3 model checkpoints here .\n",
|
812 |
+
"The original code of the authors can be found here .\n",
|
813 |
+
"Score\t 0.3262211438008452\n",
|
814 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/model_doc/llama3', 'title': 'Overview', 'tokens': 232, 'retrieve_doc': False, 'source': 'HF_Transformers'}\n",
|
815 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
816 |
+
"Node ID\t 165a9166-b34a-41b7-96a4-ca219c700004\n",
|
817 |
+
"Title\t Usage tips\n",
|
818 |
+
"Text\t XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to\n",
|
819 |
+
"select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation). XLM has multilingual checkpoints which leverage a specific lang parameter. Check out the multi-lingual page for more information. A transformer model trained on several languages. There are three different type of training for this model and the library provides checkpoints for all of them: Causal language modeling (CLM) which is the traditional autoregressive training (so this model could be in the previous section as well). One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages. Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages, with dynamic masking of the tokens. A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two different languages, with random masking. To predict one of the masked tokens, the model can use both, the surrounding context in language 1 and the context given by language 2.\n",
|
820 |
+
"Score\t 0.31788594866380976\n",
|
821 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/model_doc/xlm', 'title': 'Usage tips', 'tokens': 277, 'retrieve_doc': False, 'source': 'HF_Transformers'}\n",
|
822 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
823 |
+
"Node ID\t 35d185e8-43f2-454f-800f-5f3fcdc93642\n",
|
824 |
+
"Title\t Overview\n",
|
825 |
+
"Text\t The Llama2 model was proposed in LLaMA: Open Foundation and Fine-Tuned Chat Models by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. It is a collection of foundation language models ranging from 7B to 70B parameters, with checkpoints finetuned for chat application!\n",
|
826 |
+
"The abstract from the paper is the following:\n",
|
827 |
+
"In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.\n",
|
828 |
+
"Checkout all Llama2 model checkpoints here .\n",
|
829 |
+
"This model was contributed by Arthur Zucker with contributions from Lysandre Debut . The code of the implementation in Hugging Face is based on GPT-NeoX here . The original code of the authors can be found here .\n",
|
830 |
+
"Score\t 0.3115804336253633\n",
|
831 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/model_doc/llama2', 'title': 'Overview', 'tokens': 595, 'retrieve_doc': False, 'source': 'HF_Transformers'}\n",
|
832 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
833 |
+
"Node ID\t d26d2116-374a-4db6-bda6-30c4b9e581e8\n",
|
834 |
+
"Title\t Resources\n",
|
835 |
+
"Text\t A list of official Hugging Face and community (indicated by π) resources to help you get started with LLaMA. If youβre interested in submitting a resource to be included here, please feel free to open a Pull Request and weβll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.\n",
|
836 |
+
"Text Classification\n",
|
837 |
+
"A notebook on how to use prompt tuning to adapt the LLaMA model for text classification task. π\n",
|
838 |
+
"Question Answering\n",
|
839 |
+
"StackLLaMA: A hands-on guide to train LLaMA with RLHF , a blog post about how to train LLaMA to answer questions on Stack Exchange with RLHF.\n",
|
840 |
+
"βοΈ Optimization\n",
|
841 |
+
"A notebook on how to fine-tune LLaMA model using xturing library on GPU which has limited memory. π\n",
|
842 |
+
"β‘οΈ Inference\n",
|
843 |
+
"A notebook on how to run the LLaMA Model using PeftModel from the π€ PEFT library. π A notebook on how to load a PEFT adapter LLaMA model with LangChain. π\n",
|
844 |
+
"π Deploy\n",
|
845 |
+
"A notebook on how to fine-tune LLaMA model using LoRA method via the π€ PEFT library with intuitive UI. π A notebook on how to deploy Open-LLaMA model for text generation on Amazon SageMaker. π\n",
|
846 |
+
"Score\t 0.3112708125913629\n",
|
847 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/model_doc/llama', 'title': 'Resources', 'tokens': 287, 'retrieve_doc': False, 'source': 'HF_Transformers'}\n",
|
848 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
849 |
+
"Node ID\t ed8ce253-a77a-4ccf-9bc6-8a66a8bfaa8e\n",
|
850 |
+
"Title\t Overview\n",
|
851 |
+
"Text\t LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. It is an auto-regressive language model, based on the transformer architecture. In other words, it is an multi-modal version of LLMs fine-tuned for chat / instructions.\n",
|
852 |
+
"The LLaVa model was proposed in Visual Instruction Tuning and improved in Improved Baselines with Visual Instruction Tuning by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.\n",
|
853 |
+
"The abstract from the paper is the following:\n",
|
854 |
+
"Large multimodal models (LMM) have recently shown encouraging progress with visual instruction tuning. In this note, we show that the fully-connected vision-language cross-modal connector in LLaVA is surprisingly powerful and data-efficient. With simple modifications to LLaVA, namely, using CLIP-ViT-L-336px with an MLP projection and adding academic-task-oriented VQA data with simple response formatting prompts, we establish stronger baselines that achieve state-of-the-art across 11 benchmarks. Our final 13B checkpoint uses merely 1.2M publicly available data, and finishes full training in βΌ1 day on a single 8-A100 node. We hope this can make state-of-the-art LMM research more accessible. Code and model will be publicly available\n",
|
855 |
+
"LLaVa architecture. Taken from the original paper.\n",
|
856 |
+
"This model was contributed by ArthurZ and ybelkada .\n",
|
857 |
+
"The original code can be found here .\n",
|
858 |
+
"Score\t 0.305050072534564\n",
|
859 |
+
"Metadata\t {'url': 'https://huggingface.co/docs/transformers/model_doc/llava', 'title': 'Overview', 'tokens': 307, 'retrieve_doc': False, 'source': 'HF_Transformers'}\n",
|
860 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
861 |
+
"9\n"
|
862 |
+
]
|
863 |
+
}
|
864 |
+
],
|
865 |
"source": [
|
866 |
"from llama_index.core.data_structs import Node\n",
|
867 |
+
"from llama_index.core.schema import NodeWithScore, BaseNode, TextNode\n",
|
868 |
"\n",
|
|
|
|
|
869 |
"\n",
|
870 |
"# query = \"fine-tune a pretrained model\"\n",
|
871 |
"# query = \"fine-tune an llm\"\n",
|
|
|
875 |
"nodes = retriever.retrieve(query)\n",
|
876 |
"\n",
|
877 |
"\n",
|
878 |
+
"# Filter nodes with the same ref_doc_id\n",
|
879 |
+
"def filter_nodes_by_unique_doc_id(nodes):\n",
|
880 |
+
" unique_nodes = {}\n",
|
881 |
+
" for node in nodes:\n",
|
882 |
+
" doc_id = node.node.ref_doc_id\n",
|
883 |
+
" if doc_id is not None and doc_id not in unique_nodes:\n",
|
884 |
+
" unique_nodes[doc_id] = node\n",
|
885 |
+
" return list(unique_nodes.values())\n",
|
886 |
"\n",
|
887 |
"\n",
|
888 |
+
"nodes = filter_nodes_by_unique_doc_id(nodes)\n",
|
889 |
+
"print(len(nodes))\n",
|
890 |
"\n",
|
891 |
"for node in nodes:\n",
|
892 |
" print(\"Node ID\\t\", node.node_id)\n",
|
|
|
899 |
" print(\"This node will be replaced by the document\")\n",
|
900 |
" doc = document_dict[node.node.ref_doc_id]\n",
|
901 |
" # print(doc.text)\n",
|
902 |
+
" new_node = NodeWithScore(\n",
|
903 |
+
" node=TextNode(text=doc.text, metadata=node.metadata), score=node.score\n",
|
|
|
|
|
904 |
" )\n",
|
905 |
+
" print(new_node.text)\n",
|
906 |
" nodes_context.append(new_node)\n",
|
907 |
" else:\n",
|
908 |
+
" nodes_context.append(node)\n",
|
909 |
+
"\n",
|
910 |
+
"print(len(nodes_context))"
|
911 |
]
|
912 |
},
|
913 |
{
|
914 |
"cell_type": "code",
|
915 |
+
"execution_count": 8,
|
916 |
"metadata": {},
|
917 |
"outputs": [],
|
918 |
"source": [
|
919 |
+
"from llama_index.core import ChatPromptTemplate\n",
|
920 |
+
"from llama_index.core.llms import ChatMessage, MessageRole\n",
|
921 |
+
"from pydantic import BaseModel, Field\n",
|
922 |
+
"\n",
|
923 |
+
"system_prompt = (\n",
|
924 |
+
" \"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine-tuning models, giving 'memory' to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, Llama-Index, LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context.\"\n",
|
925 |
+
" \"You are provided information found in Hugging Face's documentation and the RAG course. \"\n",
|
926 |
+
" \"Only some information might be relevant to the question, so ignore the irrelevant part and use the relevant part to answer the question.\"\n",
|
927 |
+
" \"Only respond with information given to you documentation. DO NOT use additional information, even if you know the answer. \"\n",
|
928 |
+
" \"If the answer is somewhere in the documentation, answer the question (depending on the questions and the variety of relevant information in the documentation, give complete and helpful answers.\"\n",
|
929 |
+
" \"Here is the information you can use, the order is not important: \\n\\n\"\n",
|
930 |
+
" \"---------------------\\n\"\n",
|
931 |
+
" \"{context_str}\\n\"\n",
|
932 |
+
" \"---------------------\\n\\n\"\n",
|
933 |
+
" \"REMEMBER:\\n\"\n",
|
934 |
+
" \"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context.\"\n",
|
935 |
+
" \"You are provided information found in Hugging Face's documentation and the RAG course. \"\n",
|
936 |
+
" \"Here are the rules you must follow:\\n\"\n",
|
937 |
+
" \"* Only respond with information inside the documentation. DO NOT provide additional information, even if you know the answer. \"\n",
|
938 |
+
" \"* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. \"\n",
|
939 |
+
" \"* Only use information summarized from the documentation, do not respond otherwise. \"\n",
|
940 |
+
" \"* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. \"\n",
|
941 |
+
" \"* Do not reference any links, urls or hyperlinks in your answers.\\n\"\n",
|
942 |
+
" \"* Make sure to format your answers in Markdown format, including code block and snippets.\\n\"\n",
|
943 |
+
" \"Now answer the following question: \\n\"\n",
|
944 |
+
")\n",
|
945 |
"\n",
|
946 |
+
"chat_text_qa_msgs: list[ChatMessage] = [\n",
|
947 |
+
" ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),\n",
|
948 |
+
" ChatMessage(\n",
|
949 |
+
" role=MessageRole.USER,\n",
|
950 |
+
" content=\"{query_str}\",\n",
|
951 |
+
" ),\n",
|
952 |
+
"]\n",
|
953 |
+
"\n",
|
954 |
+
"TEXT_QA_TEMPLATE = ChatPromptTemplate(chat_text_qa_msgs)"
|
|
|
955 |
]
|
956 |
},
|
957 |
{
|
958 |
"cell_type": "code",
|
959 |
+
"execution_count": 9,
|
960 |
"metadata": {},
|
961 |
+
"outputs": [
|
962 |
+
{
|
963 |
+
"data": {
|
964 |
+
"text/markdown": [
|
965 |
+
"This is a very broad question. There are many ways to fine-tune a large language model, and the best approach will depend on the specific model and the desired outcome. Generally, the process involves taking a pre-trained language model and further training it on a dataset specific to the task you are interested in. For example, you can fine-tune a large language model on a dataset of code to make it better at generating code. Also, you could fine-tune it on a dataset of dialogue to make it better at generating more engaging and human-like dialogue. \n",
|
966 |
+
"\n",
|
967 |
+
"Here is an example of fine-tuning a [google/gemma-2b](https://huggingface.co/google/gemma-2b) model on the IMDB dataset using the `trl.SFTTrainer` and the AdaLomo optimizer:\n",
|
968 |
+
"\n",
|
969 |
+
"```python\n",
|
970 |
+
"import torch\n",
|
971 |
+
"import datasets\n",
|
972 |
+
"from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM\n",
|
973 |
+
"import trl\n",
|
974 |
+
"\n",
|
975 |
+
"train_dataset = datasets.load_dataset('imdb', split='train')\n",
|
976 |
+
"\n",
|
977 |
+
"args = TrainingArguments(\n",
|
978 |
+
" output_dir= \"./test-lomo\",\n",
|
979 |
+
" max_steps= 1000,\n",
|
980 |
+
" per_device_train_batch_size= 4,\n",
|
981 |
+
" optim= \"adalomo\",\n",
|
982 |
+
" gradient_checkpointing= True,\n",
|
983 |
+
" logging_strategy= \"steps\",\n",
|
984 |
+
" logging_steps= 1,\n",
|
985 |
+
" learning_rate= 2e-6,\n",
|
986 |
+
" save_strategy= \"no\",\n",
|
987 |
+
" run_name= \"lomo-imdb\",\n",
|
988 |
+
")\n",
|
989 |
+
"\n",
|
990 |
+
"model_id = \"google/gemma-2b\"\n",
|
991 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
992 |
+
"model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage= True).to(0)\n",
|
993 |
+
"\n",
|
994 |
+
"trainer = trl.SFTTrainer(\n",
|
995 |
+
" model=model,\n",
|
996 |
+
" args=args,\n",
|
997 |
+
" train_dataset=train_dataset,\n",
|
998 |
+
" dataset_text_field= 'text',\n",
|
999 |
+
" max_seq_length= 1024,\n",
|
1000 |
+
")\n",
|
1001 |
+
"\n",
|
1002 |
+
"trainer.train()\n",
|
1003 |
+
"```\n",
|
1004 |
+
"\n",
|
1005 |
+
"This is just one example of fine-tuning a large language model. There are many other methods and techniques available. I recommend checking out the Hugging Face documentation and exploring the different options to find the best approach for your needs. \n"
|
1006 |
+
],
|
1007 |
+
"text/plain": [
|
1008 |
+
"<IPython.core.display.Markdown object>"
|
1009 |
+
]
|
1010 |
+
},
|
1011 |
+
"metadata": {},
|
1012 |
+
"output_type": "display_data"
|
1013 |
+
}
|
1014 |
+
],
|
1015 |
"source": [
|
1016 |
+
"from IPython.display import Markdown\n",
|
1017 |
"from llama_index.core.data_structs import Node\n",
|
1018 |
"from llama_index.core.schema import NodeWithScore\n",
|
1019 |
"from llama_index.core import get_response_synthesizer\n",
|
1020 |
"from llama_index.llms.gemini import Gemini\n",
|
1021 |
"from llama_index.llms.openai import OpenAI\n",
|
1022 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
1023 |
"# llm = Gemini(model=\"models/gemini-1.5-flash\", temperature=1, max_tokens=None)\n",
|
1024 |
+
"llm = Gemini(model=\"models/gemini-1.5-pro\", temperature=1, max_tokens=None)\n",
|
1025 |
"# llm = OpenAI(temperature=1, model=\"gpt-3.5-turbo\", max_tokens=None)\n",
|
1026 |
+
"# llm = OpenAI(temperature=1, model=\"gpt-4o\", max_tokens=None)\n",
|
1027 |
"\n",
|
1028 |
"response_synthesizer = get_response_synthesizer(\n",
|
1029 |
" llm=llm, response_mode=\"simple_summarize\", text_qa_template=TEXT_QA_TEMPLATE\n",
|
1030 |
")\n",
|
1031 |
"\n",
|
1032 |
+
"response = response_synthesizer.synthesize(query, nodes=nodes_context)\n",
|
1033 |
+
"# print(response.response)\n",
|
1034 |
+
"display(Markdown(response.response))\n",
|
1035 |
+
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1036 |
"# for src in response.source_nodes:\n",
|
1037 |
"# print(src.node.ref_doc_id)\n",
|
1038 |
"# print(\"Node ID\\t\", src.node_id)\n",
|
|
|
1239 |
"# from llama_index.vector_stores.chroma import ChromaVectorStore\n",
|
1240 |
"\n",
|
1241 |
"# # Create your index\n",
|
1242 |
+
"# db2 = chromadb.PersistentClient(path=\"./ai-tutor-dataset\")\n",
|
1243 |
+
"# chroma_collection = db2.get_or_create_collection(\"ai-tutor-dataset\")\n",
|
1244 |
"# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
|
1245 |
]
|
1246 |
},
|
scripts/gradio-ui.py
CHANGED
@@ -1,33 +1,29 @@
|
|
1 |
-
import
|
2 |
import logging
|
3 |
-
|
|
|
4 |
from datetime import datetime
|
|
|
5 |
|
6 |
import chromadb
|
7 |
-
|
8 |
from llama_index.agent.openai import OpenAIAgent
|
9 |
-
from llama_index.
|
10 |
-
from llama_index.core import
|
|
|
|
|
11 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
|
|
12 |
from llama_index.llms.openai import OpenAI
|
13 |
-
from llama_index.
|
14 |
-
MetadataFilters,
|
15 |
-
MetadataFilter,
|
16 |
-
FilterCondition,
|
17 |
-
)
|
18 |
-
import gradio as gr
|
19 |
-
from gradio.themes.utils import (
|
20 |
-
fonts,
|
21 |
-
)
|
22 |
-
|
23 |
-
from utils import init_mongo_db
|
24 |
from tutor_prompts import (
|
25 |
TEXT_QA_TEMPLATE,
|
26 |
QueryValidation,
|
27 |
-
system_message_validation,
|
28 |
system_message_openai_agent,
|
|
|
29 |
)
|
30 |
-
|
|
|
31 |
|
32 |
logger = logging.getLogger(__name__)
|
33 |
logging.basicConfig(level=logging.INFO)
|
@@ -43,8 +39,8 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
43 |
CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
|
44 |
MONGODB_URI = os.getenv("MONGODB_URI")
|
45 |
|
46 |
-
|
47 |
-
|
48 |
|
49 |
if not os.path.exists(DB_PATH):
|
50 |
# Download the vector database from the Hugging Face Hub if it doesn't exist locally
|
@@ -55,103 +51,49 @@ if not os.path.exists(DB_PATH):
|
|
55 |
from huggingface_hub import snapshot_download
|
56 |
|
57 |
snapshot_download(
|
58 |
-
repo_id="towardsai-buster/ai-tutor-db",
|
|
|
|
|
59 |
)
|
60 |
logger.info(f"Downloaded vector database to {DB_PATH}")
|
61 |
|
62 |
AVAILABLE_SOURCES_UI = [
|
63 |
-
"
|
64 |
-
"Gen AI 360: LangChain",
|
65 |
-
"Gen AI 360: Advanced RAG",
|
66 |
-
"Towards AI Blog",
|
67 |
-
"Activeloop Docs",
|
68 |
-
"HF Transformers Docs",
|
69 |
-
"Wikipedia",
|
70 |
-
"OpenAI Docs",
|
71 |
-
"LangChain Docs",
|
72 |
]
|
73 |
|
74 |
AVAILABLE_SOURCES = [
|
75 |
-
"
|
76 |
-
"langchain_course",
|
77 |
-
"advanced_rag_course",
|
78 |
-
"towards_ai",
|
79 |
-
"activeloop",
|
80 |
-
"hf_transformers",
|
81 |
-
"wikipedia",
|
82 |
-
"openai",
|
83 |
-
"langchain_docs",
|
84 |
]
|
85 |
|
86 |
-
# Initialize MongoDB
|
87 |
-
mongo_db = (
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
)
|
92 |
|
93 |
# Initialize vector store and index
|
94 |
db2 = chromadb.PersistentClient(path=DB_PATH)
|
95 |
-
chroma_collection = db2.get_or_create_collection(
|
96 |
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
|
97 |
-
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
|
98 |
-
|
99 |
-
# Initialize OpenAI models
|
100 |
-
llm = OpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=None)
|
101 |
-
# embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
|
102 |
-
embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="similarity")
|
103 |
-
|
104 |
-
query_engine = index.as_query_engine(
|
105 |
-
llm=llm,
|
106 |
-
similarity_top_k=5,
|
107 |
-
embed_model=embeds,
|
108 |
-
streaming=True,
|
109 |
-
text_qa_template=TEXT_QA_TEMPLATE,
|
110 |
-
)
|
111 |
-
|
112 |
-
query_engine_tools = [
|
113 |
-
QueryEngineTool(
|
114 |
-
query_engine=query_engine,
|
115 |
-
metadata=ToolMetadata(
|
116 |
-
name="AI_information",
|
117 |
-
description="""The 'AI_information' tool serves as a comprehensive repository for insights into the field of artificial intelligence. When utilizing this tool, the input should be the user's complete question. The input can also be adapted to focus on specific aspects or further details of the current topic under discussion. This dynamic input approach allows for a tailored exploration of AI subjects, ensuring that responses are relevant and informative. Employ this tool to fetch nuanced information on topics such as model training, fine-tuning, LLM augmentation, and more, thereby facilitating a rich, context-aware dialogue.""",
|
118 |
-
),
|
119 |
-
)
|
120 |
-
]
|
121 |
-
|
122 |
-
|
123 |
-
def initialize_agent():
|
124 |
-
agent = OpenAIAgent.from_tools(
|
125 |
-
query_engine_tools,
|
126 |
-
llm=llm,
|
127 |
-
verbose=True,
|
128 |
-
system_prompt=system_message_openai_agent,
|
129 |
-
)
|
130 |
-
return agent
|
131 |
-
|
132 |
-
|
133 |
-
def reset_agent(agent_state):
|
134 |
-
agent_state = initialize_agent() # Reset the agent by reassigning a new instance
|
135 |
-
chatbot = [[None, None]]
|
136 |
-
return "Agent has been reset.", chatbot
|
137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
logger.info(f"User reported {email=}")
|
146 |
-
email_document = {"email": email}
|
147 |
|
148 |
-
try:
|
149 |
-
mongo_db[collection].insert_one(email_document)
|
150 |
-
logger.info("")
|
151 |
-
except:
|
152 |
-
logger.info("Something went wrong logging")
|
153 |
|
154 |
-
|
|
|
155 |
|
156 |
|
157 |
def format_sources(completion) -> str:
|
@@ -194,6 +136,7 @@ def add_sources(history, completion):
|
|
194 |
yield history
|
195 |
|
196 |
history[-1][1] += "\n\n" + formatted_sources
|
|
|
197 |
yield history
|
198 |
|
199 |
|
@@ -206,7 +149,57 @@ def get_answer(history, agent_state):
|
|
206 |
user_input = history[-1][0]
|
207 |
history[-1][1] = ""
|
208 |
|
209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
for token in completion.response_gen:
|
212 |
history[-1][1] += token
|
@@ -216,7 +209,7 @@ def get_answer(history, agent_state):
|
|
216 |
|
217 |
|
218 |
example_questions = [
|
219 |
-
"
|
220 |
"What is a Large Language Model?",
|
221 |
"What is an embedding?",
|
222 |
]
|
@@ -224,7 +217,8 @@ example_questions = [
|
|
224 |
|
225 |
with gr.Blocks(fill_height=True) as demo:
|
226 |
|
227 |
-
agent_state = gr.State(initialize_agent())
|
|
|
228 |
|
229 |
with gr.Row():
|
230 |
gr.HTML(
|
@@ -248,25 +242,21 @@ with gr.Blocks(fill_height=True) as demo:
|
|
248 |
show_label=False,
|
249 |
)
|
250 |
submit = gr.Button(value="Send", variant="primary", scale=1)
|
251 |
-
reset_button = gr.Button("Reset Chat", variant="secondary", scale=1)
|
252 |
-
|
253 |
-
with gr.Row():
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
gr.Markdown(
|
268 |
-
"This application uses GPT3.5-Turbo to search the docs for relevant information and answer questions."
|
269 |
-
)
|
270 |
|
271 |
completion = gr.State()
|
272 |
|
@@ -296,11 +286,11 @@ with gr.Blocks(fill_height=True) as demo:
|
|
296 |
# save_completion, inputs=[completion, chatbot]
|
297 |
# )
|
298 |
|
299 |
-
reset_button.click(
|
300 |
-
|
301 |
-
)
|
302 |
-
submit_email.click(log_emails, email, email)
|
303 |
-
email.submit(log_emails, email, email)
|
304 |
|
305 |
demo.queue(default_concurrency_limit=CONCURRENCY_COUNT)
|
306 |
demo.launch(debug=False, share=False)
|
|
|
1 |
+
import json
|
2 |
import logging
|
3 |
+
import os
|
4 |
+
import pickle
|
5 |
from datetime import datetime
|
6 |
+
from typing import Optional
|
7 |
|
8 |
import chromadb
|
9 |
+
import gradio as gr
|
10 |
from llama_index.agent.openai import OpenAIAgent
|
11 |
+
from llama_index.core import VectorStoreIndex, get_response_synthesizer
|
12 |
+
from llama_index.core.data_structs import Node
|
13 |
+
from llama_index.core.node_parser import SentenceSplitter
|
14 |
+
from llama_index.core.schema import BaseNode, MetadataMode, NodeWithScore, TextNode
|
15 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
16 |
+
from llama_index.llms.gemini import Gemini
|
17 |
from llama_index.llms.openai import OpenAI
|
18 |
+
from llama_index.vector_stores.chroma import ChromaVectorStore
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
from tutor_prompts import (
|
20 |
TEXT_QA_TEMPLATE,
|
21 |
QueryValidation,
|
|
|
22 |
system_message_openai_agent,
|
23 |
+
system_message_validation,
|
24 |
)
|
25 |
+
|
26 |
+
# from utils import init_mongo_db
|
27 |
|
28 |
logger = logging.getLogger(__name__)
|
29 |
logging.basicConfig(level=logging.INFO)
|
|
|
39 |
CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
|
40 |
MONGODB_URI = os.getenv("MONGODB_URI")
|
41 |
|
42 |
+
DB_PATH = os.getenv("DB_PATH", "scripts/ai-tutor-vector-db")
|
43 |
+
DB_COLLECTION = os.getenv("DB_NAME", "ai-tutor-vector-db")
|
44 |
|
45 |
if not os.path.exists(DB_PATH):
|
46 |
# Download the vector database from the Hugging Face Hub if it doesn't exist locally
|
|
|
51 |
from huggingface_hub import snapshot_download
|
52 |
|
53 |
snapshot_download(
|
54 |
+
repo_id="towardsai-buster/ai-tutor-vector-db",
|
55 |
+
local_dir=DB_PATH,
|
56 |
+
repo_type="dataset",
|
57 |
)
|
58 |
logger.info(f"Downloaded vector database to {DB_PATH}")
|
59 |
|
60 |
AVAILABLE_SOURCES_UI = [
|
61 |
+
"HF Transformers",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
]
|
63 |
|
64 |
AVAILABLE_SOURCES = [
|
65 |
+
"HF_Transformers",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
]
|
67 |
|
68 |
+
# # Initialize MongoDB
|
69 |
+
# mongo_db = (
|
70 |
+
# init_mongo_db(uri=MONGODB_URI, db_name="towardsai-buster")
|
71 |
+
# if MONGODB_URI
|
72 |
+
# else logger.warning("No mongodb uri found, you will not be able to save data.")
|
73 |
+
# )
|
74 |
|
75 |
# Initialize vector store and index
|
76 |
db2 = chromadb.PersistentClient(path=DB_PATH)
|
77 |
+
chroma_collection = db2.get_or_create_collection(DB_COLLECTION)
|
78 |
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
+
index = VectorStoreIndex.from_vector_store(
|
81 |
+
vector_store=vector_store,
|
82 |
+
embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
|
83 |
+
transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],
|
84 |
+
show_progress=True,
|
85 |
+
use_async=True,
|
86 |
+
)
|
87 |
|
88 |
+
retriever = index.as_retriever(
|
89 |
+
similarity_top_k=10,
|
90 |
+
use_async=True,
|
91 |
+
embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
|
92 |
+
)
|
|
|
|
|
|
|
93 |
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
+
with open("scripts/document_dict.pkl", "rb") as f:
|
96 |
+
document_dict = pickle.load(f)
|
97 |
|
98 |
|
99 |
def format_sources(completion) -> str:
|
|
|
136 |
yield history
|
137 |
|
138 |
history[-1][1] += "\n\n" + formatted_sources
|
139 |
+
# history.append([None, formatted_sources])
|
140 |
yield history
|
141 |
|
142 |
|
|
|
149 |
user_input = history[-1][0]
|
150 |
history[-1][1] = ""
|
151 |
|
152 |
+
query = user_input
|
153 |
+
|
154 |
+
nodes_context = []
|
155 |
+
nodes = retriever.retrieve(query)
|
156 |
+
|
157 |
+
# Filter nodes with the same ref_doc_id
|
158 |
+
def filter_nodes_by_unique_doc_id(nodes):
|
159 |
+
unique_nodes = {}
|
160 |
+
for node in nodes:
|
161 |
+
doc_id = node.node.ref_doc_id
|
162 |
+
if doc_id is not None and doc_id not in unique_nodes:
|
163 |
+
unique_nodes[doc_id] = node
|
164 |
+
return list(unique_nodes.values())
|
165 |
+
|
166 |
+
nodes = filter_nodes_by_unique_doc_id(nodes)
|
167 |
+
print(len(nodes))
|
168 |
+
|
169 |
+
for node in nodes:
|
170 |
+
print("Node ID\t", node.node_id)
|
171 |
+
print("Title\t", node.metadata["title"])
|
172 |
+
print("Text\t", node.text)
|
173 |
+
print("Score\t", node.score)
|
174 |
+
print("Metadata\t", node.metadata)
|
175 |
+
print("-_" * 20)
|
176 |
+
if node.metadata["retrieve_doc"]:
|
177 |
+
print("This node will be replaced by the document")
|
178 |
+
doc = document_dict[node.node.ref_doc_id]
|
179 |
+
print(doc.text)
|
180 |
+
new_node = NodeWithScore(
|
181 |
+
node=TextNode(text=doc.text, metadata=node.metadata), score=node.score
|
182 |
+
)
|
183 |
+
|
184 |
+
print(type(new_node))
|
185 |
+
nodes_context.append(new_node)
|
186 |
+
else:
|
187 |
+
nodes_context.append(node)
|
188 |
+
print(type(node))
|
189 |
+
|
190 |
+
# llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=None)
|
191 |
+
llm = Gemini(model="models/gemini-1.5-pro", temperature=1, max_tokens=None)
|
192 |
+
# llm = OpenAI(temperature=1, model="gpt-3.5-turbo", max_tokens=None)
|
193 |
+
# llm = OpenAI(temperature=1, model="gpt-4o", max_tokens=None)
|
194 |
+
|
195 |
+
response_synthesizer = get_response_synthesizer(
|
196 |
+
llm=llm,
|
197 |
+
response_mode="simple_summarize",
|
198 |
+
text_qa_template=TEXT_QA_TEMPLATE,
|
199 |
+
streaming=True,
|
200 |
+
)
|
201 |
+
|
202 |
+
completion = response_synthesizer.synthesize(query, nodes=nodes_context)
|
203 |
|
204 |
for token in completion.response_gen:
|
205 |
history[-1][1] += token
|
|
|
209 |
|
210 |
|
211 |
example_questions = [
|
212 |
+
"how to fine-tune an llm?",
|
213 |
"What is a Large Language Model?",
|
214 |
"What is an embedding?",
|
215 |
]
|
|
|
217 |
|
218 |
with gr.Blocks(fill_height=True) as demo:
|
219 |
|
220 |
+
# agent_state = gr.State(initialize_agent())
|
221 |
+
agent_state = gr.State()
|
222 |
|
223 |
with gr.Row():
|
224 |
gr.HTML(
|
|
|
242 |
show_label=False,
|
243 |
)
|
244 |
submit = gr.Button(value="Send", variant="primary", scale=1)
|
245 |
+
# reset_button = gr.Button("Reset Chat", variant="secondary", scale=1)
|
246 |
+
|
247 |
+
# with gr.Row():
|
248 |
+
# examples = gr.Examples(
|
249 |
+
# examples=example_questions,
|
250 |
+
# inputs=question,
|
251 |
+
# )
|
252 |
+
# with gr.Row():
|
253 |
+
# email = gr.Textbox(
|
254 |
+
# label="Want to receive updates about our AI tutor?",
|
255 |
+
# placeholder="Enter your email here...",
|
256 |
+
# lines=1,
|
257 |
+
# scale=6,
|
258 |
+
# )
|
259 |
+
# submit_email = gr.Button(value="Submit", variant="secondary", scale=1)
|
|
|
|
|
|
|
|
|
260 |
|
261 |
completion = gr.State()
|
262 |
|
|
|
286 |
# save_completion, inputs=[completion, chatbot]
|
287 |
# )
|
288 |
|
289 |
+
# reset_button.click(
|
290 |
+
# reset_agent, inputs=[agent_state], outputs=[agent_state, chatbot]
|
291 |
+
# )
|
292 |
+
# submit_email.click(log_emails, email, email)
|
293 |
+
# email.submit(log_emails, email, email)
|
294 |
|
295 |
demo.queue(default_concurrency_limit=CONCURRENCY_COUNT)
|
296 |
demo.launch(debug=False, share=False)
|
scripts/tutor_prompts.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
from llama_index.core.llms import ChatMessage, MessageRole
|
2 |
from llama_index.core import ChatPromptTemplate
|
|
|
3 |
from pydantic import BaseModel, Field
|
4 |
|
5 |
default_user_prompt = (
|
@@ -12,34 +12,57 @@ default_user_prompt = (
|
|
12 |
)
|
13 |
|
14 |
system_prompt = (
|
15 |
-
"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine
|
16 |
-
"You are provided information found in the
|
17 |
-
"Only
|
18 |
-
"
|
19 |
-
"If the
|
20 |
-
"Here is the information you can use
|
21 |
"---------------------\n"
|
22 |
"{context_str}\n"
|
23 |
-
"---------------------\n"
|
24 |
"REMEMBER:\n"
|
25 |
-
"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
|
26 |
-
"You are provided information found in the
|
27 |
"Here are the rules you must follow:\n"
|
28 |
-
"* Only respond with information inside the
|
29 |
"* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. "
|
30 |
-
"*
|
31 |
-
"*
|
32 |
-
"* Do not refer to the json documentation directly, but use the instructions provided within it to answer questions. "
|
33 |
"* Do not reference any links, urls or hyperlinks in your answers.\n"
|
34 |
"* Make sure to format your answers in Markdown format, including code block and snippets.\n"
|
35 |
-
"* If you do not know the answer to a question, or if it is completely irrelevant to the AI courses, simply reply with:\n"
|
36 |
-
"'I'm sorry, but I couldn't find the information that answers you question. Is there anything else I can assist you with?'"
|
37 |
-
"For example:\n"
|
38 |
-
"What is the meaning of life for a qa bot?\n"
|
39 |
-
"I'm sorry, but I couldn't find the information that answers you question. Is there anything else I can assist you with?"
|
40 |
"Now answer the following question: \n"
|
41 |
)
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
chat_text_qa_msgs: list[ChatMessage] = [
|
44 |
ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
|
45 |
ChatMessage(
|
|
|
|
|
1 |
from llama_index.core import ChatPromptTemplate
|
2 |
+
from llama_index.core.llms import ChatMessage, MessageRole
|
3 |
from pydantic import BaseModel, Field
|
4 |
|
5 |
default_user_prompt = (
|
|
|
12 |
)
|
13 |
|
14 |
system_prompt = (
|
15 |
+
"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine-tuning models, giving 'memory' to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, Llama-Index, LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
|
16 |
+
"You are provided information found in Hugging Face's documentation and the RAG course. "
|
17 |
+
"Only some information might be relevant to the question, so ignore the irrelevant part and use the relevant part to answer the question."
|
18 |
+
"Only respond with information given to you documentation. DO NOT use additional information, even if you know the answer. "
|
19 |
+
"If the answer is somewhere in the documentation, answer the question (depending on the questions and the variety of relevant information in the documentation, give complete and helpful answers."
|
20 |
+
"Here is the information you can use, the order is not important: \n\n"
|
21 |
"---------------------\n"
|
22 |
"{context_str}\n"
|
23 |
+
"---------------------\n\n"
|
24 |
"REMEMBER:\n"
|
25 |
+
"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
|
26 |
+
"You are provided information found in Hugging Face's documentation and the RAG course. "
|
27 |
"Here are the rules you must follow:\n"
|
28 |
+
"* Only respond with information inside the documentation. DO NOT provide additional information, even if you know the answer. "
|
29 |
"* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. "
|
30 |
+
"* Only use information summarized from the documentation, do not respond otherwise. "
|
31 |
+
"* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
|
32 |
"* Do not reference any links, urls or hyperlinks in your answers.\n"
|
33 |
"* Make sure to format your answers in Markdown format, including code block and snippets.\n"
|
34 |
"Now answer the following question: \n"
|
35 |
)
|
36 |
|
37 |
+
# system_prompt = (
|
38 |
+
# "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
|
39 |
+
# "You are provided information found in Hugging Face's documentation and the RAG course. "
|
40 |
+
# "Only respond with information inside the documentation. DO NOT use additional information, even if you know the answer. "
|
41 |
+
# "If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the documentation, give complete and helpless answers."
|
42 |
+
# "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
|
43 |
+
# "Here is the information you can use in order: \n"
|
44 |
+
# "---------------------\n"
|
45 |
+
# "{context_str}\n"
|
46 |
+
# "---------------------\n"
|
47 |
+
# "REMEMBER:\n"
|
48 |
+
# "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
|
49 |
+
# "You are provided information found in Hugging Face's documentation and the RAG course. "
|
50 |
+
# "Here are the rules you must follow:\n"
|
51 |
+
# "* Only respond with information inside the documentation. DO NOT provide additional information, even if you know the answer. "
|
52 |
+
# "* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. "
|
53 |
+
# "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
|
54 |
+
# "* Only use information summarized from the documentation, do not respond otherwise. "
|
55 |
+
# "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
|
56 |
+
# "* Do not reference any links, urls or hyperlinks in your answers.\n"
|
57 |
+
# "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
|
58 |
+
# "* If you do not know the answer to a question, or if it is completely irrelevant to the AI courses, simply reply with:\n"
|
59 |
+
# "'I'm sorry, but I couldn't find information that answers you question. Is there anything else I can assist you with?'"
|
60 |
+
# "For example:\n"
|
61 |
+
# "What is the meaning of life for a qa bot?\n"
|
62 |
+
# "I'm sorry, but I couldn't find information that answers you question. Is there anything else I can assist you with?"
|
63 |
+
# "Now answer the following question: \n"
|
64 |
+
# )
|
65 |
+
|
66 |
chat_text_qa_msgs: list[ChatMessage] = [
|
67 |
ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
|
68 |
ChatMessage(
|