{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "view-in-github"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/Web_Search_API.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JboB5VaCJUrb",
        "outputId": "b7221d06-8783-4586-f98a-72af45cae54f"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.1/211.1 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.3/81.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.6/97.6 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m24.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Building wheel for tinysegmenter (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Building wheel for feedfinder2 (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Building wheel for jieba3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Building wheel for sgmllib3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ]
        }
      ],
      "source": [
        "!pip install -q llama-index==0.10.57 openai==1.37.0 tiktoken==0.7.0 llama-index-tools-google==0.1.3 newspaper3k==0.2.8"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "1NKAn5scN_g9"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "# Set the following API Keys in the Python environment. Will be used later.\n",
        "os.environ[\"OPENAI_API_KEY\"] = \"[OPENAI_API_KEY]\"\n",
        "GOOGLE_SEARCH_KEY = \"[GOOGLE_SEARCH_KEY]\"\n",
        "GOOGLE_SEARCH_ENGINE = \"[GOOGLE_SEARCH_ENGINE]\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ex1gQVHvITMI"
      },
      "source": [
        "# Using Agents/Tools\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0LMypoqUyuXq"
      },
      "source": [
        "## Define Google Search Tool\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4Q7sc69nJvWI"
      },
      "outputs": [],
      "source": [
        "from llama_index.tools.google import GoogleSearchToolSpec\n",
        "\n",
        "tool_spec = GoogleSearchToolSpec(key=GOOGLE_SEARCH_KEY, engine=GOOGLE_SEARCH_ENGINE)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VrbuIOaMeOIf"
      },
      "outputs": [],
      "source": [
        "# Import and initialize our tool spec\n",
        "from llama_index.core.tools.tool_spec.load_and_search import LoadAndSearchToolSpec\n",
        "\n",
        "# Wrap the google search tool to create an index on top of the returned Google search\n",
        "wrapped_tool = LoadAndSearchToolSpec.from_defaults(\n",
        "    tool_spec.to_tool_list()[0],\n",
        ").to_tool_list()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T3ENpLyBy7UL"
      },
      "source": [
        "## Create the Agent\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-_Ab47ppK8b2"
      },
      "outputs": [],
      "source": [
        "from llama_index.agent.openai import OpenAIAgent\n",
        "\n",
        "agent = OpenAIAgent.from_tools(wrapped_tool, verbose=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YcUyz1-FlCQ8"
      },
      "outputs": [],
      "source": [
        "res = agent.chat(\"How many parameters LLaMA2 model has?\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "w4wK5sY-lOOv",
        "outputId": "8090a106-6fac-4514-fdbd-c72a01b28169"
      },
      "outputs": [
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'The LLaMA2 model has parameters available in three different sizes: 7 billion, 13 billion, and 70 billion.'"
            ]
          },
          "execution_count": 72,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "res.response"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TM_cvBA1nTJM",
        "outputId": "0bf3533a-c62d-4d0d-bd76-76c043477042"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "[ToolOutput(content='Content loaded! You can now search the information using read_google_search', tool_name='google_search', raw_input={'args': (), 'kwargs': {'query': 'parameters of LLaMA2 model'}}, raw_output='Content loaded! You can now search the information using read_google_search', is_error=False),\n",
              " ToolOutput(content='Answer: The parameters of the LLaMA2 model are available in three different sizes: 7 billion, 13 billion, and 70 billion.', tool_name='read_google_search', raw_input={'args': (), 'kwargs': {'query': 'parameters of LLaMA2 model'}}, raw_output='Answer: The parameters of the LLaMA2 model are available in three different sizes: 7 billion, 13 billion, and 70 billion.', is_error=False)]"
            ]
          },
          "execution_count": 73,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "res.sources"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "who-NM4pIhPn"
      },
      "source": [
        "# Using Tools w/ VectorStoreIndex\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9g9cTM9GI-19"
      },
      "source": [
        "A limitation of the current agent/tool in LlamaIndex is that it **relies solely on the page description from the retrieved pages** to answer questions. This approach will miss answers that are not visible in the page's description tag. To address this, a possible workaround is to fetch the page results, extract the page content using the newspaper3k library, and then create an index based on the downloaded content. Also, the previous method stacks all retrieved items from the search engine into a single document, making it **difficult to pinpoint the exact source** of the response. However, the following method will enable us to present the sources easily.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "31G_fxxJIsbC"
      },
      "source": [
        "## Define Google Search Tool\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lwRmj2odIHxt"
      },
      "outputs": [],
      "source": [
        "from llama_index.tools.google import GoogleSearchToolSpec\n",
        "\n",
        "tool_spec = GoogleSearchToolSpec(key=GOOGLE_SEARCH_KEY, engine=GOOGLE_SEARCH_ENGINE)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UVIxdj04Bsf2"
      },
      "outputs": [],
      "source": [
        "search_results = tool_spec.google_search(\"LLaMA2 model details\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "AlYDNfg2BsdQ"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "\n",
        "search_results = json.loads(search_results[0].text)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pHALd3uhIxtQ"
      },
      "source": [
        "## Read Each URL Contents\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jXz3JFduBsaq",
        "outputId": "1b795423-26a6-4a61-a878-cca5e27dd5d1"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "8\n"
          ]
        }
      ],
      "source": [
        "import newspaper\n",
        "\n",
        "pages_content = []\n",
        "\n",
        "for item in search_results[\"items\"]:\n",
        "\n",
        "    try:\n",
        "        article = newspaper.Article(item[\"link\"])\n",
        "        article.download()\n",
        "        article.parse()\n",
        "        if len(article.text) > 0:\n",
        "            pages_content.append(\n",
        "                {\"url\": item[\"link\"], \"text\": article.text, \"title\": item[\"title\"]}\n",
        "            )\n",
        "    except:\n",
        "        continue\n",
        "\n",
        "print(len(pages_content))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iqxa_qRVI3G0"
      },
      "source": [
        "## Create the Index\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "O4PkK8DuBsZT"
      },
      "outputs": [],
      "source": [
        "from llama_index.core import Document\n",
        "\n",
        "# Convert the texts to Document objects so the LlamaIndex framework can process them.\n",
        "documents = [\n",
        "    Document(text=row[\"text\"], metadata={\"title\": row[\"title\"], \"url\": row[\"url\"]})\n",
        "    for row in pages_content\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2RtMBWpgBsWX"
      },
      "outputs": [],
      "source": [
        "from llama_index.core import VectorStoreIndex\n",
        "from llama_index.core.node_parser import SentenceSplitter\n",
        "\n",
        "# Build index / generate embeddings using OpenAI.\n",
        "index = VectorStoreIndex.from_documents(\n",
        "    documents,\n",
        "    transformations=[SentenceSplitter(chunk_size=512, chunk_overlap=64)],\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xV_ibEZ_BsM4"
      },
      "outputs": [],
      "source": [
        "# Define a query engine that is responsible for retrieving related pieces of text,\n",
        "# and using a LLM to formulate the final answer.\n",
        "query_engine = index.as_query_engine()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nziwu27MI6ih"
      },
      "source": [
        "## Query\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5K1h2_t-HNPe",
        "outputId": "58ce5d66-eddc-43fe-e7c8-d78bc0cb8c32"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "LLaMA2 model has sizes ranging from 7 to 70 billion parameters.\n"
          ]
        }
      ],
      "source": [
        "response = query_engine.query(\"How many parameters LLaMA2 model has?\")\n",
        "print(response)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Xea7ZeidH27i",
        "outputId": "d455c379-9c91-4c9e-e9c1-6bd2deb7342e"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "The LLaMA2 model comes in several sizes with different numbers of parameters:\n",
            "- LLaMA2 7B\n",
            "- LLaMA2 13B\n",
            "- LLaMA2 33B\n",
            "- LLaMA2 65B\n"
          ]
        }
      ],
      "source": [
        "response = query_engine.query(\"How many parameters LLaMA2 model has? list exact sizes.\")\n",
        "print(response)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4QpGPD5nHORP",
        "outputId": "8f9fc185-7745-4357-8471-25d34726cdd8"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Title\t Introducing LLaMA: A foundational, 65-billion-parameter language ...\n",
            "Source\t https://ai.meta.com/blog/large-language-model-llama-meta-ai/\n",
            "Score\t 0.8124383491026671\n",
            "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
            "Title\t Llama 2 follow-up: too much RLHF, GPU sizing, technical details\n",
            "Source\t https://www.interconnects.ai/p/llama-2-part-2\n",
            "Score\t 0.8046542892214631\n",
            "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
          ]
        }
      ],
      "source": [
        "# Show the retrieved nodes\n",
        "for src in response.source_nodes:\n",
        "    print(\"Title\\t\", src.metadata[\"title\"])\n",
        "    print(\"Source\\t\", src.metadata[\"url\"])\n",
        "    print(\"Score\\t\", src.score)\n",
        "    print(\"-_\" * 20)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "B5b4nZ-qHpdP"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "authorship_tag": "ABX9TyNH2OsWaT8fcT3tgDhO3NQn",
      "include_colab_link": true,
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}