Created using Colab
Browse files- notebooks/Crawl_a_Website.ipynb +586 -0
notebooks/Crawl_a_Website.ipynb
ADDED
@@ -0,0 +1,586 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"nbformat": 4,
|
3 |
+
"nbformat_minor": 0,
|
4 |
+
"metadata": {
|
5 |
+
"colab": {
|
6 |
+
"provenance": [],
|
7 |
+
"toc_visible": true,
|
8 |
+
"authorship_tag": "ABX9TyNGAAT9u3Fj2YsNtlfp5oyc",
|
9 |
+
"include_colab_link": true
|
10 |
+
},
|
11 |
+
"kernelspec": {
|
12 |
+
"name": "python3",
|
13 |
+
"display_name": "Python 3"
|
14 |
+
},
|
15 |
+
"language_info": {
|
16 |
+
"name": "python"
|
17 |
+
}
|
18 |
+
},
|
19 |
+
"cells": [
|
20 |
+
{
|
21 |
+
"cell_type": "markdown",
|
22 |
+
"metadata": {
|
23 |
+
"id": "view-in-github",
|
24 |
+
"colab_type": "text"
|
25 |
+
},
|
26 |
+
"source": [
|
27 |
+
"<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/Crawl_a_Website.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
28 |
+
]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"cell_type": "code",
|
32 |
+
"source": [
|
33 |
+
"!pip install -q llama-index==0.10.30 openai==1.12.0 cohere==4.47 tiktoken==0.6.0 newspaper3k==0.2.8"
|
34 |
+
],
|
35 |
+
"metadata": {
|
36 |
+
"id": "4CW8ux1RSdem",
|
37 |
+
"colab": {
|
38 |
+
"base_uri": "https://localhost:8080/"
|
39 |
+
},
|
40 |
+
"outputId": "155feab4-8ae6-43da-a07f-8a1f4b677c2b"
|
41 |
+
},
|
42 |
+
"execution_count": 35,
|
43 |
+
"outputs": [
|
44 |
+
{
|
45 |
+
"output_type": "stream",
|
46 |
+
"name": "stdout",
|
47 |
+
"text": [
|
48 |
+
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m211.1/211.1 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
49 |
+
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m81.3/81.3 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
50 |
+
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m97.6/97.6 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
51 |
+
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
52 |
+
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m43.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
53 |
+
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
54 |
+
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
55 |
+
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
56 |
+
" Building wheel for tinysegmenter (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
57 |
+
" Building wheel for feedfinder2 (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
58 |
+
" Building wheel for jieba3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
59 |
+
" Building wheel for sgmllib3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
|
60 |
+
]
|
61 |
+
}
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "code",
|
66 |
+
"source": [
|
67 |
+
"import os\n",
|
68 |
+
"\n",
|
69 |
+
"# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
|
70 |
+
"os.environ[\"OPENAI_API_KEY\"] = \"[OPENAI_API_KEY]\"\n",
|
71 |
+
"USESCRAPER_API_KEY = \"[USESCRAPER_API_KEY]\""
|
72 |
+
],
|
73 |
+
"metadata": {
|
74 |
+
"id": "wxDPsVXSAj6_"
|
75 |
+
},
|
76 |
+
"execution_count": 2,
|
77 |
+
"outputs": []
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"cell_type": "markdown",
|
81 |
+
"source": [
|
82 |
+
"There are two primary methods for extracting webpage content. The first method involves having a list of URLs; one can iterate through this list to retrieve the content of each page. The second method, web crawling, requires using a script or service to extract page URLs from a sitemap or manually following links on the page to access all the content. Initially, we will explore web scraping techniques before discussing how to use a service like usescraper.com to perform web crawling."
|
83 |
+
],
|
84 |
+
"metadata": {
|
85 |
+
"id": "VSc7-1mljmrp"
|
86 |
+
}
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "markdown",
|
90 |
+
"source": [
|
91 |
+
"# 1. Scraping using `newspaper` Library"
|
92 |
+
],
|
93 |
+
"metadata": {
|
94 |
+
"id": "D3r2tYHgeIK9"
|
95 |
+
}
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "markdown",
|
99 |
+
"source": [
|
100 |
+
"## Define URLs"
|
101 |
+
],
|
102 |
+
"metadata": {
|
103 |
+
"id": "it43ZQf8jatw"
|
104 |
+
}
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"cell_type": "code",
|
108 |
+
"source": [
|
109 |
+
"urls = [\n",
|
110 |
+
" \"https://docs.llamaindex.ai/en/stable/understanding\",\n",
|
111 |
+
" \"https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/\",\n",
|
112 |
+
" \"https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/\",\n",
|
113 |
+
" \"https://docs.llamaindex.ai/en/stable/understanding/querying/querying/\"\n",
|
114 |
+
"]"
|
115 |
+
],
|
116 |
+
"metadata": {
|
117 |
+
"id": "x74PqfQ7eIzD"
|
118 |
+
},
|
119 |
+
"execution_count": 52,
|
120 |
+
"outputs": []
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"cell_type": "markdown",
|
124 |
+
"source": [
|
125 |
+
"## Get Page Contents"
|
126 |
+
],
|
127 |
+
"metadata": {
|
128 |
+
"id": "tgxfpfSsjcMC"
|
129 |
+
}
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "code",
|
133 |
+
"source": [
|
134 |
+
"import newspaper\n",
|
135 |
+
"\n",
|
136 |
+
"pages_content = []\n",
|
137 |
+
"\n",
|
138 |
+
"# Retrieve the Content\n",
|
139 |
+
"for url in urls:\n",
|
140 |
+
"\ttry:\n",
|
141 |
+
"\t\tarticle = newspaper.Article( url )\n",
|
142 |
+
"\t\tarticle.download()\n",
|
143 |
+
"\t\tarticle.parse()\n",
|
144 |
+
"\t\tif len(article.text) > 0:\n",
|
145 |
+
"\t\t\tpages_content.append({ \"url\": url, \"title\": article.title, \"text\": article.text })\n",
|
146 |
+
"\texcept:\n",
|
147 |
+
"\t\tcontinue"
|
148 |
+
],
|
149 |
+
"metadata": {
|
150 |
+
"id": "Q6Xs1OhUfVQV"
|
151 |
+
},
|
152 |
+
"execution_count": 56,
|
153 |
+
"outputs": []
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"cell_type": "code",
|
157 |
+
"source": [
|
158 |
+
"pages_content[0]"
|
159 |
+
],
|
160 |
+
"metadata": {
|
161 |
+
"colab": {
|
162 |
+
"base_uri": "https://localhost:8080/"
|
163 |
+
},
|
164 |
+
"id": "3cNdJNi2g1ly",
|
165 |
+
"outputId": "f5184c15-6b55-47ee-98ee-646a06290a4c"
|
166 |
+
},
|
167 |
+
"execution_count": 57,
|
168 |
+
"outputs": [
|
169 |
+
{
|
170 |
+
"output_type": "execute_result",
|
171 |
+
"data": {
|
172 |
+
"text/plain": [
|
173 |
+
"{'url': 'https://docs.llamaindex.ai/en/stable/understanding',\n",
|
174 |
+
" 'title': 'Building an LLM Application',\n",
|
175 |
+
" 'text': \"Building an LLM application#\\n\\nWelcome to the beginning of Understanding LlamaIndex. This is a series of short, bite-sized tutorials on every stage of building an LLM application to get you acquainted with how to use LlamaIndex before diving into more advanced and subtle strategies. If you're an experienced programmer new to LlamaIndex, this is the place to start.\\n\\nKey steps in building an LLM application#\\n\\nTip If you've already read our high-level concepts page you'll recognize several of these steps.\\n\\nThere are a series of key steps involved in building any LLM-powered application, whether it's answering questions about your data, creating a chatbot, or an autonomous agent. Throughout our documentation, you'll notice sections are arranged roughly in the order you'll perform these steps while building your app. You'll learn about:\\n\\nUsing LLMs : whether it's OpenAI or any number of hosted LLMs or a locally-run model of your own, LLMs are used at every step of the way, from indexing and storing to querying and parsing your data. LlamaIndex comes with a huge number of reliable, tested prompts and we'll also show you how to customize your own.\\n\\nLoading : getting your data from wherever it lives, whether that's unstructured text, PDFs, databases, or APIs to other applications. LlamaIndex has hundreds of connectors to every data source over at LlamaHub.\\n\\nIndexing : once you've got your data there are an infinite number of ways to structure access to that data to ensure your applications is always working with the most relevant data. LlamaIndex has a huge number of these strategies built-in and can help you select the best ones.\\n\\nStoring : you will probably find it more efficient to store your data in indexed form, or pre-processed summaries provided by an LLM, often in a specialized database known as a Vector Store (see below). 
You can also store your indexes, metadata and more.\\n\\nQuerying : every indexing strategy has a corresponding querying strategy and there are lots of ways to improve the relevance, speed and accuracy of what you retrieve and what the LLM does with it before returning it to you, including turning it into structured responses such as an API.\\n\\nPutting it all together : whether you are building question & answering, chatbots, an API, or an autonomous agent, we show you how to get your application into production.\\n\\nTracing and debugging : also called observability , it's especially important with LLM applications to be able to look into the inner workings of what's going on to help you debug problems and spot places to improve.\\n\\nEvaluating: every strategy has pros and cons and a key part of building, shipping and evolving your application is evaluating whether your change has improved your application in terms of accuracy, performance, clarity, cost and more. Reliably evaluating your changes is a crucial part of LLM application development.\\n\\nReady to dive in? Head to using LLMs.\"}"
|
176 |
+
]
|
177 |
+
},
|
178 |
+
"metadata": {},
|
179 |
+
"execution_count": 57
|
180 |
+
}
|
181 |
+
]
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"cell_type": "code",
|
185 |
+
"source": [
|
186 |
+
"len( pages_content )"
|
187 |
+
],
|
188 |
+
"metadata": {
|
189 |
+
"colab": {
|
190 |
+
"base_uri": "https://localhost:8080/"
|
191 |
+
},
|
192 |
+
"id": "WleP60A3gkQM",
|
193 |
+
"outputId": "8c79ab53-e47b-4227-eb6f-0286b8ba2d15"
|
194 |
+
},
|
195 |
+
"execution_count": 38,
|
196 |
+
"outputs": [
|
197 |
+
{
|
198 |
+
"output_type": "execute_result",
|
199 |
+
"data": {
|
200 |
+
"text/plain": [
|
201 |
+
"5"
|
202 |
+
]
|
203 |
+
},
|
204 |
+
"metadata": {},
|
205 |
+
"execution_count": 38
|
206 |
+
}
|
207 |
+
]
|
208 |
+
},
|
209 |
+
{
|
210 |
+
"cell_type": "markdown",
|
211 |
+
"source": [
|
212 |
+
"## Convert to Document"
|
213 |
+
],
|
214 |
+
"metadata": {
|
215 |
+
"id": "i5mCiRfGjfNx"
|
216 |
+
}
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"cell_type": "code",
|
220 |
+
"source": [
|
221 |
+
"from llama_index.core.schema import Document\n",
|
222 |
+
"\n",
|
223 |
+
"# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
|
224 |
+
"documents = [Document(text=row['text'], metadata={\"title\": row['title'], \"url\": row['url']}) for row in pages_content]"
|
225 |
+
],
|
226 |
+
"metadata": {
|
227 |
+
"id": "TOJ3K-CBfVDR"
|
228 |
+
},
|
229 |
+
"execution_count": 58,
|
230 |
+
"outputs": []
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"cell_type": "markdown",
|
234 |
+
"source": [
|
235 |
+
"# 2. Submit the Crawler Job"
|
236 |
+
],
|
237 |
+
"metadata": {
|
238 |
+
"id": "CkjEyEmkJevT"
|
239 |
+
}
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": null,
|
244 |
+
"metadata": {
|
245 |
+
"colab": {
|
246 |
+
"base_uri": "https://localhost:8080/"
|
247 |
+
},
|
248 |
+
"id": "tYpchBo5-brp",
|
249 |
+
"outputId": "927f84c5-c13a-408c-8802-df90bc05c733"
|
250 |
+
},
|
251 |
+
"outputs": [
|
252 |
+
{
|
253 |
+
"output_type": "stream",
|
254 |
+
"name": "stdout",
|
255 |
+
"text": [
|
256 |
+
"{'org': '581', 'id': '7YE3T8VSPJVSCYE6EDQ90DJNFT', 'urls': ['https://docs.llamaindex.ai/en/stable/understanding/'], 'exclude_globs': [], 'exclude_elements': 'nav, header, footer, script, style, noscript, svg, [role=\"alert\"], [role=\"banner\"], [role=\"dialog\"], [role=\"alertdialog\"], [role=\"region\"][aria-label*=\"skip\" i], [aria-modal=\"true\"]', 'output_format': 'markdown', 'output_expiry': 604800, 'min_length': 50, 'page_limit': 10000, 'force_crawling_mode': 'link', 'block_resources': True, 'include_linked_files': False, 'createdAt': 1713883978029, 'status': 'starting', 'use_browser': True, 'sitemapPageCount': 0, 'notices': []}\n"
|
257 |
+
]
|
258 |
+
}
|
259 |
+
],
|
260 |
+
"source": [
|
261 |
+
"import requests\n",
|
262 |
+
"import json\n",
|
263 |
+
"\n",
|
264 |
+
"payload = {\n",
|
265 |
+
" \"urls\": [\"https://docs.llamaindex.ai/en/stable/understanding/\"], # list of urls to crawl\n",
|
266 |
+
" \"output_format\": \"markdown\", # text, html, markdown\n",
|
267 |
+
" \"output_expiry\": 604800, # Automatically delete after X seconds\n",
|
268 |
+
" \"min_length\": 50, # Skip pages with less than X characters\n",
|
269 |
+
" \"page_limit\": 10000, # Maximum number of pages to crawl\n",
|
270 |
+
" \"force_crawling_mode\": \"link\", # \"link\" follows links in the page reccursively, or \"sitemap\" to find pages from website's sitemap\n",
|
271 |
+
" \"block_resources\": True, # skip loading images, stylesheets, or scripts\n",
|
272 |
+
" \"include_linked_files\": False # include files (PDF, text, ...) in output\n",
|
273 |
+
"}\n",
|
274 |
+
"headers = {\n",
|
275 |
+
" \"Authorization\": \"Bearer \" + USESCRAPER_API_KEY,\n",
|
276 |
+
" \"Content-Type\": \"application/json\"\n",
|
277 |
+
"}\n",
|
278 |
+
"\n",
|
279 |
+
"response = requests.request(\"POST\", \"https://api.usescraper.com/crawler/jobs\", json=payload, headers=headers)\n",
|
280 |
+
"\n",
|
281 |
+
"response = json.loads( response.text )\n",
|
282 |
+
"\n",
|
283 |
+
"print(response)"
|
284 |
+
]
|
285 |
+
},
|
286 |
+
{
|
287 |
+
"cell_type": "markdown",
|
288 |
+
"source": [
|
289 |
+
"## Get the Status"
|
290 |
+
],
|
291 |
+
"metadata": {
|
292 |
+
"id": "nx_4MjHxJgxh"
|
293 |
+
}
|
294 |
+
},
|
295 |
+
{
|
296 |
+
"cell_type": "code",
|
297 |
+
"source": [
|
298 |
+
"url = \"https://api.usescraper.com/crawler/jobs/{}\".format(response['id'])\n",
|
299 |
+
"\n",
|
300 |
+
"status_res = requests.request(\"GET\", url, headers=headers)\n",
|
301 |
+
"\n",
|
302 |
+
"status_res = json.loads( status_res.text )\n",
|
303 |
+
"\n",
|
304 |
+
"print( status_res['status'] )\n",
|
305 |
+
"print( status_res['progress'] )"
|
306 |
+
],
|
307 |
+
"metadata": {
|
308 |
+
"colab": {
|
309 |
+
"base_uri": "https://localhost:8080/"
|
310 |
+
},
|
311 |
+
"id": "ZLJ0BUR8c1a8",
|
312 |
+
"outputId": "cfd3aee9-68bf-4171-9340-abe2d03fa5ac"
|
313 |
+
},
|
314 |
+
"execution_count": null,
|
315 |
+
"outputs": [
|
316 |
+
{
|
317 |
+
"output_type": "stream",
|
318 |
+
"name": "stdout",
|
319 |
+
"text": [
|
320 |
+
"running\n",
|
321 |
+
"{'scraped': 9, 'discarded': 0, 'failed': 0}\n"
|
322 |
+
]
|
323 |
+
}
|
324 |
+
]
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"cell_type": "markdown",
|
328 |
+
"source": [
|
329 |
+
"## Get the Data"
|
330 |
+
],
|
331 |
+
"metadata": {
|
332 |
+
"id": "vHcRJIDsJh2i"
|
333 |
+
}
|
334 |
+
},
|
335 |
+
{
|
336 |
+
"cell_type": "code",
|
337 |
+
"source": [
|
338 |
+
"url = \"https://api.usescraper.com/crawler/jobs/{}/data\".format(\"7YE3T8VSPJVSCYE6EDQ90DJNFT\")#response['id'])\n",
|
339 |
+
"\n",
|
340 |
+
"data_res = requests.request(\"GET\", url, headers=headers)\n",
|
341 |
+
"\n",
|
342 |
+
"data_res = json.loads( data_res.text )\n",
|
343 |
+
"\n",
|
344 |
+
"print( data_res )"
|
345 |
+
],
|
346 |
+
"metadata": {
|
347 |
+
"colab": {
|
348 |
+
"base_uri": "https://localhost:8080/"
|
349 |
+
},
|
350 |
+
"id": "J4dUn4cmGGab",
|
351 |
+
"outputId": "15717b0d-dac6-4a67-e13f-1330623d4ced"
|
352 |
+
},
|
353 |
+
"execution_count": 10,
|
354 |
+
"outputs": [
|
355 |
+
{
|
356 |
+
"output_type": "stream",
|
357 |
+
"name": "stdout",
|
358 |
+
"text": [
|
359 |
+
"{'data': [{'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/graphs/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/graphs/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'Knowledge Graphs - LlamaIndex'}}, 'text': ' \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/graphs/#knowledge-graphs)\\n#Knowledge Graphs[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/graphs/#knowledge-graphs)\\nLlamaIndex contains some fantastic guides for building with knowledge graphs.\\n\\nCheck out the end-to-end tutorials/workshops below. Also check out our [knowledge graph query engine guides](https://docs.llamaindex.ai/en/stable/module_guides/deploying/query_engine/modules/).\\n\\n- LlamaIndex Workshop: Building RAG with Knowledge Graphs [https://colab.research.google.com/drive/1tLjOg2ZQuIClfuWrAC2LdiZHCov8oUbs](https://colab.research.google.com/drive/1tLjOg2ZQuIClfuWrAC2LdiZHCov8oUbs)\\n- REBEL + Knowledge Graph Index [https://colab.research.google.com/drive/1G6pcR0pXvSkdMQlAK_P-IrYgo-_staxd?usp=sharing](https://colab.research.google.com/drive/1G6pcR0pXvSkdMQlAK_P-IrYgo-_staxd?usp=sharing)\\nπ¦\\n\\nCTRL + K'}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'Using LLMs - LlamaIndex'}}, 'text': ' \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/#using-llms)\\n#Using LLMs[#](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/#using-llms)\\nTip\\n\\nFor a list of our supported LLMs and a comparison of their functionality, check out our [LLM module guide](https://docs.llamaindex.ai/en/stable/module_guides/models/llms/).\\n\\nOne of the first steps when building an LLM-based 
application is which LLM to use; you can also use more than one if you wish.\\n\\nLLMs are used at multiple different stages of your pipeline:\\n\\n- During Indexing you may use an LLM to determine the relevance of data (whether to index it at all) or you may use an LLM to summarize the raw data and index the summaries instead.\\n- During Querying LLMs can be used in two ways:\\n- During Retrieval (fetching data from your index) LLMs can be given an array of options (such as multiple different indices) and make decisions about where best to find the information you\\'re looking for. An agentic LLM can also use tools at this stage to query different data sources.\\n- During Response Synthesis (turning the retrieved data into an answer) an LLM can combine answers to multiple sub-queries into a single coherent answer, or it can transform data, such as from unstructured text to JSON or another programmatic output format.\\nLlamaIndex provides a single interface to a large number of different LLMs, allowing you to pass in any LLM you choose to any stage of the pipeline. It can be as simple as this:\\n\\nfrom llama_index.llms.openai import OpenAI\\n\\nresponse = OpenAI().complete(\"Paul Graham is \")\\nprint(response)\\nUsually, you will instantiate an LLM and pass it to Settings, which you then pass to other stages of the pipeline, as in this example:\\n\\nfrom llama_index.llms.openai import OpenAI\\nfrom llama_index.core import Settings\\nfrom llama_index.core import VectorStoreIndex, SimpleDirectoryReader\\n\\nSettings.llm = OpenAI(temperature=0.2, model=\"gpt-4\")\\n\\ndocuments = SimpleDirectoryReader(\"data\").load_data()\\nindex = VectorStoreIndex.from_documents(\\n documents,\\n)\\nIn this case, you\\'ve instantiated OpenAI and customized it to use the gpt-4 model instead of the default gpt-3.5-turbo, and also modified the temperature. 
The VectorStoreIndex will now use gpt-4 to answer questions when querying.\\n\\nTip\\n\\nThe Settings is a bundle of configuration data that you pass into different parts of LlamaIndex. You can [learn more about Settings](https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/) and how to customize it.\\n\\n##Available LLMs[#](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/#available-llms)\\nWe support integrations with OpenAI, Hugging Face, PaLM, and more. Check out our [module guide to LLMs](https://docs.llamaindex.ai/en/stable/module_guides/models/llms/) for a full list, including how to run a local model.\\n\\nTip\\n\\nA general note on privacy and LLMs can be found on the [privacy page](https://docs.llamaindex.ai/en/stable/understanding/using_llms/privacy/).\\n\\n###Using a local LLM[#](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/#using-a-local-llm)\\nLlamaIndex doesn\\'t just support hosted LLM APIs; you can also [run a local model such as Llama2 locally](https://replicate.com/blog/run-llama-locally).\\n\\nFor example, if you have [Ollama](https://github.com/ollama/ollama) installed and running:\\n\\nfrom llama_index.llms.ollama import Ollama\\nfrom llama_index.core import Settings\\n\\nSettings.llm = Ollama(model=\"llama2\", request_timeout=60.0)\\nSee the [custom LLM\\'s How-To](https://docs.llamaindex.ai/en/stable/module_guides/models/llms/usage_custom/) for more details.\\n\\n##Prompts[#](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/#prompts)\\nBy default LlamaIndex comes with a great set of built-in, battle-tested prompts that handle the tricky work of getting a specific LLM to correctly handle and format data. This is one of the biggest benefits of using LlamaIndex. 
If you want to, you can [customize the prompts](https://docs.llamaindex.ai/en/stable/module_guides/models/prompts/)\\n\\nπ¦\\n\\nCTRL + K'}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'Indexing & Embedding - LlamaIndex'}}, 'text': ' \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#indexing)\\n#Indexing[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#indexing)\\nWith your data loaded, you now have a list of Document objects (or a list of Nodes). It\\'s time to build an Index over these objects so you can start querying them.\\n\\n##What is an Index?[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#what-is-an-index)\\nIn LlamaIndex terms, an Index is a data structure composed of Document objects, designed to enable querying by an LLM. Your Index is designed to be complementary to your querying strategy.\\n\\nLlamaIndex offers several different index types. We\\'ll cover the two most common here.\\n\\n##Vector Store Index[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#vector-store-index)\\nA VectorStoreIndex is by far the most frequent type of Index you\\'ll encounter. The Vector Store Index takes your Documents and splits them up into Nodes. It then creates vector embeddings of the text of every node, ready to be queried by an LLM.\\n\\n###What is an embedding?[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#what-is-an-embedding)\\nVector embeddings are central to how LLM applications function.\\n\\nA vector embedding, often just called an embedding, is a numerical representation of the semantics, or meaning of your text. 
Two pieces of text with similar meanings will have mathematically similar embeddings, even if the actual text is quite different.\\n\\nThis mathematical relationship enables semantic search, where a user provides query terms and LlamaIndex can locate text that is related to the meaning of the query terms rather than simple keyword matching. This is a big part of how Retrieval-Augmented Generation works, and how LLMs function in general.\\n\\nThere are [many types of embeddings](https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings/), and they vary in efficiency, effectiveness and computational cost. By default LlamaIndex uses text-embedding-ada-002, which is the default embedding used by OpenAI. If you are using different LLMs you will often want to use different embeddings.\\n\\n###Vector Store Index embeds your documents[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#vector-store-index-embeds-your-documents)\\nVector Store Index turns all of your text into embeddings using an API from your LLM; this is what is meant when we say it \"embeds your text\". If you have a lot of text, generating embeddings can take a long time since it involves many round-trip API calls.\\n\\nWhen you want to search your embeddings, your query is itself turned into a vector embedding, and then a mathematical operation is carried out by VectorStoreIndex to rank all the embeddings by how semantically similar they are to your query.\\n\\n###Top K Retrieval[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#top-k-retrieval)\\nOnce the ranking is complete, VectorStoreIndex returns the most-similar embeddings as their corresponding chunks of text. The number of embeddings it returns is known as k, so the parameter controlling how many embeddings to return is known as top_k. 
This whole type of search is often referred to as \"top-k semantic retrieval\" for this reason.\\n\\nTop-k retrieval is the simplest form of querying a vector index; you will learn about more complex and subtler strategies when you read the [querying](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/) section.\\n\\n###Using Vector Store Index[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#using-vector-store-index)\\nTo use the Vector Store Index, pass it the list of Documents you created during the loading stage:\\n\\nfrom llama_index.core import VectorStoreIndex\\n\\nindex = VectorStoreIndex.from_documents(documents)\\nTip\\n\\nfrom_documents also takes an optional argument show_progress. Set it to True to display a progress bar during index construction.\\n\\nYou can also choose to build an index over a list of Node objects directly:\\n\\nfrom llama_index.core import VectorStoreIndex\\n\\nindex = VectorStoreIndex(nodes)\\nWith your text indexed, it is now technically ready for [querying](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/)! However, embedding all your text can be time-consuming and, if you are using a hosted LLM, it can also be expensive. To save time and money you will want to [store your embeddings](https://docs.llamaindex.ai/en/stable/understanding/storing/storing/) first.\\n\\n##Summary Index[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#summary-index)\\nA Summary Index is a simpler form of Index best suited to queries where, as the name suggests, you are trying to generate a summary of the text in your Documents. 
It simply stores all of the Documents and returns all of them to your query engine.\\n\\n##Further Reading[#](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/#further-reading)\\nIf your data is a set of interconnected concepts (in computer science terms, a \"graph\") then you may be interested in our [knowledge graph index](https://docs.llamaindex.ai/en/stable/examples/index_structs/knowledge_graph/KnowledgeGraphDemo/).\\n\\nπ¦\\n\\nCTRL + K'}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'Building an LLM Application - LlamaIndex'}}, 'text': \" \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/#building-an-llm-application)\\n#Building an LLM application[#](https://docs.llamaindex.ai/en/stable/understanding/#building-an-llm-application)\\nWelcome to the beginning of Understanding LlamaIndex. This is a series of short, bite-sized tutorials on every stage of building an LLM application to get you acquainted with how to use LlamaIndex before diving into more advanced and subtle strategies. If you're an experienced programmer new to LlamaIndex, this is the place to start.\\n\\n##Key steps in building an LLM application[#](https://docs.llamaindex.ai/en/stable/understanding/#key-steps-in-building-an-llm-application)\\nTip\\n\\nIf you've already read our [high-level concepts](https://docs.llamaindex.ai/en/stable/getting_started/concepts/) page you'll recognize several of these steps.\\n\\nThere are a series of key steps involved in building any LLM-powered application, whether it's answering questions about your data, creating a chatbot, or an autonomous agent. Throughout our documentation, you'll notice sections are arranged roughly in the order you'll perform these steps while building your app. 
You'll learn about:\\n\\n-\\n[Using LLMs](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/): whether it's OpenAI or any number of hosted LLMs or a locally-run model of your own, LLMs are used at every step of the way, from indexing and storing to querying and parsing your data. LlamaIndex comes with a huge number of reliable, tested prompts and we'll also show you how to customize your own.\\n\\n-\\n[Loading](https://docs.llamaindex.ai/en/stable/understanding/loading/loading/): getting your data from wherever it lives, whether that's unstructured text, PDFs, databases, or APIs to other applications. LlamaIndex has hundreds of connectors to every data source over at [LlamaHub](https://llamahub.ai/).\\n\\n-\\n[Indexing](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/): once you've got your data there are an infinite number of ways to structure access to that data to ensure your applications is always working with the most relevant data. LlamaIndex has a huge number of these strategies built-in and can help you select the best ones.\\n\\n-\\n[Storing](https://docs.llamaindex.ai/en/stable/understanding/storing/storing/): you will probably find it more efficient to store your data in indexed form, or pre-processed summaries provided by an LLM, often in a specialized database known as a Vector Store (see below). 
You can also store your indexes, metadata and more.\\n\\n-\\n[Querying](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/): every indexing strategy has a corresponding querying strategy and there are lots of ways to improve the relevance, speed and accuracy of what you retrieve and what the LLM does with it before returning it to you, including turning it into structured responses such as an API.\\n\\n-\\n[Putting it all together](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/): whether you are building question & answering, chatbots, an API, or an autonomous agent, we show you how to get your application into production.\\n\\n-\\n[Tracing and debugging](https://docs.llamaindex.ai/en/stable/understanding/tracing_and_debugging/tracing_and_debugging/): also called observability, it's especially important with LLM applications to be able to look into the inner workings of what's going on to help you debug problems and spot places to improve.\\n\\n-\\n[Evaluating](https://docs.llamaindex.ai/en/stable/understanding/evaluating/evaluating/): every strategy has pros and cons and a key part of building, shipping and evolving your application is evaluating whether your change has improved your application in terms of accuracy, performance, clarity, cost and more. Reliably evaluating your changes is a crucial part of LLM application development.\\n\\n##Let's get started![#](https://docs.llamaindex.ai/en/stable/understanding/#lets-get-started)\\nReady to dive in? 
Head to [using LLMs](https://docs.llamaindex.ai/en/stable/understanding/using_llms/using_llms/).\\n\\n🦙\\n\\nCTRL + K\"}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'A Guide to Building a Full-Stack Web App with LLamaIndex - LlamaIndex'}}, 'text': ' \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#a-guide-to-building-a-full-stack-web-app-with-llamaindex)\\n#A Guide to Building a Full-Stack Web App with LLamaIndex[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#a-guide-to-building-a-full-stack-web-app-with-llamaindex)\\nLlamaIndex is a python library, which means that integrating it with a full-stack web application will be a little different than what you might be used to.\\n\\nThis guide seeks to walk through the steps needed to create a basic API service written in python, and how this interacts with a TypeScript+React frontend.\\n\\nAll code examples here are available from the [llama_index_starter_pack](https://github.com/logan-markewich/llama_index_starter_pack/tree/main/flask_react) in the flask_react folder.\\n\\nThe main technologies used in this guide are as follows:\\n\\n- python3.11\\n- llama_index\\n- flask\\n- typescript\\n- react\\n##Flask Backend[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#flask-backend)\\nFor this guide, our backend will use a [Flask](https://flask.palletsprojects.com/en/2.2.x/) API server to communicate with our frontend code. 
If you prefer, you can also easily translate this to a [FastAPI](https://fastapi.tiangolo.com/) server, or any other python server library of your choice.\\n\\nSetting up a server using Flask is easy. You import the package, create the app object, and then create your endpoints. Let\\'s create a basic skeleton for the server first:\\n\\nfrom flask import Flask\\n\\napp = Flask(__name__)\\n\\n\\n@app.route(\"/\")\\ndef home():\\n return \"Hello World!\"\\n\\n\\nif __name__ == \"__main__\":\\n app.run(host=\"0.0.0.0\", port=5601)\\nflask_demo.py\\n\\nIf you run this file (python flask_demo.py), it will launch a server on port 5601. If you visit http://localhost:5601/, you will see the \"Hello World!\" text rendered in your browser. Nice!\\n\\nThe next step is deciding what functions we want to include in our server, and to start using LlamaIndex.\\n\\nTo keep things simple, the most basic operation we can provide is querying an existing index. Using the [paul graham essay](https://github.com/jerryjliu/llama_index/blob/main/examples/paul_graham_essay/data/paul_graham_essay.txt) from LlamaIndex, create a documents folder and download+place the essay text file inside of it.\\n\\n###Basic Flask - Handling User Index Queries[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#basic-flask-handling-user-index-queries)\\nNow, let\\'s write some code to initialize our index:\\n\\nimport os\\nfrom llama_index.core import (\\n SimpleDirectoryReader,\\n VectorStoreIndex,\\n StorageContext,\\n)\\n\\n# NOTE: for local testing only, do NOT deploy with your key hardcoded\\nos.environ[\"OPENAI_API_KEY\"] = \"your key here\"\\n\\nindex = None\\n\\n\\ndef initialize_index():\\n global index\\n storage_context = StorageContext.from_defaults()\\n if os.path.exists(index_dir):\\n index = load_index_from_storage(storage_context)\\n else:\\n documents = SimpleDirectoryReader(\"./documents\").load_data()\\n index = 
VectorStoreIndex.from_documents(\\n documents, storage_context=storage_context\\n )\\n storage_context.persist(index_dir)\\nThis function will initialize our index. If we call this just before starting the flask server in the main function, then our index will be ready for user queries!\\n\\nOur query endpoint will accept GET requests with the query text as a parameter. Here\\'s what the full endpoint function will look like:\\n\\nfrom flask import request\\n\\n\\n@app.route(\"/query\", methods=[\"GET\"])\\ndef query_index():\\n global index\\n query_text = request.args.get(\"text\", None)\\n if query_text is None:\\n return (\\n \"No text found, please include a ?text=blah parameter in the URL\",\\n 400,\\n )\\n query_engine = index.as_query_engine()\\n response = query_engine.query(query_text)\\n return str(response), 200\\nNow, we\\'ve introduced a few new concepts to our server:\\n\\n- a new /query endpoint, defined by the function decorator\\n- a new import from flask, request, which is used to get parameters from the request\\n- if the text parameter is missing, then we return an error message and an appropriate HTML response code\\n- otherwise, we query the index, and return the response as a string\\nA full query example that you can test in your browser might look something like this: http://localhost:5601/query?text=what did the author do growing up (once you press enter, the browser will convert the spaces into \"%20\" characters).\\n\\nThings are looking pretty good! We now have a functional API. Using your own documents, you can easily provide an interface for any application to call the flask API and get answers to queries.\\n\\n###Advanced Flask - Handling User Document Uploads[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#advanced-flask-handling-user-document-uploads)\\nThings are looking pretty cool, but how can we take this a step further? 
What if we want to allow users to build their own indexes by uploading their own documents? Have no fear, Flask can handle it all :muscle:.\\n\\nTo let users upload documents, we have to take some extra precautions. Instead of querying an existing index, the index will become mutable. If you have many users adding to the same index, we need to think about how to handle concurrency. Our Flask server is threaded, which means multiple users can ping the server with requests which will be handled at the same time.\\n\\nOne option might be to create an index for each user or group, and store and fetch things from S3. But for this example, we will assume there is one locally stored index that users are interacting with.\\n\\nTo handle concurrent uploads and ensure sequential inserts into the index, we can use the BaseManager python package to provide sequential access to the index using a separate server and locks. This sounds scary, but it\\'s not so bad! We will just move all our index operations (initializing, querying, inserting) into the BaseManager \"index_server\", which will be called from our Flask server.\\n\\nHere\\'s a basic example of what our index_server.py will look like after we\\'ve moved our code:\\n\\nimport os\\nfrom multiprocessing import Lock\\nfrom multiprocessing.managers import BaseManager\\nfrom llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document\\n\\n# NOTE: for local testing only, do NOT deploy with your key hardcoded\\nos.environ[\"OPENAI_API_KEY\"] = \"your key here\"\\n\\nindex = None\\nlock = Lock()\\n\\n\\ndef initialize_index():\\n global index\\n\\n with lock:\\n # same as before ...\\n pass\\n\\n\\ndef query_index(query_text):\\n global index\\n query_engine = index.as_query_engine()\\n response = query_engine.query(query_text)\\n return str(response)\\n\\n\\nif __name__ == \"__main__\":\\n # init the global index\\n print(\"initializing index...\")\\n initialize_index()\\n\\n # setup server\\n # NOTE: you might 
want to handle the password in a less hardcoded way\\n manager = BaseManager((\"\", 5602), b\"password\")\\n manager.register(\"query_index\", query_index)\\n server = manager.get_server()\\n\\n print(\"starting server...\")\\n server.serve_forever()\\nindex_server.py\\n\\nSo, we\\'ve moved our functions, introduced the Lock object which ensures sequential access to the global index, registered our single function in the server, and started the server on port 5602 with the password password.\\n\\nThen, we can adjust our flask code as follows:\\n\\nfrom multiprocessing.managers import BaseManager\\nfrom flask import Flask, request\\n\\n# initialize manager connection\\n# NOTE: you might want to handle the password in a less hardcoded way\\nmanager = BaseManager((\"\", 5602), b\"password\")\\nmanager.register(\"query_index\")\\nmanager.connect()\\n\\n\\n@app.route(\"/query\", methods=[\"GET\"])\\ndef query_index():\\n global index\\n query_text = request.args.get(\"text\", None)\\n if query_text is None:\\n return (\\n \"No text found, please include a ?text=blah parameter in the URL\",\\n 400,\\n )\\n response = manager.query_index(query_text)._getvalue()\\n return str(response), 200\\n\\n\\n@app.route(\"/\")\\ndef home():\\n return \"Hello World!\"\\n\\n\\nif __name__ == \"__main__\":\\n app.run(host=\"0.0.0.0\", port=5601)\\nflask_demo.py\\n\\nThe two main changes are connecting to our existing BaseManager server and registering the functions, as well as calling the function through the manager in the /query endpoint.\\n\\nOne special thing to note is that BaseManager servers don\\'t return objects quite as we expect. To resolve the return value into it\\'s original object, we call the _getvalue() function.\\n\\nIf we allow users to upload their own documents, we should probably remove the Paul Graham essay from the documents folder, so let\\'s do that first. Then, let\\'s add an endpoint to upload files! 
First, let\\'s define our Flask endpoint function:\\n\\n...\\nmanager.register(\"insert_into_index\")\\n...\\n\\n\\n@app.route(\"/uploadFile\", methods=[\"POST\"])\\ndef upload_file():\\n global manager\\n if \"file\" not in request.files:\\n return \"Please send a POST request with a file\", 400\\n\\n filepath = None\\n try:\\n uploaded_file = request.files[\"file\"]\\n filename = secure_filename(uploaded_file.filename)\\n filepath = os.path.join(\"documents\", os.path.basename(filename))\\n uploaded_file.save(filepath)\\n\\n if request.form.get(\"filename_as_doc_id\", None) is not None:\\n manager.insert_into_index(filepath, doc_id=filename)\\n else:\\n manager.insert_into_index(filepath)\\n except Exception as e:\\n # cleanup temp file\\n if filepath is not None and os.path.exists(filepath):\\n os.remove(filepath)\\n return \"Error: {}\".format(str(e)), 500\\n\\n # cleanup temp file\\n if filepath is not None and os.path.exists(filepath):\\n os.remove(filepath)\\n\\n return \"File inserted!\", 200\\nNot too bad! You will notice that we write the file to disk. We could skip this if we only accept basic file formats like txt files, but written to disk we can take advantage of LlamaIndex\\'s SimpleDirectoryReader to take care of a bunch of more complex file formats. Optionally, we also use a second POST argument to either use the filename as a doc_id or let LlamaIndex generate one for us. This will make more sense once we implement the frontend.\\n\\nWith these more complicated requests, I also suggest using a tool like [Postman](https://www.postman.com/downloads/?utm_source=postman-home). Examples of using postman to test our endpoints are in the [repository for this project](https://github.com/logan-markewich/llama_index_starter_pack/tree/main/flask_react/postman_examples).\\n\\nLastly, you\\'ll notice we added a new function to the manager. 
Let\\'s implement that inside index_server.py:\\n\\ndef insert_into_index(doc_text, doc_id=None):\\n global index\\n document = SimpleDirectoryReader(input_files=[doc_text]).load_data()[0]\\n if doc_id is not None:\\n document.doc_id = doc_id\\n\\n with lock:\\n index.insert(document)\\n index.storage_context.persist()\\n\\n\\n...\\nmanager.register(\"insert_into_index\", insert_into_index)\\n...\\nEasy! If we launch both the index_server.py and then the flask_demo.py python files, we have a Flask API server that can handle multiple requests to insert documents into a vector index and respond to user queries!\\n\\nTo support some functionality in the frontend, I\\'ve adjusted what some responses look like from the Flask API, as well as added some functionality to keep track of which documents are stored in the index (LlamaIndex doesn\\'t currently support this in a user-friendly way, but we can augment it ourselves!). Lastly, I had to add CORS support to the server using the Flask-cors python package.\\n\\nCheck out the complete flask_demo.py and index_server.py scripts in the [repository](https://github.com/logan-markewich/llama_index_starter_pack/tree/main/flask_react) for the final minor changes, therequirements.txt file, and a sample Dockerfile to help with deployment.\\n\\n##React Frontend[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#react-frontend)\\nGenerally, React and Typescript are one of the most popular libraries and languages for writing webapps today. This guide will assume you are familiar with how these tools work, because otherwise this guide will triple in length :smile:.\\n\\nIn the [repository](https://github.com/logan-markewich/llama_index_starter_pack/tree/main/flask_react), the frontend code is organized inside of the react_frontend folder.\\n\\nThe most relevant part of the frontend will be the src/apis folder. 
This is where we make calls to the Flask server, supporting the following queries:\\n\\n- /query -- make a query to the existing index\\n- /uploadFile -- upload a file to the flask server for insertion into the index\\n- /getDocuments -- list the current document titles and a portion of their texts\\nUsing these three queries, we can build a robust frontend that allows users to upload and keep track of their files, query the index, and view the query response and information about which text nodes were used to form the response.\\n\\n###fetchDocuments.tsx[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#fetchdocumentstsx)\\nThis file contains the function to, you guessed it, fetch the list of current documents in the index. The code is as follows:\\n\\nexport type Document = {\\n id: string;\\n text: string;\\n};\\n\\nconst fetchDocuments = async (): Promise<Document[]> => {\\n const response = await fetch(\"http://localhost:5601/getDocuments\", {\\n mode: \"cors\",\\n });\\n\\n if (!response.ok) {\\n return [];\\n }\\n\\n const documentList = (await response.json()) as Document[];\\n return documentList;\\n};\\nAs you can see, we make a query to the Flask server (here, it assumes running on localhost). Notice that we need to include the mode: \\'cors\\' option, as we are making an external request.\\n\\nThen, we check if the response was ok, and if so, get the response json and return it. 
Here, the response json is a list of Document objects that are defined in the same file.\\n\\n###queryIndex.tsx[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#queryindextsx)\\nThis file sends the user query to the flask server, and gets the response back, as well as details about which nodes in our index provided the response.\\n\\nexport type ResponseSources = {\\n text: string;\\n doc_id: string;\\n start: number;\\n end: number;\\n similarity: number;\\n};\\n\\nexport type QueryResponse = {\\n text: string;\\n sources: ResponseSources[];\\n};\\n\\nconst queryIndex = async (query: string): Promise<QueryResponse> => {\\n const queryURL = new URL(\"http://localhost:5601/query?text=1\");\\n queryURL.searchParams.append(\"text\", query);\\n\\n const response = await fetch(queryURL, { mode: \"cors\" });\\n if (!response.ok) {\\n return { text: \"Error in query\", sources: [] };\\n }\\n\\n const queryResponse = (await response.json()) as QueryResponse;\\n\\n return queryResponse;\\n};\\n\\nexport default queryIndex;\\nThis is similar to the fetchDocuments.tsx file, with the main difference being we include the query text as a parameter in the URL. Then, we check if the response is ok and return it with the appropriate typescript type.\\n\\n###insertDocument.tsx[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#insertdocumenttsx)\\nProbably the most complex API call is uploading a document. 
The function here accepts a file object and constructs a POST request using FormData.\\n\\nThe actual response text is not used in the app but could be utilized to provide some user feedback on if the file failed to upload or not.\\n\\nconst insertDocument = async (file: File) => {\\n const formData = new FormData();\\n formData.append(\"file\", file);\\n formData.append(\"filename_as_doc_id\", \"true\");\\n\\n const response = await fetch(\"http://localhost:5601/uploadFile\", {\\n mode: \"cors\",\\n method: \"POST\",\\n body: formData,\\n });\\n\\n const responseText = response.text();\\n return responseText;\\n};\\n\\nexport default insertDocument;\\n###All the Other Frontend Good-ness[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#all-the-other-frontend-good-ness)\\nAnd that pretty much wraps up the frontend portion! The rest of the react frontend code is some pretty basic react components, and my best attempt to make it look at least a little nice :smile:.\\n\\nI encourage to read the rest of the [codebase](https://github.com/logan-markewich/llama_index_starter_pack/tree/main/flask_react/react_frontend) and submit any PRs for improvements!\\n\\n##Conclusion[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/apps/fullstack_app_guide/#conclusion)\\nThis guide has covered a ton of information. We went from a basic \"Hello World\" Flask server written in python, to a fully functioning LlamaIndex powered backend and how to connect that to a frontend application.\\n\\nAs you can see, we can easily augment and wrap the services provided by LlamaIndex (like the little external document tracker) to help provide a good user experience on the frontend.\\n\\nYou could take this and add many features (multi-index/user support, saving objects into S3, adding a Pinecone vector server, etc.). And when you build an app after reading this, be sure to share the final result in the Discord! 
Good Luck! :muscle:\\n\\n🦙\\n\\nCTRL + K'}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/querying/querying/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/querying/querying/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'Querying - LlamaIndex'}}, 'text': ' \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#querying)\\n#Querying[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#querying)\\nNow you\\'ve loaded your data, built an index, and stored that index for later, you\\'re ready to get to the most significant part of an LLM application: querying.\\n\\nAt its simplest, querying is just a prompt call to an LLM: it can be a question and get an answer, or a request for summarization, or a much more complex instruction.\\n\\nMore complex querying could involve repeated/chained prompt + LLM calls, or even a reasoning loop across multiple components.\\n\\n##Getting started[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#getting-started)\\nThe basis of all querying is the QueryEngine. The simplest way to get a QueryEngine is to get your index to create one for you, like this:\\n\\nquery_engine = index.as_query_engine()\\nresponse = query_engine.query(\\n \"Write an email to the user given their background information.\"\\n)\\nprint(response)\\n##Stages of querying[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#stages-of-querying)\\nHowever, there is more to querying than initially meets the eye. Querying consists of three distinct stages:\\n\\n- Retrieval is when you find and return the most relevant documents for your query from your Index. 
As previously discussed in [indexing](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/), the most common type of retrieval is \"top-k\" semantic retrieval, but there are many other retrieval strategies.\\n- Postprocessing is when the Nodes retrieved are optionally reranked, transformed, or filtered, for instance by requiring that they have specific metadata such as keywords attached.\\n- Response synthesis is when your query, your most-relevant data and your prompt are combined and sent to your LLM to return a response.\\nTip\\n\\nYou can find out about [how to attach metadata to documents](https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_documents/) and [nodes](https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_nodes/).\\n\\n##Customizing the stages of querying[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#customizing-the-stages-of-querying)\\nLlamaIndex features a low-level composition API that gives you granular control over your querying.\\n\\nIn this example, we customize our retriever to use a different number for top_k and add a post-processing step that requires that the retrieved nodes reach a minimum similarity score to be included. 
This would give you a lot of data when you have relevant results but potentially no data if you have nothing relevant.\\n\\nfrom llama_index.core import VectorStoreIndex, get_response_synthesizer\\nfrom llama_index.core.retrievers import VectorIndexRetriever\\nfrom llama_index.core.query_engine import RetrieverQueryEngine\\nfrom llama_index.core.postprocessor import SimilarityPostprocessor\\n\\n# build index\\nindex = VectorStoreIndex.from_documents(documents)\\n\\n# configure retriever\\nretriever = VectorIndexRetriever(\\n index=index,\\n similarity_top_k=10,\\n)\\n\\n# configure response synthesizer\\nresponse_synthesizer = get_response_synthesizer()\\n\\n# assemble query engine\\nquery_engine = RetrieverQueryEngine(\\n retriever=retriever,\\n response_synthesizer=response_synthesizer,\\n node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],\\n)\\n\\n# query\\nresponse = query_engine.query(\"What did the author do growing up?\")\\nprint(response)\\nYou can also add your own retrieval, response synthesis, and overall query logic, by implementing the corresponding interfaces.\\n\\nFor a full list of implemented components and the supported configurations, check out our [reference docs](https://docs.llamaindex.ai/en/stable/api_reference/).\\n\\nLet\\'s go into more detail about customizing each step:\\n\\n###Configuring retriever[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#configuring-retriever)\\nretriever = VectorIndexRetriever(\\n index=index,\\n similarity_top_k=10,\\n)\\nThere are a huge variety of retrievers that you can learn about in our [module guide on retrievers](https://docs.llamaindex.ai/en/stable/module_guides/querying/retriever/).\\n\\n###Configuring node postprocessors[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#configuring-node-postprocessors)\\nWe support advanced Node filtering and augmentation that can further improve the relevancy of the retrieved Node objects. 
This can help reduce the time/number of LLM calls/cost or improve response quality.\\n\\nFor example:\\n\\n- KeywordNodePostprocessor: filters nodes by required_keywords and exclude_keywords.\\n- SimilarityPostprocessor: filters nodes by setting a threshold on the similarity score (thus only supported by embedding-based retrievers)\\n- PrevNextNodePostprocessor: augments retrieved Node objects with additional relevant context based on Node relationships.\\nThe full list of node postprocessors is documented in the [Node Postprocessor Reference](https://docs.llamaindex.ai/en/stable/api_reference/postprocessor/).\\n\\nTo configure the desired node postprocessors:\\n\\nnode_postprocessors = [\\n KeywordNodePostprocessor(\\n required_keywords=[\"Combinator\"], exclude_keywords=[\"Italy\"]\\n )\\n]\\nquery_engine = RetrieverQueryEngine.from_args(\\n retriever, node_postprocessors=node_postprocessors\\n)\\nresponse = query_engine.query(\"What did the author do growing up?\")\\n###Configuring response synthesis[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#configuring-response-synthesis)\\nAfter a retriever fetches relevant nodes, a BaseSynthesizer synthesizes the final response by combining the information.\\n\\nYou can configure it via\\n\\nquery_engine = RetrieverQueryEngine.from_args(\\n retriever, response_mode=response_mode\\n)\\nRight now, we support the following options:\\n\\n- default: \"create and refine\" an answer by sequentially going through each retrieved Node; This makes a separate LLM call per Node. Good for more detailed answers.\\n- compact: \"compact\" the prompt during each LLM call by stuffing as many Node text chunks that can fit within the maximum prompt size. If there are too many chunks to stuff in one prompt, \"create and refine\" an answer by going through multiple prompts.\\n- tree_summarize: Given a set of Node objects and the query, recursively construct a tree and return the root node as the response. 
Good for summarization purposes.\\n- no_text: Only runs the retriever to fetch the nodes that would have been sent to the LLM, without actually sending them. Then can be inspected by checking response.source_nodes. The response object is covered in more detail in Section 5.\\n- accumulate: Given a set of Node objects and the query, apply the query to each Node text chunk while accumulating the responses into an array. Returns a concatenated string of all responses. Good for when you need to run the same query separately against each text chunk.\\n##Structured Outputs[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#structured-outputs)\\nYou may want to ensure your output is structured. See our [Query Engines + Pydantic Outputs](https://docs.llamaindex.ai/en/stable/module_guides/querying/structured_outputs/query_engine/) to see how to extract a Pydantic object from a query engine class.\\n\\nAlso make sure to check out our entire [Structured Outputs](https://docs.llamaindex.ai/en/stable/module_guides/querying/structured_outputs/) guide.\\n\\n##Creating your own Query Pipeline[#](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/#creating-your-own-query-pipeline)\\nIf you want to design complex query flows, you can compose your own query pipeline across many different modules, from prompts/LLMs/output parsers to retrievers to response synthesizers to your own custom components.\\n\\nTake a look at our [Query Pipelines Module Guide](https://docs.llamaindex.ai/en/stable/module_guides/querying/pipeline/) for more details.\\n\\n🦙\\n\\nCTRL + K'}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/using_llms/privacy/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/using_llms/privacy/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'Privacy and Security - LlamaIndex'}}, 'text': \" \\n[ Skip to content 
](https://docs.llamaindex.ai/en/stable/understanding/using_llms/privacy/#privacy-and-security)\\n#Privacy and Security[#](https://docs.llamaindex.ai/en/stable/understanding/using_llms/privacy/#privacy-and-security)\\nBy default, LLamaIndex sends your data to OpenAI for generating embeddings and natural language responses. However, it is important to note that this can be configured according to your preferences. LLamaIndex provides the flexibility to use your own embedding model or run a large language model locally if desired.\\n\\n##Data Privacy[#](https://docs.llamaindex.ai/en/stable/understanding/using_llms/privacy/#data-privacy)\\nRegarding data privacy, when using LLamaIndex with OpenAI, the privacy details and handling of your data are subject to OpenAI's policies. And each custom service other than OpenAI has its policies as well.\\n\\n##Vector stores[#](https://docs.llamaindex.ai/en/stable/understanding/using_llms/privacy/#vector-stores)\\nLLamaIndex offers modules to connect with other vector stores within indexes to store embeddings. It is worth noting that each vector store has its own privacy policies and practices, and LLamaIndex does not assume responsibility for how it handles or uses your data. 
Also by default, LLamaIndex has a default option to store your embeddings locally.\\n\\n🦙\\n\\nCTRL + K\"}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'How to Build a Chatbot - LlamaIndex'}}, 'text': ' \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#how-to-build-a-chatbot)\\n#How to Build a Chatbot[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#how-to-build-a-chatbot)\\nLlamaIndex serves as a bridge between your data and Large Language Models (LLMs), providing a toolkit that enables you to establish a query interface around your data for a variety of tasks, such as question-answering and summarization.\\n\\nIn this tutorial, we\\'ll walk you through building a context-augmented chatbot using a [Data Agent](https://gpt-index.readthedocs.io/en/stable/core_modules/agent_modules/agents/root.html). This agent, powered by LLMs, is capable of intelligently executing tasks over your data. The end result is a chatbot agent equipped with a robust set of data interface tools provided by LlamaIndex to answer queries about your data.\\n\\nNote: This tutorial builds upon initial work on creating a query interface over SEC 10-K filings - [check it out here](https://medium.com/@jerryjliu98/how-unstructured-and-llamaindex-can-help-bring-the-power-of-llms-to-your-own-data-3657d063e30d).\\n\\n###Context[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#context)\\nIn this guide, we’ll build a \"10-K Chatbot\" that uses raw UBER 10-K HTML filings from Dropbox. 
Users can interact with the chatbot to ask questions related to the 10-K filings.\\n\\n###Preparation[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#preparation)\\nimport os\\nimport openai\\n\\nos.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\\nopenai.api_key = os.environ[\"OPENAI_API_KEY\"]\\n\\nimport nest_asyncio\\n\\nnest_asyncio.apply()\\n###Ingest Data[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#ingest-data)\\nLet\\'s first download the raw 10-k files, from 2019-2022.\\n\\n# NOTE: the code examples assume you\\'re operating within a Jupyter notebook.\\n# download files\\n!mkdir data\\n!wget \"https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1\" -O data/UBER.zip\\n!unzip data/UBER.zip -d data\\nTo parse the HTML files into formatted text, we use the [Unstructured](https://github.com/Unstructured-IO/unstructured) library. Thanks to [LlamaHub](https://llamahub.ai/), we can directly integrate with Unstructured, allowing conversion of any text into a Document format that LlamaIndex can ingest.\\n\\nFirst we install the necessary packages:\\n\\n!pip install llama-hub unstructured\\nThen we can use the UnstructuredReader to parse the HTML files into a list of Document objects.\\n\\nfrom llama_index.readers.file import UnstructuredReader\\nfrom pathlib import Path\\n\\nyears = [2022, 2021, 2020, 2019]\\n\\nloader = UnstructuredReader()\\ndoc_set = {}\\nall_docs = []\\nfor year in years:\\n year_docs = loader.load_data(\\n file=Path(f\"./data/UBER/UBER_{year}.html\"), split_documents=False\\n )\\n # insert year metadata into each year\\n for d in year_docs:\\n d.metadata = {\"year\": year}\\n doc_set[year] = year_docs\\n all_docs.extend(year_docs)\\n###Setting up Vector Indices for each year[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#setting-up-vector-indices-for-each-year)\\nWe 
first setup a vector index for each year. Each vector index allows us to ask questions about the 10-K filing of a given year.\\n\\nWe build each index and save it to disk.\\n\\n# initialize simple vector indices\\nfrom llama_index.core import VectorStoreIndex, StorageContext\\nfrom llama_index.core import Settings\\n\\nSettings.chunk_size = 512\\nindex_set = {}\\nfor year in years:\\n storage_context = StorageContext.from_defaults()\\n cur_index = VectorStoreIndex.from_documents(\\n doc_set[year],\\n storage_context=storage_context,\\n )\\n index_set[year] = cur_index\\n storage_context.persist(persist_dir=f\"./storage/{year}\")\\nTo load an index from disk, do the following\\n\\n# Load indices from disk\\nfrom llama_index.core import load_index_from_storage\\n\\nindex_set = {}\\nfor year in years:\\n storage_context = StorageContext.from_defaults(\\n persist_dir=f\"./storage/{year}\"\\n )\\n cur_index = load_index_from_storage(\\n storage_context,\\n )\\n index_set[year] = cur_index\\n###Setting up a Sub Question Query Engine to Synthesize Answers Across 10-K Filings[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#setting-up-a-sub-question-query-engine-to-synthesize-answers-across-10-k-filings)\\nSince we have access to documents of 4 years, we may not only want to ask questions regarding the 10-K document of a given year, but ask questions that require analysis over all 10-K filings.\\n\\nTo address this, we can use a [Sub Question Query Engine](https://gpt-index.readthedocs.io/en/stable/examples/query_engine/sub_question_query_engine.html). It decomposes a query into subqueries, each answered by an individual vector index, and synthesizes the results to answer the overall query.\\n\\nLlamaIndex provides some wrappers around indices (and query engines) so that they can be used by query engines and agents. First we define a QueryEngineTool for each vector index. 
Each tool has a name and a description; these are what the LLM agent sees to decide which tool to choose.\\n\\nfrom llama_index.core.tools import QueryEngineTool, ToolMetadata\\n\\nindividual_query_engine_tools = [\\n QueryEngineTool(\\n query_engine=index_set[year].as_query_engine(),\\n metadata=ToolMetadata(\\n name=f\"vector_index_{year}\",\\n description=f\"useful for when you want to answer queries about the {year} SEC 10-K for Uber\",\\n ),\\n )\\n for year in years\\n]\\nNow we can create the Sub Question Query Engine, which will allow us to synthesize answers across the 10-K filings. We pass in the individual_query_engine_tools we defined above, as well as an llm that will be used to run the subqueries.\\n\\nfrom llama_index.llms.openai import OpenAI\\nfrom llama_index.core.query_engine import SubQuestionQueryEngine\\n\\nquery_engine = SubQuestionQueryEngine.from_defaults(\\n query_engine_tools=individual_query_engine_tools,\\n llm=OpenAI(model=\"gpt-3.5-turbo\"),\\n)\\n###Setting up the Chatbot Agent[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#setting-up-the-chatbot-agent)\\nWe use a LlamaIndex Data Agent to setup the outer chatbot agent, which has access to a set of Tools. Specifically, we will use an OpenAIAgent, that takes advantage of OpenAI API function calling. 
We want to use the separate Tools we defined previously for each index (corresponding to a given year), as well as a tool for the sub question query engine we defined above.\\n\\nFirst we define a QueryEngineTool for the sub question query engine:\\n\\nquery_engine_tool = QueryEngineTool(\\n query_engine=query_engine,\\n metadata=ToolMetadata(\\n name=\"sub_question_query_engine\",\\n description=\"useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber\",\\n ),\\n)\\nThen, we combine the Tools we defined above into a single list of tools for the agent:\\n\\ntools = individual_query_engine_tools + [query_engine_tool]\\nFinally, we call OpenAIAgent.from_tools to create the agent, passing in the list of tools we defined above.\\n\\nfrom llama_index.agent.openai import OpenAIAgent\\n\\nagent = OpenAIAgent.from_tools(tools, verbose=True)\\n###Testing the Agent[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#testing-the-agent)\\nWe can now test the agent with various queries.\\n\\nIf we test it with a simple \"hello\" query, the agent does not use any Tools.\\n\\nresponse = agent.chat(\"hi, i am bob\")\\nprint(str(response))\\nHello Bob! How can I assist you today?\\nIf we test it with a query regarding the 10-k of a given year, the agent will use the relevant vector index Tool.\\n\\nresponse = agent.chat(\\n \"What were some of the biggest risk factors in 2020 for Uber?\"\\n)\\nprint(str(response))\\n=== Calling Function ===\\nCalling function: vector_index_2020 with args: {\\n \"input\": \"biggest risk factors\"\\n}\\nGot output: The biggest risk factors mentioned in the context are:\\n1. The adverse impact of the COVID-19 pandemic and actions taken to mitigate it on the business.\\n2. The potential reclassification of drivers as employees, workers, or quasi-employees instead of independent contractors.\\n3. 
Intense competition in the mobility, delivery, and logistics industries, with low-cost alternatives and well-capitalized competitors.\\n4. The need to lower fares or service fees and offer driver incentives and consumer discounts to remain competitive.\\n5. Significant losses incurred and the uncertainty of achieving profitability.\\n6. The risk of not attracting or maintaining a critical mass of platform users.\\n7. Operational, compliance, and cultural challenges related to the workplace culture and forward-leaning approach.\\n8. The potential negative impact of international investments and the challenges of conducting business in foreign countries.\\n9. Risks associated with operational and compliance challenges, localization, laws and regulations, competition, social acceptance, technological compatibility, improper business practices, liability uncertainty, managing international operations, currency fluctuations, cash transactions, tax consequences, and payment fraud.\\n========================\\nSome of the biggest risk factors for Uber in 2020 were:\\n\\n1. The adverse impact of the COVID-19 pandemic and actions taken to mitigate it on the business.\\n2. The potential reclassification of drivers as employees, workers, or quasi-employees instead of independent contractors.\\n3. Intense competition in the mobility, delivery, and logistics industries, with low-cost alternatives and well-capitalized competitors.\\n4. The need to lower fares or service fees and offer driver incentives and consumer discounts to remain competitive.\\n5. Significant losses incurred and the uncertainty of achieving profitability.\\n6. The risk of not attracting or maintaining a critical mass of platform users.\\n7. Operational, compliance, and cultural challenges related to the workplace culture and forward-leaning approach.\\n8. The potential negative impact of international investments and the challenges of conducting business in foreign countries.\\n9. 
Risks associated with operational and compliance challenges, localization, laws and regulations, competition, social acceptance, technological compatibility, improper business practices, liability uncertainty, managing international operations, currency fluctuations, cash transactions, tax consequences, and payment fraud.\\n\\nThese risk factors highlight the challenges and uncertainties that Uber faced in 2020.\\nFinally, if we test it with a query to compare/contrast risk factors across years, the agent will use the Sub Question Query Engine Tool.\\n\\ncross_query_str = \"Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points.\"\\n\\nresponse = agent.chat(cross_query_str)\\nprint(str(response))\\n=== Calling Function ===\\nCalling function: sub_question_query_engine with args: {\\n \"input\": \"Compare/contrast the risk factors described in the Uber 10-K across years\"\\n}\\nGenerated 4 sub questions.\\n[vector_index_2022] Q: What are the risk factors described in the 2022 SEC 10-K for Uber?\\n[vector_index_2021] Q: What are the risk factors described in the 2021 SEC 10-K for Uber?\\n[vector_index_2020] Q: What are the risk factors described in the 2020 SEC 10-K for Uber?\\n[vector_index_2019] Q: What are the risk factors described in the 2019 SEC 10-K for Uber?\\n[vector_index_2021] A: The risk factors described in the 2021 SEC 10-K for Uber include the adverse impact of the COVID-19 pandemic on their business, the potential reclassification of drivers as employees instead of independent contractors, intense competition in the mobility, delivery, and logistics industries, the need to lower fares and offer incentives to remain competitive, significant losses incurred by the company, the importance of attracting and maintaining a critical mass of platform users, and the ongoing legal challenges regarding driver classification.\\n[vector_index_2020] A: The risk factors described in the 2020 SEC 10-K for Uber include 
the adverse impact of the COVID-19 pandemic on their business, the potential reclassification of drivers as employees instead of independent contractors, intense competition in the mobility, delivery, and logistics industries, the need to lower fares and offer incentives to remain competitive, significant losses and the uncertainty of achieving profitability, the importance of attracting and retaining a critical mass of drivers and users, and the challenges associated with their workplace culture and operational compliance.\\n[vector_index_2022] A: The risk factors described in the 2022 SEC 10-K for Uber include the potential adverse effect on their business if drivers were classified as employees instead of independent contractors, the highly competitive nature of the mobility, delivery, and logistics industries, the need to lower fares or service fees to remain competitive in certain markets, the company\\'s history of significant losses and the expectation of increased operating expenses in the future, and the potential impact on their platform if they are unable to attract or maintain a critical mass of drivers, consumers, merchants, shippers, and carriers.\\n[vector_index_2019] A: The risk factors described in the 2019 SEC 10-K for Uber include the loss of their license to operate in London, the complexity of their business and operating model due to regulatory uncertainties, the potential for additional regulations for their other products in the Other Bets segment, the evolving laws and regulations regarding the development and deployment of autonomous vehicles, and the increasing number of data protection and privacy laws around the world. 
Additionally, there are legal proceedings, litigation, claims, and government investigations that Uber is involved in, which could impose a burden on management and employees and come with defense costs or unfavorable rulings.\\nGot output: The risk factors described in the Uber 10-K reports across the years include the potential reclassification of drivers as employees instead of independent contractors, intense competition in the mobility, delivery, and logistics industries, the need to lower fares and offer incentives to remain competitive, significant losses incurred by the company, the importance of attracting and maintaining a critical mass of platform users, and the ongoing legal challenges regarding driver classification. Additionally, there are specific risk factors mentioned in each year\\'s report, such as the adverse impact of the COVID-19 pandemic in 2020 and 2021, the loss of their license to operate in London in 2019, and the evolving laws and regulations regarding autonomous vehicles in 2019. 
Overall, while there are some similarities in the risk factors mentioned, there are also specific factors that vary across the years.\\n========================\\n=== Calling Function ===\\nCalling function: vector_index_2022 with args: {\\n \"input\": \"risk factors\"\\n}\\nGot output: Some of the risk factors mentioned in the context include the potential adverse effect on the business if drivers were classified as employees instead of independent contractors, the highly competitive nature of the mobility, delivery, and logistics industries, the need to lower fares or service fees to remain competitive, the company\\'s history of significant losses and the expectation of increased operating expenses, the impact of future pandemics or disease outbreaks on the business and financial results, and the potential harm to the business due to economic conditions and their effect on discretionary consumer spending.\\n========================\\n=== Calling Function ===\\nCalling function: vector_index_2021 with args: {\\n \"input\": \"risk factors\"\\n}\\nGot output: The COVID-19 pandemic and the impact of actions to mitigate the pandemic have adversely affected and may continue to adversely affect parts of our business. Our business would be adversely affected if Drivers were classified as employees, workers or quasi-employees instead of independent contractors. The mobility, delivery, and logistics industries are highly competitive, with well-established and low-cost alternatives that have been available for decades, low barriers to entry, low switching costs, and well-capitalized competitors in nearly every major geographic region. To remain competitive in certain markets, we have in the past lowered, and may continue to lower, fares or service fees, and we have in the past offered, and may continue to offer, significant Driver incentives and consumer discounts and promotions. 
We have incurred significant losses since inception, including in the United States and other major markets. We expect our operating expenses to increase significantly in the foreseeable future, and we may not achieve or maintain profitability. If we are unable to attract or maintain a critical mass of Drivers, consumers, merchants, shippers, and carriers, whether as a result of competition or other factors, our platform will become less appealing to platform users.\\n========================\\n=== Calling Function ===\\nCalling function: vector_index_2020 with args: {\\n \"input\": \"risk factors\"\\n}\\nGot output: The risk factors mentioned in the context include the adverse impact of the COVID-19 pandemic on the business, the potential reclassification of drivers as employees, the highly competitive nature of the mobility, delivery, and logistics industries, the need to lower fares or service fees to remain competitive, the company\\'s history of significant losses and potential future expenses, the importance of attracting and maintaining a critical mass of platform users, and the operational and cultural challenges faced by the company.\\n========================\\n=== Calling Function ===\\nCalling function: vector_index_2019 with args: {\\n \"input\": \"risk factors\"\\n}\\nGot output: The risk factors mentioned in the context include competition with local companies, differing levels of social acceptance, technological compatibility issues, exposure to improper business practices, legal uncertainty, difficulties in managing international operations, fluctuations in currency exchange rates, regulations governing local currencies, tax consequences, financial accounting burdens, difficulties in implementing financial systems, import and export restrictions, political and economic instability, public health concerns, reduced protection for intellectual property rights, limited influence over minority-owned affiliates, and regulatory complexities. 
These risk factors could adversely affect the international operations, business, financial condition, and operating results of the company.\\n========================\\nHere is a comparison of the risk factors described in the Uber 10-K reports across years:\\n\\n2022 Risk Factors:\\n- Potential adverse effect if drivers were classified as employees instead of independent contractors.\\n- Highly competitive nature of the mobility, delivery, and logistics industries.\\n- Need to lower fares or service fees to remain competitive.\\n- History of significant losses and expectation of increased operating expenses.\\n- Impact of future pandemics or disease outbreaks on the business and financial results.\\n- Potential harm to the business due to economic conditions and their effect on discretionary consumer spending.\\n\\n2021 Risk Factors:\\n- Adverse impact of the COVID-19 pandemic and actions to mitigate it on the business.\\n- Potential reclassification of drivers as employees instead of independent contractors.\\n- Highly competitive nature of the mobility, delivery, and logistics industries.\\n- Need to lower fares or service fees and offer incentives to remain competitive.\\n- History of significant losses and uncertainty of achieving profitability.\\n- Importance of attracting and maintaining a critical mass of platform users.\\n\\n2020 Risk Factors:\\n- Adverse impact of the COVID-19 pandemic on the business.\\n- Potential reclassification of drivers as employees.\\n- Highly competitive nature of the mobility, delivery, and logistics industries.\\n- Need to lower fares or service fees to remain competitive.\\n- History of significant losses and potential future expenses.\\n- Importance of attracting and maintaining a critical mass of platform users.\\n- Operational and cultural challenges faced by the company.\\n\\n2019 Risk Factors:\\n- Competition with local companies.\\n- Differing levels of social acceptance.\\n- Technological compatibility issues.\\n- 
Exposure to improper business practices.\\n- Legal uncertainty.\\n- Difficulties in managing international operations.\\n- Fluctuations in currency exchange rates.\\n- Regulations governing local currencies.\\n- Tax consequences.\\n- Financial accounting burdens.\\n- Difficulties in implementing financial systems.\\n- Import and export restrictions.\\n- Political and economic instability.\\n- Public health concerns.\\n- Reduced protection for intellectual property rights.\\n- Limited influence over minority-owned affiliates.\\n- Regulatory complexities.\\n\\nThese comparisons highlight both common and unique risk factors that Uber faced in different years.\\n###Setting up the Chatbot Loop[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#setting-up-the-chatbot-loop)\\nNow that we have the chatbot setup, it only takes a few more steps to setup a basic interactive loop to chat with our SEC-augmented chatbot!\\n\\nagent = OpenAIAgent.from_tools(tools) # verbose=False by default\\n\\nwhile True:\\n text_input = input(\"User: \")\\n if text_input == \"exit\":\\n break\\n response = agent.chat(text_input)\\n print(f\"Agent: {response}\")\\nHere\\'s an example of the loop in action:\\n\\nUser: What were some of the legal proceedings against Uber in 2022?\\nAgent: In 2022, Uber faced several legal proceedings. Some of the notable ones include:\\n\\n1. Petition against Proposition 22: A petition was filed in California alleging that Proposition 22, which classifies app-based drivers as independent contractors, is unconstitutional.\\n\\n2. Lawsuit by Massachusetts Attorney General: The Massachusetts Attorney General filed a lawsuit against Uber, claiming that drivers should be classified as employees and entitled to protections under wage and labor laws.\\n\\n3. 
Allegations by New York Attorney General: The New York Attorney General made allegations against Uber regarding the misclassification of drivers and related employment violations.\\n\\n4. Swiss social security rulings: Swiss social security rulings classified Uber drivers as employees, which could have implications for Uber\\'s operations in Switzerland.\\n\\n5. Class action lawsuits in Australia: Uber faced class action lawsuits in Australia, with allegations that the company conspired to harm participants in the taxi, hire-car, and limousine industries.\\n\\nIt\\'s important to note that the outcomes of these legal proceedings are uncertain and may vary.\\n\\nUser:\\n###Notebook[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot/#notebook)\\nTake a look at our [corresponding notebook](https://docs.llamaindex.ai/en/stable/examples/agent/Chatbot_SEC/).\\n\\nπ¦\\n\\nCTRL + K'}, {'meta': {'url': 'https://docs.llamaindex.ai/en/stable/understanding/loading/llamahub/', 'fetchedUrl': 'https://docs.llamaindex.ai/en/stable/understanding/loading/llamahub/', 'fetchedUrlStatusCode': 200, 'meta': {'title': 'LlamaHub - LlamaIndex'}}, 'text': ' \\n[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/loading/llamahub/#llamahub)\\n#LlamaHub[#](https://docs.llamaindex.ai/en/stable/understanding/loading/llamahub/#llamahub)\\nOur data connectors are offered through [LlamaHub](https://llamahub.ai/) π¦. 
LlamaHub contains a registry of open-source data connectors that you can easily plug into any LlamaIndex application (+ Agent Tools, and Llama Packs).\\n\\n\\n\\n##Usage Pattern[#](https://docs.llamaindex.ai/en/stable/understanding/loading/llamahub/#usage-pattern)\\nGet started with:\\n\\nfrom llama_index.core import download_loader\\n\\nfrom llama_index.readers.google import GoogleDocsReader\\n\\nloader = GoogleDocsReader()\\ndocuments = loader.load_data(document_ids=[...])\\n##Built-in connector: SimpleDirectoryReader[#](https://docs.llamaindex.ai/en/stable/understanding/loading/llamahub/#built-in-connector-simpledirectoryreader)\\nSimpleDirectoryReader. Can support parsing a wide range of file types including .md, .pdf, .jpg, .png, .docx, as well as audio and video types. It is available directly as part of LlamaIndex:\\n\\nfrom llama_index.core import SimpleDirectoryReader\\n\\ndocuments = SimpleDirectoryReader(\"./data\").load_data()\\n##Available connectors[#](https://docs.llamaindex.ai/en/stable/understanding/loading/llamahub/#available-connectors)\\nBrowse [LlamaHub](https://llamahub.ai/) directly to see the hundreds of connectors available, including:\\n\\n- [Notion](https://developers.notion.com/) (NotionPageReader)\\n- [Google Docs](https://developers.google.com/docs/api) (GoogleDocsReader)\\n- [Slack](https://api.slack.com/) (SlackReader)\\n- [Discord](https://discord.com/developers/docs/intro) (DiscordReader)\\n- [Apify Actors](https://llamahub.ai/l/apify-actor) (ApifyActor). Can crawl the web, scrape webpages, extract text content, download files including .pdf, .jpg, .png, .docx, etc.\\nπ¦\\n\\nCTRL + K'}]}\n"
|
360 |
+
]
|
361 |
+
}
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"source": [
|
367 |
+
"print( \"URL:\", data_res['data'][0]['meta']['url'] )\n",
|
368 |
+
"print( \"Title:\", data_res['data'][0]['meta']['meta']['title'] )\n",
|
369 |
+
"print( \"Content:\", data_res['data'][0]['text'][0:500], \"...\" )"
|
370 |
+
],
|
371 |
+
"metadata": {
|
372 |
+
"colab": {
|
373 |
+
"base_uri": "https://localhost:8080/"
|
374 |
+
},
|
375 |
+
"id": "F8VEQvJkITLJ",
|
376 |
+
"outputId": "b54ec108-7221-4230-8b61-d0a4be503a66"
|
377 |
+
},
|
378 |
+
"execution_count": 11,
|
379 |
+
"outputs": [
|
380 |
+
{
|
381 |
+
"output_type": "stream",
|
382 |
+
"name": "stdout",
|
383 |
+
"text": [
|
384 |
+
"URL: https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/graphs/\n",
|
385 |
+
"Title: Knowledge Graphs - LlamaIndex\n",
|
386 |
+
"Content: \n",
|
387 |
+
"[ Skip to content ](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/graphs/#knowledge-graphs)\n",
|
388 |
+
"#Knowledge Graphs[#](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/graphs/#knowledge-graphs)\n",
|
389 |
+
"LlamaIndex contains some fantastic guides for building with knowledge graphs.\n",
|
390 |
+
"\n",
|
391 |
+
"Check out the end-to-end tutorials/workshops below. Also check out our [knowledge graph query engine guides](https://docs.llamaindex.ai/en/stable/module_guides/deploying/query_ ...\n"
|
392 |
+
]
|
393 |
+
}
|
394 |
+
]
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"cell_type": "markdown",
|
398 |
+
"source": [
|
399 |
+
"## Convert to Document"
|
400 |
+
],
|
401 |
+
"metadata": {
|
402 |
+
"id": "rt2nyuLhSYLR"
|
403 |
+
}
|
404 |
+
},
|
405 |
+
{
|
406 |
+
"cell_type": "code",
|
407 |
+
"source": [
|
408 |
+
"from llama_index.core.schema import Document\n",
|
409 |
+
"\n",
|
410 |
+
"# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
|
411 |
+
"documents = [Document(text=row['text'], metadata={\"title\": row['meta']['meta']['title'], \"url\": row['meta']['url']}) for row in data_res['data']]"
|
412 |
+
],
|
413 |
+
"metadata": {
|
414 |
+
"id": "YEieGzSFSXas"
|
415 |
+
},
|
416 |
+
"execution_count": 12,
|
417 |
+
"outputs": []
|
418 |
+
},
|
419 |
+
{
|
420 |
+
"cell_type": "markdown",
|
421 |
+
"source": [
|
422 |
+
"# Create RAG Pipeline"
|
423 |
+
],
|
424 |
+
"metadata": {
|
425 |
+
"id": "vqbJG5a1i3Jo"
|
426 |
+
}
|
427 |
+
},
|
428 |
+
{
|
429 |
+
"cell_type": "code",
|
430 |
+
"source": [
|
431 |
+
"from llama_index.llms.openai import OpenAI\n",
|
432 |
+
"\n",
|
433 |
+
"llm = OpenAI(model=\"gpt-3.5-turbo\")"
|
434 |
+
],
|
435 |
+
"metadata": {
|
436 |
+
"id": "wxmiQDv3SXV6"
|
437 |
+
},
|
438 |
+
"execution_count": 18,
|
439 |
+
"outputs": []
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"cell_type": "code",
|
443 |
+
"source": [
|
444 |
+
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
|
445 |
+
"\n",
|
446 |
+
"embed_model = OpenAIEmbedding(model=\"text-embedding-3-large\")"
|
447 |
+
],
|
448 |
+
"metadata": {
|
449 |
+
"id": "tCVhv4OkSXTV"
|
450 |
+
},
|
451 |
+
"execution_count": 19,
|
452 |
+
"outputs": []
|
453 |
+
},
|
454 |
+
{
|
455 |
+
"cell_type": "code",
|
456 |
+
"source": [
|
457 |
+
"from llama_index.core.node_parser import SentenceSplitter\n",
|
458 |
+
"\n",
|
459 |
+
"text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)"
|
460 |
+
],
|
461 |
+
"metadata": {
|
462 |
+
"id": "quwJI61dNVr-"
|
463 |
+
},
|
464 |
+
"execution_count": null,
|
465 |
+
"outputs": []
|
466 |
+
},
|
467 |
+
{
|
468 |
+
"cell_type": "code",
|
469 |
+
"source": [
|
470 |
+
"from llama_index.core import Settings\n",
|
471 |
+
"\n",
|
472 |
+
"Settings.llm = llm\n",
|
473 |
+
"Settings.embed_model = embed_model\n",
|
474 |
+
"Settings.text_splitter = text_splitter"
|
475 |
+
],
|
476 |
+
"metadata": {
|
477 |
+
"id": "6KpeCRMBUgup"
|
478 |
+
},
|
479 |
+
"execution_count": 21,
|
480 |
+
"outputs": []
|
481 |
+
},
|
482 |
+
{
|
483 |
+
"cell_type": "code",
|
484 |
+
"source": [
|
485 |
+
"from llama_index.core import VectorStoreIndex\n",
|
486 |
+
"\n",
|
487 |
+
"index = VectorStoreIndex.from_documents( documents )"
|
488 |
+
],
|
489 |
+
"metadata": {
|
490 |
+
"id": "nWTBidwoZSO0"
|
491 |
+
},
|
492 |
+
"execution_count": 22,
|
493 |
+
"outputs": []
|
494 |
+
},
|
495 |
+
{
|
496 |
+
"cell_type": "code",
|
497 |
+
"source": [
|
498 |
+
"query_engine = index.as_query_engine()"
|
499 |
+
],
|
500 |
+
"metadata": {
|
501 |
+
"id": "RUuJO0IIYSeU"
|
502 |
+
},
|
503 |
+
"execution_count": 25,
|
504 |
+
"outputs": []
|
505 |
+
},
|
506 |
+
{
|
507 |
+
"cell_type": "code",
|
508 |
+
"source": [
|
509 |
+
"res = query_engine.query(\"What is a query engine?\")"
|
510 |
+
],
|
511 |
+
"metadata": {
|
512 |
+
"id": "6_s2LkH6YX1V"
|
513 |
+
},
|
514 |
+
"execution_count": 26,
|
515 |
+
"outputs": []
|
516 |
+
},
|
517 |
+
{
|
518 |
+
"cell_type": "code",
|
519 |
+
"source": [
|
520 |
+
"res.response"
|
521 |
+
],
|
522 |
+
"metadata": {
|
523 |
+
"colab": {
|
524 |
+
"base_uri": "https://localhost:8080/",
|
525 |
+
"height": 71
|
526 |
+
},
|
527 |
+
"id": "02zdJNqIZKep",
|
528 |
+
"outputId": "76340610-0d98-4fd0-d237-ddb9f1752391"
|
529 |
+
},
|
530 |
+
"execution_count": 28,
|
531 |
+
"outputs": [
|
532 |
+
{
|
533 |
+
"output_type": "execute_result",
|
534 |
+
"data": {
|
535 |
+
"text/plain": [
|
536 |
+
"'A query engine is a fundamental component used in querying processes. It is responsible for retrieving the most relevant documents from an index based on a query, postprocessing the retrieved nodes if needed, and then synthesizing a response by combining the query, relevant data, and prompt to be sent to the language model for generating an answer.'"
|
537 |
+
],
|
538 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
539 |
+
"type": "string"
|
540 |
+
}
|
541 |
+
},
|
542 |
+
"metadata": {},
|
543 |
+
"execution_count": 28
|
544 |
+
}
|
545 |
+
]
|
546 |
+
},
|
547 |
+
{
|
548 |
+
"cell_type": "code",
|
549 |
+
"source": [
|
550 |
+
"# Show the retrieved nodes\n",
|
551 |
+
"for src in res.source_nodes:\n",
|
552 |
+
" print(\"Node ID\\t\", src.node_id)\n",
|
553 |
+
" print(\"Title\\t\", src.metadata['title'])\n",
|
554 |
+
" print(\"URL\\t\", src.metadata['url'])\n",
|
555 |
+
" print(\"Score\\t\", src.score)\n",
|
556 |
+
" print(\"-_\"*20)"
|
557 |
+
],
|
558 |
+
"metadata": {
|
559 |
+
"colab": {
|
560 |
+
"base_uri": "https://localhost:8080/"
|
561 |
+
},
|
562 |
+
"id": "PuCcgP0nZSIl",
|
563 |
+
"outputId": "e136cdbb-2ee4-4dfb-f532-f6c9365e519e"
|
564 |
+
},
|
565 |
+
"execution_count": 32,
|
566 |
+
"outputs": [
|
567 |
+
{
|
568 |
+
"output_type": "stream",
|
569 |
+
"name": "stdout",
|
570 |
+
"text": [
|
571 |
+
"Node ID\t 081b6c8c-d9ea-4476-bac0-1008facd3db8\n",
|
572 |
+
"Title\t Querying - LlamaIndex\n",
|
573 |
+
"URL\t https://docs.llamaindex.ai/en/stable/understanding/querying/querying/\n",
|
574 |
+
"Score\t 0.46212738505767387\n",
|
575 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
|
576 |
+
"Node ID\t 3786c195-c5de-4bba-98b6-996031349a88\n",
|
577 |
+
"Title\t Querying - LlamaIndex\n",
|
578 |
+
"URL\t https://docs.llamaindex.ai/en/stable/understanding/querying/querying/\n",
|
579 |
+
"Score\t 0.43141762602042416\n",
|
580 |
+
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
|
581 |
+
]
|
582 |
+
}
|
583 |
+
]
|
584 |
+
}
|
585 |
+
]
|
586 |
+
}
|