Spaces:
Sleeping
Sleeping
Vamsikrishna Chemudupati
committed on
Commit
•
f755dcf
1
Parent(s):
f52b26d
Using a Vector DB lesson modified notebook
Browse files- notebooks/04-RAG_with_VectorStore.ipynb +436 -330
notebooks/04-RAG_with_VectorStore.ipynb
CHANGED
@@ -1,345 +1,451 @@
|
|
1 |
{
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
"source": [
|
12 |
-
"<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
13 |
-
]
|
14 |
-
},
|
15 |
-
{
|
16 |
-
"cell_type": "markdown",
|
17 |
-
"metadata": {
|
18 |
-
"id": "5BGJ3fxhOk2V"
|
19 |
-
},
|
20 |
-
"source": [
|
21 |
-
"# Install Packages and Setup Variables"
|
22 |
-
]
|
23 |
-
},
|
24 |
-
{
|
25 |
-
"cell_type": "code",
|
26 |
-
"execution_count": 23,
|
27 |
-
"metadata": {
|
28 |
-
"colab": {
|
29 |
-
"base_uri": "https://localhost:8080/"
|
30 |
},
|
31 |
-
"id": "QPJzr-I9XQ7l",
|
32 |
-
"outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
|
33 |
-
},
|
34 |
-
"outputs": [],
|
35 |
-
"source": [
|
36 |
-
"!pip install -q llama-index==0.10.5 openai==1.12.0 cohere==4.47 tiktoken==0.6.0 chromadb==0.4.22"
|
37 |
-
]
|
38 |
-
},
|
39 |
-
{
|
40 |
-
"cell_type": "code",
|
41 |
-
"execution_count": 24,
|
42 |
-
"metadata": {
|
43 |
-
"id": "riuXwpSPcvWC"
|
44 |
-
},
|
45 |
-
"outputs": [],
|
46 |
-
"source": [
|
47 |
-
"import os\n",
|
48 |
-
"\n",
|
49 |
-
"# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
|
50 |
-
"os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
|
51 |
-
]
|
52 |
-
},
|
53 |
-
{
|
54 |
-
"cell_type": "markdown",
|
55 |
-
"metadata": {
|
56 |
-
"id": "I9JbAzFcjkpn"
|
57 |
-
},
|
58 |
-
"source": [
|
59 |
-
"# Load the Dataset (CSV)"
|
60 |
-
]
|
61 |
-
},
|
62 |
-
{
|
63 |
-
"cell_type": "markdown",
|
64 |
-
"metadata": {
|
65 |
-
"id": "_Tif8-JoRH68"
|
66 |
-
},
|
67 |
-
"source": [
|
68 |
-
"## Download"
|
69 |
-
]
|
70 |
-
},
|
71 |
-
{
|
72 |
-
"cell_type": "markdown",
|
73 |
-
"metadata": {
|
74 |
-
"id": "4fQaa1LN1mXL"
|
75 |
-
},
|
76 |
-
"source": [
|
77 |
-
"The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
|
78 |
-
]
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"cell_type": "code",
|
82 |
-
"execution_count": 25,
|
83 |
-
"metadata": {
|
84 |
-
"id": "-QTUkdfJjY4N"
|
85 |
-
},
|
86 |
-
"outputs": [
|
87 |
{
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
"
|
93 |
-
|
94 |
-
|
95 |
-
}
|
96 |
-
],
|
97 |
-
"source": [
|
98 |
-
"!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
|
99 |
-
]
|
100 |
-
},
|
101 |
-
{
|
102 |
-
"cell_type": "markdown",
|
103 |
-
"metadata": {
|
104 |
-
"id": "zk-4alIxROo8"
|
105 |
-
},
|
106 |
-
"source": [
|
107 |
-
"## Read File"
|
108 |
-
]
|
109 |
-
},
|
110 |
-
{
|
111 |
-
"cell_type": "code",
|
112 |
-
"execution_count": 26,
|
113 |
-
"metadata": {
|
114 |
-
"colab": {
|
115 |
-
"base_uri": "https://localhost:8080/"
|
116 |
},
|
117 |
-
"id": "7CYwRT6R0o0I",
|
118 |
-
"outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
|
119 |
-
},
|
120 |
-
"outputs": [
|
121 |
{
|
122 |
-
|
123 |
-
"
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
]
|
126 |
-
},
|
127 |
-
"execution_count": 26,
|
128 |
-
"metadata": {},
|
129 |
-
"output_type": "execute_result"
|
130 |
-
}
|
131 |
-
],
|
132 |
-
"source": [
|
133 |
-
"import csv\n",
|
134 |
-
"\n",
|
135 |
-
"text = \"\"\n",
|
136 |
-
"\n",
|
137 |
-
"# Load the file as a JSON\n",
|
138 |
-
"with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
|
139 |
-
" csv_reader = csv.reader(file)\n",
|
140 |
-
"\n",
|
141 |
-
" for row in csv_reader:\n",
|
142 |
-
" text += row[0]\n",
|
143 |
-
"\n",
|
144 |
-
"# The number of characters in the dataset.\n",
|
145 |
-
"len( text )"
|
146 |
-
]
|
147 |
-
},
|
148 |
-
{
|
149 |
-
"cell_type": "markdown",
|
150 |
-
"metadata": {
|
151 |
-
"id": "S17g2RYOjmf2"
|
152 |
-
},
|
153 |
-
"source": [
|
154 |
-
"# Chunking"
|
155 |
-
]
|
156 |
-
},
|
157 |
-
{
|
158 |
-
"cell_type": "code",
|
159 |
-
"execution_count": 27,
|
160 |
-
"metadata": {
|
161 |
-
"colab": {
|
162 |
-
"base_uri": "https://localhost:8080/"
|
163 |
},
|
164 |
-
"id": "STACTMUR1z9N",
|
165 |
-
"outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
|
166 |
-
},
|
167 |
-
"outputs": [
|
168 |
{
|
169 |
-
|
170 |
-
"
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
]
|
173 |
-
},
|
174 |
-
"execution_count": 27,
|
175 |
-
"metadata": {},
|
176 |
-
"output_type": "execute_result"
|
177 |
}
|
178 |
-
|
179 |
-
|
180 |
-
"chunk_size = 512\n",
|
181 |
-
"chunks = []\n",
|
182 |
-
"\n",
|
183 |
-
"# Split the long text into smaller manageable chunks of 512 characters.\n",
|
184 |
-
"for i in range(0, len(text), chunk_size):\n",
|
185 |
-
" chunks.append(text[i:i + chunk_size])\n",
|
186 |
-
"\n",
|
187 |
-
"len( chunks )"
|
188 |
-
]
|
189 |
-
},
|
190 |
-
{
|
191 |
-
"cell_type": "code",
|
192 |
-
"execution_count": 28,
|
193 |
-
"metadata": {
|
194 |
-
"id": "CtdsIUQ81_hT"
|
195 |
-
},
|
196 |
-
"outputs": [],
|
197 |
-
"source": [
|
198 |
-
"from llama_index.core import Document\n",
|
199 |
-
"\n",
|
200 |
-
"# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
|
201 |
-
"documents = [Document(text=t) for t in chunks]"
|
202 |
-
]
|
203 |
-
},
|
204 |
-
{
|
205 |
-
"cell_type": "markdown",
|
206 |
-
"metadata": {
|
207 |
-
"id": "OWaT6rL7ksp8"
|
208 |
-
},
|
209 |
-
"source": [
|
210 |
-
"# Save on Chroma"
|
211 |
-
]
|
212 |
-
},
|
213 |
-
{
|
214 |
-
"cell_type": "code",
|
215 |
-
"execution_count": 29,
|
216 |
-
"metadata": {
|
217 |
-
"id": "mXi56KTXk2sp"
|
218 |
-
},
|
219 |
-
"outputs": [],
|
220 |
-
"source": [
|
221 |
-
"import chromadb\n",
|
222 |
-
"\n",
|
223 |
-
"# create client and a new collection\n",
|
224 |
-
"# chromadb.EphemeralClient saves data in-memory.\n",
|
225 |
-
"chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
|
226 |
-
"chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
|
227 |
-
]
|
228 |
-
},
|
229 |
-
{
|
230 |
-
"cell_type": "code",
|
231 |
-
"execution_count": 30,
|
232 |
-
"metadata": {
|
233 |
-
"id": "jKXURvLtkuTS"
|
234 |
-
},
|
235 |
-
"outputs": [],
|
236 |
-
"source": [
|
237 |
-
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
|
238 |
-
"from llama_index.core import StorageContext\n",
|
239 |
-
"\n",
|
240 |
-
"# Define a storage context object using the created vector database.\n",
|
241 |
-
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
|
242 |
-
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
|
243 |
-
]
|
244 |
-
},
|
245 |
-
{
|
246 |
-
"cell_type": "code",
|
247 |
-
"execution_count": 31,
|
248 |
-
"metadata": {
|
249 |
-
"id": "WsD52wtrlESi"
|
250 |
-
},
|
251 |
-
"outputs": [],
|
252 |
-
"source": [
|
253 |
-
"from llama_index.core import VectorStoreIndex\n",
|
254 |
-
"\n",
|
255 |
-
"# Add the documents to the database and create Index / embeddings\n",
|
256 |
-
"index = VectorStoreIndex.from_documents(\n",
|
257 |
-
" documents, storage_context=storage_context\n",
|
258 |
-
")"
|
259 |
-
]
|
260 |
-
},
|
261 |
-
{
|
262 |
-
"cell_type": "markdown",
|
263 |
-
"metadata": {
|
264 |
-
"id": "8JPD8yAinVSq"
|
265 |
-
},
|
266 |
-
"source": [
|
267 |
-
"# Query Dataset"
|
268 |
-
]
|
269 |
-
},
|
270 |
-
{
|
271 |
-
"cell_type": "code",
|
272 |
-
"execution_count": 32,
|
273 |
-
"metadata": {
|
274 |
-
"id": "mzS13x1ZlZ5X"
|
275 |
-
},
|
276 |
-
"outputs": [],
|
277 |
-
"source": [
|
278 |
-
"from llama_index.llms.openai import OpenAI\n",
|
279 |
-
"# Define a query engine that is responsible for retrieving related pieces of text,\n",
|
280 |
-
"# and using a LLM to formulate the final answer.\n",
|
281 |
-
"\n",
|
282 |
-
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)\n",
|
283 |
-
"query_engine = index.as_query_engine(llm=llm)"
|
284 |
-
]
|
285 |
-
},
|
286 |
-
{
|
287 |
-
"cell_type": "code",
|
288 |
-
"execution_count": 33,
|
289 |
-
"metadata": {
|
290 |
"colab": {
|
291 |
-
|
292 |
},
|
293 |
-
"
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
{
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
303 |
}
|
304 |
-
],
|
305 |
-
"source": [
|
306 |
-
"response = query_engine.query(\n",
|
307 |
-
" \"How many parameters LLaMA2 model has?\"\n",
|
308 |
-
")\n",
|
309 |
-
"print(response)"
|
310 |
-
]
|
311 |
-
},
|
312 |
-
{
|
313 |
-
"cell_type": "code",
|
314 |
-
"execution_count": null,
|
315 |
-
"metadata": {},
|
316 |
-
"outputs": [],
|
317 |
-
"source": []
|
318 |
-
}
|
319 |
-
],
|
320 |
-
"metadata": {
|
321 |
-
"colab": {
|
322 |
-
"authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
|
323 |
-
"include_colab_link": true,
|
324 |
-
"provenance": []
|
325 |
-
},
|
326 |
-
"kernelspec": {
|
327 |
-
"display_name": "Python 3",
|
328 |
-
"name": "python3"
|
329 |
},
|
330 |
-
"
|
331 |
-
|
332 |
-
"name": "ipython",
|
333 |
-
"version": 3
|
334 |
-
},
|
335 |
-
"file_extension": ".py",
|
336 |
-
"mimetype": "text/x-python",
|
337 |
-
"name": "python",
|
338 |
-
"nbconvert_exporter": "python",
|
339 |
-
"pygments_lexer": "ipython3",
|
340 |
-
"version": "3.11.8"
|
341 |
-
}
|
342 |
-
},
|
343 |
-
"nbformat": 4,
|
344 |
-
"nbformat_minor": 0
|
345 |
}
|
|
|
1 |
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {
|
6 |
+
"id": "view-in-github"
|
7 |
+
},
|
8 |
+
"source": [
|
9 |
+
"<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
10 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
{
|
13 |
+
"cell_type": "markdown",
|
14 |
+
"metadata": {
|
15 |
+
"id": "5BGJ3fxhOk2V"
|
16 |
+
},
|
17 |
+
"source": [
|
18 |
+
"# Install Packages and Setup Variables"
|
19 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
},
|
|
|
|
|
|
|
|
|
21 |
{
|
22 |
+
"cell_type": "code",
|
23 |
+
"execution_count": 1,
|
24 |
+
"metadata": {
|
25 |
+
"id": "QPJzr-I9XQ7l"
|
26 |
+
},
|
27 |
+
"outputs": [],
|
28 |
+
"source": [
|
29 |
+
"!pip install -q llama-index==0.10.5 llama-index-vector-stores-chroma==0.1.7 langchain==0.1.17 langchain-chroma==0.1.0 langchain_openai==0.1.5 openai==1.12.0 cohere==4.47 tiktoken==0.6.0 chromadb==0.4.22"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 2,
|
35 |
+
"metadata": {
|
36 |
+
"id": "riuXwpSPcvWC"
|
37 |
+
},
|
38 |
+
"outputs": [],
|
39 |
+
"source": [
|
40 |
+
"import os\n",
|
41 |
+
"\n",
|
42 |
+
"# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
|
43 |
+
"os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
|
44 |
+
]
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"cell_type": "markdown",
|
48 |
+
"metadata": {
|
49 |
+
"id": "I9JbAzFcjkpn"
|
50 |
+
},
|
51 |
+
"source": [
|
52 |
+
"# Load the Dataset (CSV)"
|
53 |
+
]
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"cell_type": "markdown",
|
57 |
+
"metadata": {
|
58 |
+
"id": "_Tif8-JoRH68"
|
59 |
+
},
|
60 |
+
"source": [
|
61 |
+
"## Download"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "markdown",
|
66 |
+
"metadata": {
|
67 |
+
"id": "4fQaa1LN1mXL"
|
68 |
+
},
|
69 |
+
"source": [
|
70 |
+
"The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": 3,
|
76 |
+
"metadata": {
|
77 |
+
"colab": {
|
78 |
+
"base_uri": "https://localhost:8080/"
|
79 |
+
},
|
80 |
+
"id": "-QTUkdfJjY4N",
|
81 |
+
"outputId": "a88b2f8a-0c84-45a0-9b32-5088fe596612"
|
82 |
+
},
|
83 |
+
"outputs": [
|
84 |
+
{
|
85 |
+
"name": "stdout",
|
86 |
+
"output_type": "stream",
|
87 |
+
"text": [
|
88 |
+
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
|
89 |
+
" Dload Upload Total Spent Left Speed\n",
|
90 |
+
"100 169k 100 169k 0 0 277k 0 --:--:-- --:--:-- --:--:-- 281k\n"
|
91 |
+
]
|
92 |
+
}
|
93 |
+
],
|
94 |
+
"source": [
|
95 |
+
"!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "markdown",
|
100 |
+
"metadata": {
|
101 |
+
"id": "zk-4alIxROo8"
|
102 |
+
},
|
103 |
+
"source": [
|
104 |
+
"## Read File"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"cell_type": "code",
|
109 |
+
"execution_count": 4,
|
110 |
+
"metadata": {
|
111 |
+
"colab": {
|
112 |
+
"base_uri": "https://localhost:8080/"
|
113 |
+
},
|
114 |
+
"id": "7CYwRT6R0o0I",
|
115 |
+
"outputId": "351f170f-9a00-4b09-ae08-b45c3c48fce5"
|
116 |
+
},
|
117 |
+
"outputs": [
|
118 |
+
{
|
119 |
+
"data": {
|
120 |
+
"text/plain": [
|
121 |
+
"841"
|
122 |
+
]
|
123 |
+
},
|
124 |
+
"execution_count": 4,
|
125 |
+
"metadata": {},
|
126 |
+
"output_type": "execute_result"
|
127 |
+
}
|
128 |
+
],
|
129 |
+
"source": [
|
130 |
+
"import csv\n",
|
131 |
+
"\n",
|
132 |
+
"text = \"\"\n",
|
133 |
+
"\n",
|
134 |
+
"# Read the CSV file and concatenate the first column of every row\n",
|
135 |
+
"with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
|
136 |
+
" csv_reader = csv.reader(file)\n",
|
137 |
+
"\n",
|
138 |
+
" for row in csv_reader:\n",
|
139 |
+
" text += row[0]\n",
|
140 |
+
"\n",
|
141 |
+
"# The number of characters in the dataset.\n",
|
142 |
+
"len( text )"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"cell_type": "markdown",
|
147 |
+
"metadata": {
|
148 |
+
"id": "S17g2RYOjmf2"
|
149 |
+
},
|
150 |
+
"source": [
|
151 |
+
"# Chunking"
|
152 |
+
]
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"cell_type": "code",
|
156 |
+
"execution_count": 5,
|
157 |
+
"metadata": {
|
158 |
+
"colab": {
|
159 |
+
"base_uri": "https://localhost:8080/"
|
160 |
+
},
|
161 |
+
"id": "STACTMUR1z9N",
|
162 |
+
"outputId": "15a61eac-8774-4cdb-db8d-e2eb5b07e517"
|
163 |
+
},
|
164 |
+
"outputs": [
|
165 |
+
{
|
166 |
+
"data": {
|
167 |
+
"text/plain": [
|
168 |
+
"2"
|
169 |
+
]
|
170 |
+
},
|
171 |
+
"execution_count": 5,
|
172 |
+
"metadata": {},
|
173 |
+
"output_type": "execute_result"
|
174 |
+
}
|
175 |
+
],
|
176 |
+
"source": [
|
177 |
+
"chunk_size = 512\n",
|
178 |
+
"chunks = []\n",
|
179 |
+
"\n",
|
180 |
+
"# Split the long text into smaller manageable chunks of 512 characters.\n",
|
181 |
+
"for i in range(0, len(text), chunk_size):\n",
|
182 |
+
" chunks.append(text[i:i + chunk_size])\n",
|
183 |
+
"\n",
|
184 |
+
"len( chunks )"
|
185 |
+
]
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"cell_type": "markdown",
|
189 |
+
"metadata": {
|
190 |
+
"id": "9fOomeMGqu10"
|
191 |
+
},
|
192 |
+
"source": [
|
193 |
+
"# Interface of Chroma with LlamaIndex"
|
194 |
+
]
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"cell_type": "code",
|
198 |
+
"execution_count": 6,
|
199 |
+
"metadata": {
|
200 |
+
"id": "CtdsIUQ81_hT"
|
201 |
+
},
|
202 |
+
"outputs": [],
|
203 |
+
"source": [
|
204 |
+
"from llama_index.core import Document\n",
|
205 |
+
"\n",
|
206 |
+
"# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
|
207 |
+
"documents = [Document(text=t) for t in chunks]"
|
208 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
},
|
|
|
|
|
|
|
|
|
210 |
{
|
211 |
+
"cell_type": "markdown",
|
212 |
+
"metadata": {
|
213 |
+
"id": "OWaT6rL7ksp8"
|
214 |
+
},
|
215 |
+
"source": [
|
216 |
+
"## Save on Chroma\n",
|
217 |
+
"\n"
|
218 |
+
]
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"cell_type": "code",
|
222 |
+
"execution_count": 7,
|
223 |
+
"metadata": {
|
224 |
+
"id": "mXi56KTXk2sp"
|
225 |
+
},
|
226 |
+
"outputs": [],
|
227 |
+
"source": [
|
228 |
+
"import chromadb\n",
|
229 |
+
"\n",
|
230 |
+
"# create client and a new collection\n",
|
231 |
+
"# chromadb.EphemeralClient saves data in-memory.\n",
|
232 |
+
"chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
|
233 |
+
"chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
|
234 |
+
]
|
235 |
+
},
|
236 |
+
{
|
237 |
+
"cell_type": "code",
|
238 |
+
"execution_count": 8,
|
239 |
+
"metadata": {
|
240 |
+
"id": "jKXURvLtkuTS"
|
241 |
+
},
|
242 |
+
"outputs": [],
|
243 |
+
"source": [
|
244 |
+
"from llama_index.vector_stores.chroma import ChromaVectorStore\n",
|
245 |
+
"from llama_index.core import StorageContext\n",
|
246 |
+
"# Define a storage context object using the created vector database.\n",
|
247 |
+
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
|
248 |
+
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
|
249 |
+
]
|
250 |
+
},
|
251 |
+
{
|
252 |
+
"cell_type": "code",
|
253 |
+
"execution_count": 9,
|
254 |
+
"metadata": {
|
255 |
+
"id": "WsD52wtrlESi"
|
256 |
+
},
|
257 |
+
"outputs": [],
|
258 |
+
"source": [
|
259 |
+
"from llama_index.core import VectorStoreIndex\n",
|
260 |
+
"\n",
|
261 |
+
"# Add the documents to the database and create Index / embeddings\n",
|
262 |
+
"index = VectorStoreIndex.from_documents(\n",
|
263 |
+
" documents, storage_context=storage_context\n",
|
264 |
+
")"
|
265 |
+
]
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"cell_type": "markdown",
|
269 |
+
"metadata": {
|
270 |
+
"id": "8JPD8yAinVSq"
|
271 |
+
},
|
272 |
+
"source": [
|
273 |
+
"## Query Dataset"
|
274 |
+
]
|
275 |
+
},
|
276 |
+
{
|
277 |
+
"cell_type": "code",
|
278 |
+
"execution_count": 10,
|
279 |
+
"metadata": {
|
280 |
+
"id": "mzS13x1ZlZ5X"
|
281 |
+
},
|
282 |
+
"outputs": [],
|
283 |
+
"source": [
|
284 |
+
"from llama_index.llms.openai import OpenAI\n",
|
285 |
+
"# Define a query engine that is responsible for retrieving related pieces of text,\n",
|
286 |
+
"# and using an LLM to formulate the final answer.\n",
|
287 |
+
"\n",
|
288 |
+
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)\n",
|
289 |
+
"query_engine = index.as_query_engine(llm=llm)"
|
290 |
+
]
|
291 |
+
},
|
292 |
+
{
|
293 |
+
"cell_type": "code",
|
294 |
+
"execution_count": 11,
|
295 |
+
"metadata": {
|
296 |
+
"colab": {
|
297 |
+
"base_uri": "https://localhost:8080/"
|
298 |
+
},
|
299 |
+
"id": "AYsQ4uLN_Oxg",
|
300 |
+
"outputId": "5066a06c-77ff-48a2-ee61-3abe2e9755e2"
|
301 |
+
},
|
302 |
+
"outputs": [
|
303 |
+
{
|
304 |
+
"name": "stdout",
|
305 |
+
"output_type": "stream",
|
306 |
+
"text": [
|
307 |
+
"The LLaMA2 model has 7 billion parameters.\n"
|
308 |
+
]
|
309 |
+
}
|
310 |
+
],
|
311 |
+
"source": [
|
312 |
+
"response = query_engine.query(\n",
|
313 |
+
" \"How many parameters LLaMA2 model has?\"\n",
|
314 |
+
")\n",
|
315 |
+
"print(response)"
|
316 |
+
]
|
317 |
+
},
|
318 |
+
{
|
319 |
+
"cell_type": "markdown",
|
320 |
+
"metadata": {
|
321 |
+
"id": "kWK571VNg-qR"
|
322 |
+
},
|
323 |
+
"source": [
|
324 |
+
"# Interface of Chroma with LangChain"
|
325 |
+
]
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"cell_type": "code",
|
329 |
+
"execution_count": 12,
|
330 |
+
"metadata": {
|
331 |
+
"id": "SMPAniL2e4NP"
|
332 |
+
},
|
333 |
+
"outputs": [],
|
334 |
+
"source": [
|
335 |
+
"from langchain.schema.document import Document\n",
|
336 |
+
"# Convert the chunks to Document objects so the LangChain framework can process them.\n",
|
337 |
+
"documents = [Document(page_content=t) for t in chunks]"
|
338 |
+
]
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"cell_type": "markdown",
|
342 |
+
"metadata": {
|
343 |
+
"id": "QBt8qGxArUPD"
|
344 |
+
},
|
345 |
+
"source": [
|
346 |
+
"## Save on Chroma"
|
347 |
+
]
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"cell_type": "code",
|
351 |
+
"execution_count": 13,
|
352 |
+
"metadata": {
|
353 |
+
"id": "2xas7HkuhJ8A"
|
354 |
+
},
|
355 |
+
"outputs": [],
|
356 |
+
"source": [
|
357 |
+
"from langchain_chroma import Chroma\n",
|
358 |
+
"from langchain_openai import OpenAIEmbeddings\n",
|
359 |
+
"# Embed the documents and store them in a Chroma collection persisted on disk.\n",
|
360 |
+
"# chromadb.EphemeralClient saves data in-memory.\n",
|
361 |
+
"# Add the documents to the database and create Index / embeddings\n",
|
362 |
+
"\n",
|
363 |
+
"embeddings = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n",
|
364 |
+
"chroma_db = Chroma.from_documents(\n",
|
365 |
+
" documents=documents,\n",
|
366 |
+
" embedding=embeddings,\n",
|
367 |
+
" persist_directory=\"./mini-chunked-dataset\",\n",
|
368 |
+
" collection_name=\"mini-chunked-dataset\"\n",
|
369 |
+
")"
|
370 |
+
]
|
371 |
+
},
|
372 |
+
{
|
373 |
+
"cell_type": "markdown",
|
374 |
+
"metadata": {
|
375 |
+
"id": "P8AXJJyBrZWF"
|
376 |
+
},
|
377 |
+
"source": [
|
378 |
+
"## Query Dataset"
|
379 |
+
]
|
380 |
+
},
|
381 |
+
{
|
382 |
+
"cell_type": "code",
|
383 |
+
"execution_count": 14,
|
384 |
+
"metadata": {
|
385 |
+
"id": "-H64YLxshM2b"
|
386 |
+
},
|
387 |
+
"outputs": [],
|
388 |
+
"source": [
|
389 |
+
"from langchain_openai import ChatOpenAI\n",
|
390 |
+
"# Define a query engine that is responsible for retrieving related pieces of text,\n",
|
391 |
+
"# and using an LLM to formulate the final answer.\n",
|
392 |
+
"\n",
|
393 |
+
"llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)"
|
394 |
+
]
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"cell_type": "code",
|
398 |
+
"execution_count": 16,
|
399 |
+
"metadata": {
|
400 |
+
"colab": {
|
401 |
+
"base_uri": "https://localhost:8080/"
|
402 |
+
},
|
403 |
+
"id": "AxBqPNtthPaa",
|
404 |
+
"outputId": "93c9ad64-1cd1-4f52-c51e-6f3ec5d6542d"
|
405 |
+
},
|
406 |
+
"outputs": [
|
407 |
+
{
|
408 |
+
"name": "stdout",
|
409 |
+
"output_type": "stream",
|
410 |
+
"text": [
|
411 |
+
"The LLaMA-2 model has 7 billion parameters.\n"
|
412 |
+
]
|
413 |
+
}
|
414 |
+
],
|
415 |
+
"source": [
|
416 |
+
"from langchain.chains import RetrievalQA\n",
|
417 |
+
"query = \"How many parameters LLaMA2 model has?\"\n",
|
418 |
+
"retriever = chroma_db.as_retriever(search_kwargs={\"k\": 2})\n",
|
419 |
+
"chain = RetrievalQA.from_chain_type(llm=llm,\n",
|
420 |
+
" chain_type=\"stuff\",\n",
|
421 |
+
" retriever=retriever)\n",
|
422 |
+
"\n",
|
423 |
+
"response = chain(query)\n",
|
424 |
+
"print(response[\"result\"])"
|
425 |
]
|
|
|
|
|
|
|
|
|
426 |
}
|
427 |
+
],
|
428 |
+
"metadata": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
429 |
"colab": {
|
430 |
+
"provenance": []
|
431 |
},
|
432 |
+
"kernelspec": {
|
433 |
+
"display_name": "Python 3",
|
434 |
+
"name": "python3"
|
435 |
+
},
|
436 |
+
"language_info": {
|
437 |
+
"codemirror_mode": {
|
438 |
+
"name": "ipython",
|
439 |
+
"version": 3
|
440 |
+
},
|
441 |
+
"file_extension": ".py",
|
442 |
+
"mimetype": "text/x-python",
|
443 |
+
"name": "python",
|
444 |
+
"nbconvert_exporter": "python",
|
445 |
+
"pygments_lexer": "ipython3",
|
446 |
+
"version": "3.11.8"
|
447 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
448 |
},
|
449 |
+
"nbformat": 4,
|
450 |
+
"nbformat_minor": 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
451 |
}
|