AlaFalaki committed on
Commit
fbacdb1
•
1 Parent(s): 579611d

Created using Colaboratory

Browse files
notebooks/03-RAG_with_LlamaIndex.ipynb CHANGED
@@ -4,7 +4,7 @@
4
  "metadata": {
5
  "colab": {
6
  "provenance": [],
7
- "authorship_tag": "ABX9TyNbBT3cLvlEHCfKEcPSqeML",
8
  "include_colab_link": true
9
  },
10
  "kernelspec": {
@@ -26,9 +26,18 @@
26
  "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/03-RAG_with_LlamaIndex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
27
  ]
28
  },
 
 
 
 
 
 
 
 
 
29
  {
30
  "cell_type": "code",
31
- "execution_count": 1,
32
  "metadata": {
33
  "colab": {
34
  "base_uri": "https://localhost:8080/"
@@ -66,14 +75,42 @@
66
  "source": [
67
  "import os\n",
68
  "\n",
 
69
  "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
70
  ],
71
  "metadata": {
72
  "id": "XuzgSNqcABpV"
73
  },
74
- "execution_count": 4,
75
  "outputs": []
76
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  {
78
  "cell_type": "code",
79
  "source": [
@@ -86,7 +123,7 @@
86
  "id": "3ImRCP7pACaI",
87
  "outputId": "9a63bdea-54f7-4923-ccbb-cab03b312774"
88
  },
89
- "execution_count": 5,
90
  "outputs": [
91
  {
92
  "output_type": "stream",
@@ -110,7 +147,7 @@
110
  {
111
  "cell_type": "markdown",
112
  "source": [
113
- "### Read JSON"
114
  ],
115
  "metadata": {
116
  "id": "bZZLK_wyEc-L"
@@ -121,18 +158,11 @@
121
  "source": [
122
  "import json\n",
123
  "\n",
 
124
  "with open('./mini-dataset.json', 'r') as file:\n",
125
- " data = json.load(file)"
126
- ],
127
- "metadata": {
128
- "id": "PBk0zgq6ACXA"
129
- },
130
- "execution_count": 15,
131
- "outputs": []
132
- },
133
- {
134
- "cell_type": "code",
135
- "source": [
136
  "len( data['chunks'] )"
137
  ],
138
  "metadata": {
@@ -142,7 +172,7 @@
142
  "id": "miUqycqAEfr7",
143
  "outputId": "10005d5f-15c0-4565-a58a-6cb7e466acb4"
144
  },
145
- "execution_count": 16,
146
  "outputs": [
147
  {
148
  "output_type": "execute_result",
@@ -159,18 +189,19 @@
159
  {
160
  "cell_type": "code",
161
  "source": [
 
162
  "texts = [item['text'] for item in data['chunks']]"
163
  ],
164
  "metadata": {
165
  "id": "Mq5WKj0QEfpk"
166
  },
167
- "execution_count": 18,
168
  "outputs": []
169
  },
170
  {
171
  "cell_type": "markdown",
172
  "source": [
173
- "### Apply Embedding"
174
  ],
175
  "metadata": {
176
  "id": "f86yksB9K571"
@@ -181,12 +212,13 @@
181
  "source": [
182
  "from llama_index import Document\n",
183
  "\n",
 
184
  "documents = [Document(text=t) for t in texts]"
185
  ],
186
  "metadata": {
187
  "id": "iXrr5-tnEfm9"
188
  },
189
- "execution_count": 24,
190
  "outputs": []
191
  },
192
  {
@@ -194,13 +226,13 @@
194
  "source": [
195
  "from llama_index import VectorStoreIndex\n",
196
  "\n",
197
- "# build index / generate embeddings using OpenAI\n",
198
  "index = VectorStoreIndex.from_documents(documents)"
199
  ],
200
  "metadata": {
201
  "id": "qQit27lBEfkV"
202
  },
203
- "execution_count": 25,
204
  "outputs": []
205
  },
206
  {
@@ -212,13 +244,13 @@
212
  "metadata": {
213
  "id": "xxB0A9ZYM-OD"
214
  },
215
- "execution_count": 29,
216
  "outputs": []
217
  },
218
  {
219
  "cell_type": "markdown",
220
  "source": [
221
- "### Query Dataset"
222
  ],
223
  "metadata": {
224
  "id": "3DoUxd8KK--Q"
@@ -227,12 +259,14 @@
227
  {
228
  "cell_type": "code",
229
  "source": [
 
 
230
  "query_engine = index.as_query_engine()"
231
  ],
232
  "metadata": {
233
  "id": "bUaNH97dEfh9"
234
  },
235
- "execution_count": 27,
236
  "outputs": []
237
  },
238
  {
@@ -250,7 +284,7 @@
250
  "id": "tEgFx_aeFS5e",
251
  "outputId": "9133bd0c-f0c5-4124-9c4b-ab6c4c32b07a"
252
  },
253
- "execution_count": 28,
254
  "outputs": [
255
  {
256
  "output_type": "stream",
 
4
  "metadata": {
5
  "colab": {
6
  "provenance": [],
7
+ "authorship_tag": "ABX9TyMcuy0u2XnwzWnARu0WjaRq",
8
  "include_colab_link": true
9
  },
10
  "kernelspec": {
 
26
  "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/03-RAG_with_LlamaIndex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
27
  ]
28
  },
29
+ {
30
+ "cell_type": "markdown",
31
+ "source": [
32
+ "# Install Packages and Setup Variables"
33
+ ],
34
+ "metadata": {
35
+ "id": "v9bpz99INAc1"
36
+ }
37
+ },
38
  {
39
  "cell_type": "code",
40
+ "execution_count": null,
41
  "metadata": {
42
  "colab": {
43
  "base_uri": "https://localhost:8080/"
 
75
  "source": [
76
  "import os\n",
77
  "\n",
78
+ "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
79
  "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
80
  ],
81
  "metadata": {
82
  "id": "XuzgSNqcABpV"
83
  },
84
+ "execution_count": null,
85
  "outputs": []
86
  },
87
+ {
88
+ "cell_type": "markdown",
89
+ "source": [
90
+ "# Load Dataset"
91
+ ],
92
+ "metadata": {
93
+ "id": "f5eV5EnvNCMM"
94
+ }
95
+ },
96
+ {
97
+ "cell_type": "markdown",
98
+ "source": [
99
+ "## Download"
100
+ ],
101
+ "metadata": {
102
+ "id": "q-7mRQ-mNJlm"
103
+ }
104
+ },
105
+ {
106
+ "cell_type": "markdown",
107
+ "source": [
108
+ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model."
109
+ ],
110
+ "metadata": {
111
+ "id": "3PsdOdMUNmEi"
112
+ }
113
+ },
114
  {
115
  "cell_type": "code",
116
  "source": [
 
123
  "id": "3ImRCP7pACaI",
124
  "outputId": "9a63bdea-54f7-4923-ccbb-cab03b312774"
125
  },
126
+ "execution_count": null,
127
  "outputs": [
128
  {
129
  "output_type": "stream",
 
147
  {
148
  "cell_type": "markdown",
149
  "source": [
150
+ "## Read File"
151
  ],
152
  "metadata": {
153
  "id": "bZZLK_wyEc-L"
 
158
  "source": [
159
  "import json\n",
160
  "\n",
161
+ "# Load the file as a JSON\n",
162
  "with open('./mini-dataset.json', 'r') as file:\n",
163
+ " data = json.load(file)\n",
164
+ "\n",
165
+ "# The number of chunks in the dataset.\n",
 
 
 
 
 
 
 
 
166
  "len( data['chunks'] )"
167
  ],
168
  "metadata": {
 
172
  "id": "miUqycqAEfr7",
173
  "outputId": "10005d5f-15c0-4565-a58a-6cb7e466acb4"
174
  },
175
+ "execution_count": null,
176
  "outputs": [
177
  {
178
  "output_type": "execute_result",
 
189
  {
190
  "cell_type": "code",
191
  "source": [
192
+ "# Flatten the JSON variable to a list of texts.\n",
193
  "texts = [item['text'] for item in data['chunks']]"
194
  ],
195
  "metadata": {
196
  "id": "Mq5WKj0QEfpk"
197
  },
198
+ "execution_count": null,
199
  "outputs": []
200
  },
201
  {
202
  "cell_type": "markdown",
203
  "source": [
204
+ "# Generate Embedding"
205
  ],
206
  "metadata": {
207
  "id": "f86yksB9K571"
 
212
  "source": [
213
  "from llama_index import Document\n",
214
  "\n",
215
+ "# Convert the texts to Document objects so the LlamaIndex framework can process them.\n",
216
  "documents = [Document(text=t) for t in texts]"
217
  ],
218
  "metadata": {
219
  "id": "iXrr5-tnEfm9"
220
  },
221
+ "execution_count": null,
222
  "outputs": []
223
  },
224
  {
 
226
  "source": [
227
  "from llama_index import VectorStoreIndex\n",
228
  "\n",
229
+ "# Build index / generate embeddings using OpenAI.\n",
230
  "index = VectorStoreIndex.from_documents(documents)"
231
  ],
232
  "metadata": {
233
  "id": "qQit27lBEfkV"
234
  },
235
+ "execution_count": null,
236
  "outputs": []
237
  },
238
  {
 
244
  "metadata": {
245
  "id": "xxB0A9ZYM-OD"
246
  },
247
+ "execution_count": null,
248
  "outputs": []
249
  },
250
  {
251
  "cell_type": "markdown",
252
  "source": [
253
+ "# Query Dataset"
254
  ],
255
  "metadata": {
256
  "id": "3DoUxd8KK--Q"
 
259
  {
260
  "cell_type": "code",
261
  "source": [
262
+ "# Define a query engine that is responsible for retrieving related pieces of text,\n",
263
+ "# and using a LLM to formulate the final answer.\n",
264
  "query_engine = index.as_query_engine()"
265
  ],
266
  "metadata": {
267
  "id": "bUaNH97dEfh9"
268
  },
269
+ "execution_count": null,
270
  "outputs": []
271
  },
272
  {
 
284
  "id": "tEgFx_aeFS5e",
285
  "outputId": "9133bd0c-f0c5-4124-9c4b-ab6c4c32b07a"
286
  },
287
+ "execution_count": null,
288
  "outputs": [
289
  {
290
  "output_type": "stream",