Commit 649fdfc
derek-thomas (HF staff) committed
1 Parent(s): 6f49ee6

Adding updated notebooks

notebooks/01_get_data.ipynb CHANGED
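Note: the final hunk below calls `folder_to_json(folder, file_out)`, which lives in `src/preprocessing/consolidate.py` and is not part of this diff. As a rough sketch only, assuming wikiextractor's usual sharded `wiki_XX` JSONL output with a `text` field per article, such a helper might look like this:

```python
import json
from pathlib import Path

def folder_to_json(folder: Path, file_out: Path) -> None:
    """Merge every wikiextractor shard (AA/wiki_00, AA/wiki_01, ...) into one JSON list.
    Hypothetical reconstruction; the real consolidate.py may differ."""
    articles = []
    for shard in sorted(folder.glob("**/wiki_*")):
        with open(shard, encoding="utf-8") as f:
            for line in f:                      # each line is one JSON article
                article = json.loads(line)
                if article.get("text"):         # skip empty redirect stubs
                    articles.append(article)
    file_out.parent.mkdir(parents=True, exist_ok=True)
    with open(file_out, "w", encoding="utf-8") as f:
        json.dump(articles, f)
```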
@@ -19,7 +19,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
  "id": "ea8ae64c-f597-4c94-b93d-1b78060d7953",
  "metadata": {
  "tags": []
@@ -32,15 +32,15 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
  "id": "2f9527f9-4756-478b-99ac-a3c8c26ab63e",
  "metadata": {
  "tags": []
  },
  "outputs": [],
  "source": [
- "proj_dir = str(Path.cwd().parent)\n",
- "proj_dir\n",
+ "proj_dir_path = Path.cwd().parent\n",
+ "proj_dir = str(proj_dir_path)\n",
  "\n",
  "# So we can import later\n",
  "sys.path.append(proj_dir)"
@@ -56,12 +56,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
  "id": "8bec29e3-8434-491f-914c-13f303dc68f3",
  "metadata": {
  "tags": []
  },
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
  "source": [
  "%pip install -q -r \"$proj_dir\"/requirements.txt"
  ]
@@ -84,12 +92,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
  "id": "fe4b357f-88fe-44b5-9fce-354404b1447f",
  "metadata": {
  "tags": []
  },
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Last-Modified: Sun, 01 Oct 2023 23:32:27 GMT\n"
+ ]
+ }
+ ],
  "source": [
  "!curl -I https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2 --silent | grep \"Last-Modified\""
  ]
@@ -104,14 +120,32 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
  "id": "0f309c12-12de-4460-a03f-bd5b6fcc942c",
  "metadata": {
  "tags": []
  },
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2023-10-18 10:55:38-- https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2\n",
+ "Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142\n",
+ "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 286759308 (273M) [application/octet-stream]\n",
+ "Saving to: ‘/home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2’\n",
+ "\n",
+ "100%[======================================>] 286,759,308 4.22MB/s in 66s \n",
+ "\n",
+ "2023-10-18 10:56:45 (4.13 MB/s) - ‘/home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2’ saved [286759308/286759308]\n",
+ "\n"
+ ]
+ }
+ ],
  "source": [
- "!wget -P \"$proj_dir\"/data/raw https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2"
+ "!wget -nc -P \"$proj_dir\"/data/raw https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2"
  ]
  },
  {
@@ -119,17 +153,39 @@
  "id": "46af5df6-5785-400a-986c-54a2c98768ea",
  "metadata": {},
  "source": [
- "## Extract XML into jsonl"
+ "## Extract from XML\n",
+ "The download format from wikipedia is in XML. `wikiextractor` will convert this into a jsonl format split into many folders and files."
  ]
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
  "id": "c22dedcd-73b3-4aad-8eb7-1063954967ed",
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO: Preprocessing '/home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2' to collect template definitions: this may take some time.\n",
+ "INFO: Preprocessed 100000 pages\n",
+ "INFO: Preprocessed 200000 pages\n",
+ "INFO: Preprocessed 300000 pages\n",
+ "INFO: Preprocessed 400000 pages\n",
+ "INFO: Loaded 36594 templates in 54.1s\n",
+ "INFO: Starting page extraction from /home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2.\n",
+ "INFO: Using 3 extract processes.\n",
+ "INFO: Extracted 100000 articles (3481.4 art/s)\n",
+ "INFO: Extracted 200000 articles (3764.9 art/s)\n",
+ "INFO: Extracted 300000 articles (4175.8 art/s)\n",
+ "INFO: Finished 3-process extraction of 332024 articles in 86.9s (3822.7 art/s)\n"
+ ]
+ }
+ ],
  "source": [
- "!wikiextractor -o \"$proj_dir\"/data/raw/output --json simplewiki-latest-pages-articles-multistream.xml.bz2 "
+ "!wikiextractor -o \"$proj_dir\"/data/raw/output --json \"$proj_dir\"/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2 "
  ]
  },
  {
@@ -139,12 +195,14 @@
  "source": [
  "## Consolidate into json\n",
  "\n",
- "Some of this is boring, and most people dont care how you format it, just that its correct. Feel free to check out the consolidate file for more details."
+ "The split format is tedious to deal with, so now we we will consolidate this into 1 json file. This is fine since our data fits easily in RAM. But if it didnt, there are better options.\n",
+ "\n",
+ "Feel free to check out the [consolidate file](../src/preprocessing/consolidate.py) for more details."
  ]
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
  "id": "0a4ce3aa-9c1e-45e4-8219-a1714f482371",
  "metadata": {
  "tags": []
@@ -156,13 +214,39 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
  "id": "3e93da6a-e304-450c-a81e-ffecaf0d8a9a",
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3f045c61ef544f34a1d6f7c4236b206c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Processing: 0%| | 0/206 [00:00<?, ?file/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wiki processed in 2.92 seconds!\n",
+ "Writing file!\n",
+ "File written in 3.08 seconds!\n"
+ ]
+ }
+ ],
  "source": [
- "folder = proj_dir / 'data/raw/output'\n",
- "file_out = proj_dir / 'data/consolidated/simple_wiki.json'\n",
+ "folder = proj_dir_path / 'data/raw/output'\n",
+ "file_out = proj_dir_path / 'data/consolidated/simple_wiki.json'\n",
  "folder_to_json(folder, file_out)"
  ]
  }
@@ -183,7 +267,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.10.13"
+ "version": "3.10.9"
  }
  },
  "nbformat": 4,
notebooks/02_preprocessing.ipynb CHANGED
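Note: the `PreProcessor` cell below is truncated in the diff context (only `clean_whitespace` and `clean_header_footer` are visible). A minimal, hedged sketch of a Haystack v1-style setup, assuming wikiextractor-style `text`/`title` fields in the consolidated JSON and the "350 word split" mentioned in notebook 03:

```python
import json
from pathlib import Path

from haystack import Document
from haystack.nodes import PreProcessor

proj_dir = Path.cwd().parent
articles = json.loads((proj_dir / "data/consolidated/simple_wiki.json").read_text())

# Field names ("text", "title") assume wikiextractor's JSON schema.
documents = [Document(content=a["text"], meta={"title": a.get("title", "")}) for a in articles]

pp = PreProcessor(
    clean_whitespace=True,       # visible in the diff
    clean_header_footer=False,   # visible in the diff
    clean_empty_lines=True,      # assumptions below this point
    split_by="word",
    split_length=350,            # the "350 word split" referenced in notebook 03
    split_respect_sentence_boundary=True,
)
processed = pp.process(documents)  # splits each article into smaller chunks
```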
@@ -37,7 +37,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "/Users/derekthomas/projects/spaces/RAGDemo\n"
+ "/home/ec2-user/RAGDemo\n"
  ]
  }
  ],
@@ -56,7 +56,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 13,
  "id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
  "metadata": {
  "tags": []
@@ -64,7 +64,7 @@
  "outputs": [],
  "source": [
  "file_in = proj_dir / 'data/consolidated/simple_wiki.json'\n",
- "file_out = proj_dir / 'data/processed/simple_wiki_processed.plk'"
+ "file_out = proj_dir / 'data/processed/simple_wiki_processed.pkl'"
  ]
  },
  {
@@ -96,7 +96,16 @@
  "metadata": {
  "tags": []
  },
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...\n",
+ "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
+ ]
+ }
+ ],
  "source": [
  "pp = PreProcessor(clean_whitespace = True,\n",
  " clean_header_footer = False,\n",
@@ -139,10 +148,10 @@
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "Preprocessing: 0%|█▏ | 1510/332023 [00:01<04:12, 1308.89docs/s]We found one or more sentences whose word count is higher than the split length.\n",
- "Preprocessing: 83%|████████████████████████████████████████████████████████▍ | 276351/332023 [01:06<00:10, 5510.66docs/s]Document 81972e5bc1997b1ed4fb86d17f061a41 is 21206 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.\n",
+ "Preprocessing: 0%|▌ | 1551/332023 [00:02<09:44, 565.82docs/s]We found one or more sentences whose word count is higher than the split length.\n",
+ "Preprocessing: 83%|████████████████████████████████████████████████████████▌ | 276427/332023 [02:12<00:20, 2652.57docs/s]Document 81972e5bc1997b1ed4fb86d17f061a41 is 21206 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.\n",
  "Document 5e63e848e42966ddc747257fb7cf4092 is 11206 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.\n",
- "Preprocessing: 100%|████████████████████████████████████████████████████████████████| 332023/332023 [01:15<00:00, 4403.36docs/s]\n"
+ "Preprocessing: 100%|████████████████████████████████████████████████████████████████| 332023/332023 [02:29<00:00, 2219.16docs/s]\n"
  ]
  }
  ],
@@ -175,7 +184,7 @@
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "dbdb8eadba804b4485cc3e7f11a8b863",
+ "model_id": "ba764e7bf29f4202a74e08576a29f4e4",
  "version_major": 2,
  "version_minor": 0
  },
@@ -206,7 +215,9 @@
  "cell_type": "code",
  "execution_count": 8,
  "id": "02c1c6c8-6283-49a8-9d29-c355f1b08540",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
  "outputs": [
  {
  "data": {
@@ -271,7 +282,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 11,
  "id": "5485cc27-3d3f-4b96-8884-accf5324da2d",
  "metadata": {
  "tags": []
@@ -340,7 +351,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.10.13"
+ "version": "3.10.9"
  }
  },
  "nbformat": 4,
notebooks/03_get_embeddings.ipynb ADDED
@@ -0,0 +1,422 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a0f21cb1-fbc8-4282-b902-f47d92974df8",
+ "metadata": {},
+ "source": [
+ "# Pre-requisites"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3102abce-ea42-4da6-8c98-c6dd4edf7f0b",
+ "metadata": {},
+ "source": [
+ "## Start TEI\n",
+ "Run [TEI](https://github.com/huggingface/text-embeddings-inference#docker), I have this running in a nvidia-docker container, but you can install as you like. \n",
+ "\n",
+ "Note that as its running, its always going to pull the latest. Its at a very early stage at the time of writing. \n",
+ "\n",
+ "I chose the smaller [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) instead of the large. Its just as good on [mteb/leaderboard](https://huggingface.co/spaces/mteb/leaderboard) but its faster and smaller. TEI is fast, but this will make our life easier for storage and retrieval.\n",
+ "\n",
+ "I use the `revision=refs/pr/1` because this has the pull request with [safetensors](https://github.com/huggingface/safetensors) which is required by TEI. Check out the [pull request](https://huggingface.co/BAAI/bge-base-en-v1.5/discussions/1) if you want to use a different embedding model and it doesnt have safetensors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "7e873652-8257-4aae-92bc-94e1bac54b73",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "\n",
+ "# volume=$PWD/data\n",
+ "# model=BAAI/bge-base-en-v1.5\n",
+ "# revision=refs/pr/1\n",
+ "# docker run \\\n",
+ "# --gpus all \\\n",
+ "# -p 8080:80 \\\n",
+ "# -v $volume:/data \\\n",
+ "# --pull always \\\n",
+ "# ghcr.io/huggingface/text-embeddings-inference:latest \\\n",
+ "# --model-id $model \\\n",
+ "# --revision $revision \\\n",
+ "# --pooling cls \\\n",
+ "# --max-batch-tokens 65536"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86a5ff83-1038-4880-8c90-dc3cab75cb49",
+ "metadata": {},
+ "source": [
+ "## Test Endpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "52edfc97-5b6f-44f9-8d89-8578cf79fae9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%bash\n",
+ "\n",
+ "response_code=$(curl -s -o /dev/null -w \"%{http_code}\" 127.0.0.1:8080/embed \\\n",
+ " -X POST \\\n",
+ " -d '{\"inputs\":\"What is Deep Learning?\"}' \\\n",
+ " -H 'Content-Type: application/json')\n",
+ "\n",
+ "if [ \"$response_code\" -eq 200 ]; then\n",
+ " echo \"passed\"\n",
+ "else\n",
+ " echo \"failed\"\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1b28232-b65d-41ce-88de-fd70b93a528d",
+ "metadata": {},
+ "source": [
+ "# Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "88408486-566a-4791-8ef2-5ee3e6941156",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from IPython.core.interactiveshell import InteractiveShell\n",
+ "InteractiveShell.ast_node_interactivity = 'all'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import asyncio\n",
+ "from pathlib import Path\n",
+ "import pickle\n",
+ "\n",
+ "import aiohttp\n",
+ "from tqdm.notebook import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/ec2-user/RAGDemo\n"
+ ]
+ }
+ ],
+ "source": [
+ "proj_dir = Path.cwd().parent\n",
+ "print(proj_dir)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76119e74-f601-436d-a253-63c5a19d1c83",
+ "metadata": {},
+ "source": [
+ "# Config"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0d2bcda7-b245-45e3-a347-34166f217e1e",
+ "metadata": {},
+ "source": [
+ "I'm putting the documents in pickle files. The compression is nice, though its important to note pickles are known to be a security risk."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "file_in = proj_dir / 'data/processed/simple_wiki_processed.pkl'\n",
+ "file_out = proj_dir / 'data/processed/simple_wiki_embeddings.pkl'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d2dd0df0-4274-45b3-9ee5-0205494e4d75",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Setup\n",
+ "Read in our list of dictionaries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "3c08e039-3686-4eca-9f87-7c469e3f19bc",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 6.24 s, sys: 928 ms, total: 7.17 s\n",
+ "Wall time: 6.61 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "with open(file_in, 'rb') as handle:\n",
+ " documents = pickle.load(handle)\n",
+ "\n",
+ "documents = [document.to_dict() for document in documents]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e73235d-6274-4958-9e57-977afeeb5f1b",
+ "metadata": {},
+ "source": [
+ "# Embed\n",
+ "## Strategy\n",
+ "TEI allows multiple concurrent requests, so its important that we dont waste the potential we have. I used the default `max-concurrent-requests` value of `512`, so I want to use that many `MAX_WORKERS`.\n",
+ "\n",
+ "Im using an `async` way of making requests that uses `aiohttp` as well as a nice progress bar. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "949d6bf8-804f-496b-a59a-834483cc7073",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Constants\n",
+ "ENDPOINT = \"http://127.0.0.1:8080/embed\"\n",
+ "HEADERS = {'Content-Type': 'application/json'}\n",
+ "MAX_WORKERS = 512"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cf3da8cc-1651-4704-9091-39c2a1b835be",
+ "metadata": {},
+ "source": [
+ "Note that Im using `'truncate':True` as even with our `350` word split earlier, there are always exceptions. Its important that as this scales we have as few issues as possible when embedding. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "3353c849-a36c-4047-bb81-93dac6c49b68",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "async def fetch(session, url, document):\n",
+ " payload = {\"inputs\": [document[\"content\"]], 'truncate':True}\n",
+ " async with session.post(url, json=payload) as response:\n",
+ " if response.status == 200:\n",
+ " resp_json = await response.json()\n",
+ " # Assuming the server's response contains an 'embedding' field\n",
+ " document[\"embedding\"] = resp_json[0]\n",
+ " else:\n",
+ " print(f\"Error {response.status}: {await response.text()}\")\n",
+ " # Handle error appropriately if needed\n",
+ "\n",
+ "async def main(documents):\n",
+ " async with aiohttp.ClientSession(headers=HEADERS) as session:\n",
+ " tasks = [fetch(session, ENDPOINT, doc) for doc in documents]\n",
+ " await asyncio.gather(*tasks)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "f0d17264-72dc-40be-aa46-17cde38c8189",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f0ff772e915f4432971317e2150b60f2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Processing documents: 0%| | 0/526 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Create a list of async tasks\n",
+ "tasks = [main(documents[i:i+MAX_WORKERS]) for i in range(0, len(documents), MAX_WORKERS)]\n",
+ "\n",
+ "# Add a progress bar for visual feedback and run tasks\n",
+ "for task in tqdm(tasks, desc=\"Processing documents\"):\n",
+ " await task"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f90a0ed7-b5e9-4ae4-9e87-4c04875ebcc9",
+ "metadata": {},
+ "source": [
+ "Lets double check that we got all the embeddings we expected!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "3950fa88-9961-4b33-9719-d5804509d4cf",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "268980"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "268980"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "count = 0\n",
+ "for document in documents:\n",
+ " if len(document['embedding']) == 768:\n",
+ " count += 1\n",
+ "count\n",
+ "len(documents)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5b78bfa4-d365-4906-a71c-f444eabf6bf8",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Great, we can see that they match.\n",
+ "\n",
+ "Let's write our embeddings to file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "58d437a5-473f-4eae-9dbf-e8e6992754f6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 5.68 s, sys: 640 ms, total: 6.32 s\n",
+ "Wall time: 14.1 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "with open(file_out, 'wb') as handle:\n",
+ " pickle.dump(documents, handle, protocol=pickle.HIGHEST_PROTOCOL)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc1e7cc5-b878-42bb-9fb4-e810f3f5006a",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Next Steps\n",
+ "We need to import this into a vector db. "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
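Note: the new notebook sanity-checks the TEI container with curl and then embeds every chunk through `aiohttp`. For a quick one-off check outside the notebook, a minimal sketch using the same local endpoint and payload shape (assuming TEI is running as configured above) could look like this:

```python
import requests

# Single request against the local TEI container started in the notebook.
resp = requests.post(
    "http://127.0.0.1:8080/embed",
    json={"inputs": ["What is Deep Learning?"], "truncate": True},
    headers={"Content-Type": "application/json"},
    timeout=30,
)
resp.raise_for_status()
embedding = resp.json()[0]   # one embedding per input string
print(len(embedding))        # 768 for BAAI/bge-base-en-v1.5, matching the check above
```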