derek-thomas (HF staff) committed
Commit 6115cdf • 1 Parent(s): 70bad37

Successful run!

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ lancedb/**/* filter=lfs diff=lfs merge=lfs -text
notebooks/01_get_data.ipynb CHANGED
@@ -32,7 +32,7 @@
32
  },
33
  {
34
  "cell_type": "code",
35
- "execution_count": 16,
36
  "id": "2f9527f9-4756-478b-99ac-a3c8c26ab63e",
37
  "metadata": {
38
  "tags": []
@@ -102,7 +102,7 @@
102
  "name": "stdout",
103
  "output_type": "stream",
104
  "text": [
105
- "Last-Modified: Sun, 01 Oct 2023 23:32:27 GMT\n"
106
  ]
107
  }
108
  ],
@@ -130,16 +130,17 @@
130
  "name": "stdout",
131
  "output_type": "stream",
132
  "text": [
133
- "--2023-10-18 10:55:38-- https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2\n",
134
  "Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142\n",
135
  "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.\n",
136
  "HTTP request sent, awaiting response... 200 OK\n",
137
- "Length: 286759308 (273M) [application/octet-stream]\n",
138
- "Saving to: β€˜/home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2’\n",
139
  "\n",
140
- "100%[======================================>] 286,759,308 4.22MB/s in 66s \n",
141
  "\n",
142
- "2023-10-18 10:56:45 (4.13 MB/s) - β€˜/home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2’ saved [286759308/286759308]\n"
 
143
  ]
144
  }
145
  ],
@@ -158,7 +159,7 @@
158
  },
159
  {
160
  "cell_type": "code",
161
- "execution_count": 9,
162
  "id": "c22dedcd-73b3-4aad-8eb7-1063954967ed",
163
  "metadata": {
164
  "tags": []
@@ -168,18 +169,68 @@
168
  "name": "stdout",
169
  "output_type": "stream",
170
  "text": [
171
- "INFO: Preprocessing '/home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2' to collect template definitions: this may take some time.\n",
172
  "INFO: Preprocessed 100000 pages\n",
173
  "INFO: Preprocessed 200000 pages\n",
174
  "INFO: Preprocessed 300000 pages\n",
175
  "INFO: Preprocessed 400000 pages\n",
176
- "INFO: Loaded 36594 templates in 54.1s\n",
177
- "INFO: Starting page extraction from /home/ec2-user/RAGDemo/data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2.\n",
178
  "INFO: Using 3 extract processes.\n",
179
- "INFO: Extracted 100000 articles (3481.4 art/s)\n",
180
- "INFO: Extracted 200000 articles (3764.9 art/s)\n",
181
- "INFO: Extracted 300000 articles (4175.8 art/s)\n",
182
- "INFO: Finished 3-process extraction of 332024 articles in 86.9s (3822.7 art/s)\n"
183
  ]
184
  }
185
  ],
@@ -201,7 +252,7 @@
201
  },
202
  {
203
  "cell_type": "code",
204
- "execution_count": 14,
205
  "id": "0a4ce3aa-9c1e-45e4-8219-a1714f482371",
206
  "metadata": {
207
  "tags": []
@@ -213,33 +264,24 @@
213
  },
214
  {
215
  "cell_type": "code",
216
- "execution_count": 17,
217
  "id": "3e93da6a-e304-450c-a81e-ffecaf0d8a9a",
218
  "metadata": {
219
  "tags": []
220
  },
221
  "outputs": [
222
  {
223
- "data": {
224
- "application/vnd.jupyter.widget-view+json": {
225
- "model_id": "3f045c61ef544f34a1d6f7c4236b206c",
226
- "version_major": 2,
227
- "version_minor": 0
228
- },
229
- "text/plain": [
230
- "Processing: 0%| | 0/206 [00:00<?, ?file/s]"
231
- ]
232
- },
233
- "metadata": {},
234
- "output_type": "display_data"
235
  },
236
  {
237
  "name": "stdout",
238
  "output_type": "stream",
239
  "text": [
240
- "Wiki processed in 2.92 seconds!\n",
241
- "Writing file!\n",
242
- "File written in 3.08 seconds!\n"
243
  ]
244
  }
245
  ],
@@ -248,6 +290,14 @@
248
  "folder_out = proj_dir_path / 'data/consolidated/'\n",
249
  "folder_to_json(folder, folder_out, 'ar_wiki')"
250
  ]
251
  }
252
  ],
253
  "metadata": {
@@ -266,7 +316,7 @@
266
  "name": "python",
267
  "nbconvert_exporter": "python",
268
  "pygments_lexer": "ipython3",
269
- "version": "3.10.9"
270
  }
271
  },
272
  "nbformat": 4,
 
32
  },
33
  {
34
  "cell_type": "code",
35
+ "execution_count": 2,
36
  "id": "2f9527f9-4756-478b-99ac-a3c8c26ab63e",
37
  "metadata": {
38
  "tags": []
 
102
  "name": "stdout",
103
  "output_type": "stream",
104
  "text": [
105
+ "Last-Modified: Sat, 21 Oct 2023 02:57:42 GMT\n"
106
  ]
107
  }
108
  ],
 
130
  "name": "stdout",
131
  "output_type": "stream",
132
  "text": [
133
+ "--2023-10-28 08:09:45-- https://dumps.wikimedia.org/arwiki/latest/arwiki-latest-pages-articles-multistream.xml.bz2\n",
134
  "Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142\n",
135
  "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.\n",
136
  "HTTP request sent, awaiting response... 200 OK\n",
137
+ "Length: 1671369109 (1.6G) [application/octet-stream]\n",
138
+ "Saving to: β€˜/home/ec2-user/arabic-wiki/data/raw/arwiki-latest-pages-articles-multistream.xml.bz2’\n",
139
  "\n",
140
+ "100%[====================================>] 1,671,369,109 4.54MB/s in 5m 54s \n",
141
  "\n",
142
+ "2023-10-28 08:15:39 (4.51 MB/s) - β€˜/home/ec2-user/arabic-wiki/data/raw/arwiki-latest-pages-articles-multistream.xml.bz2’ saved [1671369109/1671369109]\n",
143
+ "\n"
144
  ]
145
  }
146
  ],
 
159
  },
160
  {
161
  "cell_type": "code",
162
+ "execution_count": 6,
163
  "id": "c22dedcd-73b3-4aad-8eb7-1063954967ed",
164
  "metadata": {
165
  "tags": []
 
169
  "name": "stdout",
170
  "output_type": "stream",
171
  "text": [
172
+ "INFO: Preprocessing '/home/ec2-user/arabic-wiki/data/raw/arwiki-latest-pages-articles-multistream.xml.bz2' to collect template definitions: this may take some time.\n",
173
  "INFO: Preprocessed 100000 pages\n",
174
  "INFO: Preprocessed 200000 pages\n",
175
  "INFO: Preprocessed 300000 pages\n",
176
  "INFO: Preprocessed 400000 pages\n",
177
+ "INFO: Preprocessed 500000 pages\n",
178
+ "INFO: Preprocessed 600000 pages\n",
179
+ "INFO: Preprocessed 700000 pages\n",
180
+ "INFO: Preprocessed 800000 pages\n",
181
+ "INFO: Preprocessed 900000 pages\n",
182
+ "INFO: Preprocessed 1000000 pages\n",
183
+ "INFO: Preprocessed 1100000 pages\n",
184
+ "INFO: Preprocessed 1200000 pages\n",
185
+ "INFO: Preprocessed 1300000 pages\n",
186
+ "INFO: Preprocessed 1400000 pages\n",
187
+ "INFO: Preprocessed 1500000 pages\n",
188
+ "INFO: Preprocessed 1600000 pages\n",
189
+ "INFO: Preprocessed 1700000 pages\n",
190
+ "INFO: Preprocessed 1800000 pages\n",
191
+ "INFO: Preprocessed 1900000 pages\n",
192
+ "INFO: Preprocessed 2000000 pages\n",
193
+ "INFO: Preprocessed 2100000 pages\n",
194
+ "INFO: Preprocessed 2200000 pages\n",
195
+ "INFO: Preprocessed 2300000 pages\n",
196
+ "INFO: Preprocessed 2400000 pages\n",
197
+ "INFO: Preprocessed 2500000 pages\n",
198
+ "INFO: Preprocessed 2600000 pages\n",
199
+ "INFO: Preprocessed 2700000 pages\n",
200
+ "INFO: Preprocessed 2800000 pages\n",
201
+ "INFO: Preprocessed 2900000 pages\n",
202
+ "INFO: Preprocessed 3000000 pages\n",
203
+ "INFO: Preprocessed 3100000 pages\n",
204
+ "INFO: Preprocessed 3200000 pages\n",
205
+ "INFO: Preprocessed 3300000 pages\n",
206
+ "INFO: Preprocessed 3400000 pages\n",
207
+ "INFO: Preprocessed 3500000 pages\n",
208
+ "INFO: Loaded 130917 templates in 407.5s\n",
209
+ "INFO: Starting page extraction from /home/ec2-user/arabic-wiki/data/raw/arwiki-latest-pages-articles-multistream.xml.bz2.\n",
210
  "INFO: Using 3 extract processes.\n",
211
+ "INFO: Extracted 100000 articles (1963.4 art/s)\n",
212
+ "INFO: Extracted 200000 articles (3371.1 art/s)\n",
213
+ "INFO: Extracted 300000 articles (3008.9 art/s)\n",
214
+ "INFO: Extracted 400000 articles (2935.2 art/s)\n",
215
+ "INFO: Extracted 500000 articles (3563.3 art/s)\n",
216
+ "INFO: Extracted 600000 articles (4209.0 art/s)\n",
217
+ "INFO: Extracted 700000 articles (6075.1 art/s)\n",
218
+ "INFO: Extracted 800000 articles (3531.2 art/s)\n",
219
+ "INFO: Extracted 900000 articles (3466.4 art/s)\n",
220
+ "INFO: Extracted 1000000 articles (3789.2 art/s)\n",
221
+ "INFO: Extracted 1100000 articles (2282.6 art/s)\n",
222
+ "INFO: Extracted 1200000 articles (4499.8 art/s)\n",
223
+ "INFO: Extracted 1300000 articles (5143.6 art/s)\n",
224
+ "INFO: Extracted 1400000 articles (5474.2 art/s)\n",
225
+ "INFO: Extracted 1500000 articles (6086.9 art/s)\n",
226
+ "INFO: Extracted 1600000 articles (5453.4 art/s)\n",
227
+ "INFO: Extracted 1700000 articles (5911.6 art/s)\n",
228
+ "INFO: Extracted 1800000 articles (5087.4 art/s)\n",
229
+ "INFO: Extracted 1900000 articles (3782.4 art/s)\n",
230
+ "INFO: Extracted 2000000 articles (2493.9 art/s)\n",
231
+ "INFO: Extracted 2100000 articles (2742.2 art/s)\n",
232
+ "INFO: Extracted 2200000 articles (2416.5 art/s)\n",
233
+ "INFO: Finished 3-process extraction of 2254650 articles in 641.2s (3516.3 art/s)\n"
234
  ]
235
  }
236
  ],
 
252
  },
253
  {
254
  "cell_type": "code",
255
+ "execution_count": 9,
256
  "id": "0a4ce3aa-9c1e-45e4-8219-a1714f482371",
257
  "metadata": {
258
  "tags": []
 
264
  },
265
  {
266
  "cell_type": "code",
267
+ "execution_count": 10,
268
  "id": "3e93da6a-e304-450c-a81e-ffecaf0d8a9a",
269
  "metadata": {
270
  "tags": []
271
  },
272
  "outputs": [
273
  {
274
+ "name": "stderr",
275
+ "output_type": "stream",
276
+ "text": [
277
+ "Processing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6119/6119 [01:11<00:00, 85.38file/s, File: wiki_18 | Dir: /home/ec2-user/arabic-wiki/data/raw/output/CJ]\n"
278
+ ]
279
  },
280
  {
281
  "name": "stdout",
282
  "output_type": "stream",
283
  "text": [
284
+ "Wiki processed in 72.87 seconds!\n"
285
  ]
286
  }
287
  ],
 
290
  "folder_out = proj_dir_path / 'data/consolidated/'\n",
291
  "folder_to_json(folder, folder_out, 'ar_wiki')"
292
  ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "id": "553039d0-6315-40b8-b8f5-c6672598a5f3",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": []
301
  }
302
  ],
303
  "metadata": {
 
316
  "name": "python",
317
  "nbconvert_exporter": "python",
318
  "pygments_lexer": "ipython3",
319
+ "version": "3.10.13"
320
  }
321
  },
322
  "nbformat": 4,
notebooks/02_preprocessing.ipynb DELETED
@@ -1,359 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "b1b28232-b65d-41ce-88de-fd70b93a528d",
6
- "metadata": {},
7
- "source": [
8
- "# Imports"
9
- ]
10
- },
11
- {
12
- "cell_type": "code",
13
- "execution_count": 1,
14
- "id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
15
- "metadata": {
16
- "tags": []
17
- },
18
- "outputs": [],
19
- "source": [
20
- "import json\n",
21
- "from pathlib import Path\n",
22
- "import pickle\n",
23
- "from tqdm.auto import tqdm\n",
24
- "\n",
25
- "from haystack.nodes.preprocessor import PreProcessor"
26
- ]
27
- },
28
- {
29
- "cell_type": "code",
30
- "execution_count": 2,
31
- "id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
32
- "metadata": {
33
- "tags": []
34
- },
35
- "outputs": [
36
- {
37
- "name": "stdout",
38
- "output_type": "stream",
39
- "text": [
40
- "/home/ec2-user/RAGDemo\n"
41
- ]
42
- }
43
- ],
44
- "source": [
45
- "proj_dir = Path.cwd().parent\n",
46
- "print(proj_dir)"
47
- ]
48
- },
49
- {
50
- "cell_type": "markdown",
51
- "id": "76119e74-f601-436d-a253-63c5a19d1c83",
52
- "metadata": {},
53
- "source": [
54
- "# Config"
55
- ]
56
- },
57
- {
58
- "cell_type": "code",
59
- "execution_count": 13,
60
- "id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
61
- "metadata": {
62
- "tags": []
63
- },
64
- "outputs": [],
65
- "source": [
66
- "files_in = list((proj_dir / 'data/consolidated').glob('*.ndjson'))\n",
67
- "folder_out = proj_dir / 'data/processed'"
68
- ]
69
- },
70
- {
71
- "cell_type": "markdown",
72
- "id": "6a643cf2-abce-48a9-b4e0-478bcbee28c3",
73
- "metadata": {},
74
- "source": [
75
- "# Preprocessing"
76
- ]
77
- },
78
- {
79
- "cell_type": "markdown",
80
- "id": "a8f9630e-447e-423e-9f6c-e1dbc654f2dd",
81
- "metadata": {},
82
- "source": [
83
- "Its important to choose good pre-processing options. \n",
84
- "\n",
85
- "Clean whitespace helps each stage of RAG. It adds noise to the embeddings, and wastes space when we prompt with it.\n",
86
- "\n",
87
- "I chose to split by word as it would be tedious to tokenize here, and that doesnt scale well. The context length for most embedding models ends up being 512 tokens. This is ~400 words. \n",
88
- "\n",
89
- "I like to respect the sentence boundary, thats why I gave a ~50 word buffer."
90
- ]
91
- },
92
- {
93
- "cell_type": "code",
94
- "execution_count": 4,
95
- "id": "18807aea-24e4-4d74-bf10-55b24f3cb52c",
96
- "metadata": {
97
- "tags": []
98
- },
99
- "outputs": [
100
- {
101
- "name": "stderr",
102
- "output_type": "stream",
103
- "text": [
104
- "[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...\n",
105
- "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
106
- ]
107
- }
108
- ],
109
- "source": [
110
- "pp = PreProcessor(clean_whitespace = True,\n",
111
- " clean_header_footer = False,\n",
112
- " clean_empty_lines = True,\n",
113
- " remove_substrings = None,\n",
114
- " split_by='word',\n",
115
- " split_length = 350,\n",
116
- " split_overlap = 50,\n",
117
- " split_respect_sentence_boundary = True,\n",
118
- " tokenizer_model_folder = None,\n",
119
- " language = \"en\",\n",
120
- " id_hash_keys = None,\n",
121
- " progress_bar = True,\n",
122
- " add_page_number = False,\n",
123
- " max_chars_check = 10_000)"
124
- ]
125
- },
126
- {
127
- "cell_type": "code",
128
- "execution_count": 5,
129
- "id": "dab1658a-79a7-40f2-9a8c-1798e0d124bf",
130
- "metadata": {
131
- "tags": []
132
- },
133
- "outputs": [],
134
- "source": [
135
- "with open(file_in, 'r', encoding='utf-8') as f:\n",
136
- " list_of_articles = json.load(f)"
137
- ]
138
- },
139
- {
140
- "cell_type": "code",
141
- "execution_count": 6,
142
- "id": "4ca6e576-4b7d-4c1a-916f-41d1b82be647",
143
- "metadata": {
144
- "tags": []
145
- },
146
- "outputs": [
147
- {
148
- "name": "stderr",
149
- "output_type": "stream",
150
- "text": [
151
- "Preprocessing: 0%|β–Œ | 1551/332023 [00:02<09:44, 565.82docs/s]We found one or more sentences whose word count is higher than the split length.\n",
152
- "Preprocessing: 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 276427/332023 [02:12<00:20, 2652.57docs/s]Document 81972e5bc1997b1ed4fb86d17f061a41 is 21206 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.\n",
153
- "Document 5e63e848e42966ddc747257fb7cf4092 is 11206 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.\n",
154
- "Preprocessing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 332023/332023 [02:29<00:00, 2219.16docs/s]\n"
155
- ]
156
- }
157
- ],
158
- "source": [
159
- "documents = pp.process(list_of_articles)"
160
- ]
161
- },
162
- {
163
- "cell_type": "markdown",
164
- "id": "f00dbdb2-906f-4d5a-a3f1-b0d84385d85a",
165
- "metadata": {},
166
- "source": [
167
- "When we break a wikipedia article up, we lose some of the context. The local context is somewhat preserved by the `split_overlap`. Im trying to preserve the global context by adding a prefix that has the article's title.\n",
168
- "\n",
169
- "You could enhance this with the summary as well. This is mostly to help the retrieval step of RAG. Note that the way Im doing it alters some of `haystack`'s features like the hash and the lengths, but those arent too necessary. \n",
170
- "\n",
171
- "A more advanced way for many business applications would be to summarize the document and add that as a prefix for sub-documents.\n",
172
- "\n",
173
- "One last thing to note, is that it would be prudent (in some use-cases) to preserve the original document without the summary to give to the reader (retrieve with the summary but prompt without), but since this is a simple use-case I wont be doing that."
174
- ]
175
- },
176
- {
177
- "cell_type": "code",
178
- "execution_count": 7,
179
- "id": "076e115d-3e88-49d2-bc5d-f725a94e4964",
180
- "metadata": {
181
- "tags": []
182
- },
183
- "outputs": [
184
- {
185
- "data": {
186
- "application/vnd.jupyter.widget-view+json": {
187
- "model_id": "ba764e7bf29f4202a74e08576a29f4e4",
188
- "version_major": 2,
189
- "version_minor": 0
190
- },
191
- "text/plain": [
192
- " 0%| | 0/268980 [00:00<?, ?it/s]"
193
- ]
194
- },
195
- "metadata": {},
196
- "output_type": "display_data"
197
- }
198
- ],
199
- "source": [
200
- "# Prefix each document's content\n",
201
- "for document in tqdm(documents):\n",
202
- " if document.meta['_split_id'] != 0:\n",
203
- " document.content = f'Title: {document.meta[\"title\"]}. ' + document.content"
204
- ]
205
- },
206
- {
207
- "cell_type": "markdown",
208
- "id": "72c1849c-1f4d-411f-b74b-6208b1e48217",
209
- "metadata": {},
210
- "source": [
211
- "## Pre-processing Examples"
212
- ]
213
- },
214
- {
215
- "cell_type": "code",
216
- "execution_count": 8,
217
- "id": "02c1c6c8-6283-49a8-9d29-c355f1b08540",
218
- "metadata": {
219
- "tags": []
220
- },
221
- "outputs": [
222
- {
223
- "data": {
224
- "text/plain": [
225
- "<Document: {'content': \"April (Apr.) is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of the four months to have 30 days.\\nApril always begins on the same day of the week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\\nThe Month.\\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart.\\nIn common years, April starts on the same day of the week as October of the previous year, and in leap years, May of the previous year. In common years, April finishes on the same day of the week as July of the previous year, and in leap years, February and October of the previous year. In common years immediately after other common years, April starts on the same day of the week as January of the previous year, and in leap years and years immediately after that, April finishes on the same day of the week as January of the previous year.\\nIn years immediately before common years, April starts on the same day of the week as September and December of the following year, and in years immediately before leap years, June of the following year. In years immediately before common years, April finishes on the same day of the week as September of the following year, and in years immediately before leap years, March and June of the following year.\\nApril is a spring month in the Northern Hemisphere and an autumn/fall month in the Southern Hemisphere. \", 'content_type': 'text', 'score': None, 'meta': {'id': '1', 'revid': '9086769', 'url': 'https://simple.wikipedia.org/wiki?curid=1', 'title': 'April', '_split_id': 0, '_split_overlap': [{'doc_id': '79a74c1e6444dd0a1acd72840e9dd7c0', 'range': (1529, 1835)}]}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'a1c2acf337dbc3baa6f7f58403dfb95d'}>"
226
- ]
227
- },
228
- "execution_count": 8,
229
- "metadata": {},
230
- "output_type": "execute_result"
231
- }
232
- ],
233
- "source": [
234
- "documents[0]"
235
- ]
236
- },
237
- {
238
- "cell_type": "code",
239
- "execution_count": 9,
240
- "id": "b34890bf-9dba-459a-9b0d-aa4b5929cbe8",
241
- "metadata": {
242
- "tags": []
243
- },
244
- "outputs": [
245
- {
246
- "data": {
247
- "text/plain": [
248
- "<Document: {'content': 'Title: April. In years immediately before common years, April finishes on the same day of the week as September of the following year, and in years immediately before leap years, March and June of the following year.\\nApril is a spring month in the Northern Hemisphere and an autumn/fall month in the Southern Hemisphere. In each hemisphere, it is the seasonal equivalent of October in the other.\\nIt is unclear as to where April got its name. A common theory is that it comes from the Latin word \"aperire\", meaning \"to open\", referring to flowers opening in spring. Another theory is that the name could come from Aphrodite, the Greek goddess of love. It was originally the second month in the old Roman Calendar, before the start of the new year was put to January 1.\\nQuite a few festivals are held in this month. In many Southeast Asian cultures, new year is celebrated in this month (including Songkran). In Western Christianity, Easter can be celebrated on a Sunday between March 22 and April 25. In Orthodox Christianity, it can fall between April 4 and May 8. At the end of the month, Central and Northern European cultures celebrate Walpurgis Night on April 30, marking the transition from winter into summer.\\nApril in poetry.\\nPoets use \"April\" to mean the end of winter. For example: \"April showers bring May flowers.\"', 'content_type': 'text', 'score': None, 'meta': {'id': '1', 'revid': '9086769', 'url': 'https://simple.wikipedia.org/wiki?curid=1', 'title': 'April', '_split_id': 1, '_split_overlap': [{'doc_id': 'a1c2acf337dbc3baa6f7f58403dfb95d', 'range': (0, 306)}]}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '79a74c1e6444dd0a1acd72840e9dd7c0'}>"
249
- ]
250
- },
251
- "execution_count": 9,
252
- "metadata": {},
253
- "output_type": "execute_result"
254
- }
255
- ],
256
- "source": [
257
- "documents[1]"
258
- ]
259
- },
260
- {
261
- "cell_type": "code",
262
- "execution_count": 10,
263
- "id": "e6f50c27-a486-47e9-ba60-d567f5e530db",
264
- "metadata": {
265
- "tags": []
266
- },
267
- "outputs": [
268
- {
269
- "data": {
270
- "text/plain": [
271
- "<Document: {'content': 'Title: Chief Joseph. He knew he could not trust them anymore. He was tired of being considered a savage. He felt it was not fair for people who were born on the same land to be treated differently. He delivered a lot of speeches on this subject, which are still really good examples of eloquence. But he did not feel listened to, and when he died in his reservation in 1904, the doctor said he \"died from sadness\". He was buried in Colville Native American Burial Ground, in Washington State.', 'content_type': 'text', 'score': None, 'meta': {'id': '19310', 'revid': '16695', 'url': 'https://simple.wikipedia.org/wiki?curid=19310', 'title': 'Chief Joseph', '_split_id': 1, '_split_overlap': [{'doc_id': '4bdf9cecd46c3bfac6b225aed940e798', 'range': (0, 275)}]}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '91bc8240c5d067ab24f35c11f8916fc6'}>"
272
- ]
273
- },
274
- "execution_count": 10,
275
- "metadata": {},
276
- "output_type": "execute_result"
277
- }
278
- ],
279
- "source": [
280
- "documents[10102]"
281
- ]
282
- },
283
- {
284
- "cell_type": "code",
285
- "execution_count": 11,
286
- "id": "5485cc27-3d3f-4b96-8884-accf5324da2d",
287
- "metadata": {
288
- "tags": []
289
- },
290
- "outputs": [
291
- {
292
- "name": "stdout",
293
- "output_type": "stream",
294
- "text": [
295
- "Number of Articles: 332023\n",
296
- "Number of processed articles: 237724\n",
297
- "Number of processed documents: 268980\n"
298
- ]
299
- }
300
- ],
301
- "source": [
302
- "print(f'Number of Articles: {len(list_of_articles)}')\n",
303
- "processed_articles = len([d for d in documents if d.meta['_split_id'] == 0])\n",
304
- "print(f'Number of processed articles: {processed_articles}')\n",
305
- "print(f'Number of processed documents: {len(documents)}')"
306
- ]
307
- },
308
- {
309
- "cell_type": "markdown",
310
- "id": "23ce57a8-d14e-426d-abc2-0ce5cdbc881a",
311
- "metadata": {},
312
- "source": [
313
- "# Write to file"
314
- ]
315
- },
316
- {
317
- "cell_type": "code",
318
- "execution_count": 14,
319
- "id": "0d044870-7a30-4e09-aad2-42f24a52780d",
320
- "metadata": {
321
- "tags": []
322
- },
323
- "outputs": [],
324
- "source": [
325
- "with open(file_out, 'wb') as handle:\n",
326
- " pickle.dump(documents, handle, protocol=pickle.HIGHEST_PROTOCOL)"
327
- ]
328
- },
329
- {
330
- "cell_type": "code",
331
- "execution_count": null,
332
- "id": "c5833dba-1bf6-48aa-be6f-0d70c71e54aa",
333
- "metadata": {},
334
- "outputs": [],
335
- "source": []
336
- }
337
- ],
338
- "metadata": {
339
- "kernelspec": {
340
- "display_name": "Python 3 (ipykernel)",
341
- "language": "python",
342
- "name": "python3"
343
- },
344
- "language_info": {
345
- "codemirror_mode": {
346
- "name": "ipython",
347
- "version": 3
348
- },
349
- "file_extension": ".py",
350
- "mimetype": "text/x-python",
351
- "name": "python",
352
- "nbconvert_exporter": "python",
353
- "pygments_lexer": "ipython3",
354
- "version": "3.10.9"
355
- }
356
- },
357
- "nbformat": 4,
358
- "nbformat_minor": 5
359
- }
notebooks/03_get_embeddings.ipynb DELETED
@@ -1,441 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "a0f21cb1-fbc8-4282-b902-f47d92974df8",
6
- "metadata": {},
7
- "source": [
8
- "# Pre-requisites"
9
- ]
10
- },
11
- {
12
- "cell_type": "markdown",
13
- "id": "5f625807-0707-4e2f-a0e0-8fbcdf08c865",
14
- "metadata": {},
15
- "source": [
16
- "## Why TEI\n",
17
- "There are 2 **unsung** challenges with RAG at scale:\n",
18
- "1. Getting the embeddings efficiently\n",
19
- "1. Efficient ingestion into the vector DB\n",
20
- "\n",
21
- "The issue with `1.` is that there are techniques but they are not widely *applied*. TEI solves a number of aspects:\n",
22
- "- Token Based Dynamic Batching\n",
23
- "- Using latest optimizations (Flash Attention, Candle and cuBLASLt)\n",
24
- "- Fast loading with safetensors\n",
25
- "\n",
26
- "The issue with `2.` is that it takes a bit of planning. We wont go much into that side of things here though."
27
- ]
28
- },
29
- {
30
- "cell_type": "markdown",
31
- "id": "3102abce-ea42-4da6-8c98-c6dd4edf7f0b",
32
- "metadata": {},
33
- "source": [
34
- "## Start TEI\n",
35
- "Run [TEI](https://github.com/huggingface/text-embeddings-inference#docker), I have this running in a nvidia-docker container, but you can install as you like. Note that I ran this in a different terminal for monitoring and seperation. \n",
36
- "\n",
37
- "Note that as its running, its always going to pull the latest. Its at a very early stage at the time of writing. \n",
38
- "\n",
39
- "I chose the smaller [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) instead of the large. Its just as good on [mteb/leaderboard](https://huggingface.co/spaces/mteb/leaderboard) but its faster and smaller. TEI is fast, but this will make our life easier for storage and retrieval.\n",
40
- "\n",
41
- "I use the `revision=refs/pr/1` because this has the pull request with [safetensors](https://github.com/huggingface/safetensors) which is required by TEI. Check out the [pull request](https://huggingface.co/BAAI/bge-base-en-v1.5/discussions/1) if you want to use a different embedding model and it doesnt have safetensors."
42
- ]
43
- },
44
- {
45
- "cell_type": "code",
46
- "execution_count": 1,
47
- "id": "7e873652-8257-4aae-92bc-94e1bac54b73",
48
- "metadata": {
49
- "tags": []
50
- },
51
- "outputs": [],
52
- "source": [
53
- "%%bash\n",
54
- "\n",
55
- "# volume=$PWD/data\n",
56
- "# model=BAAI/bge-base-en-v1.5\n",
57
- "# revision=refs/pr/1\n",
58
- "# docker run \\\n",
59
- "# --gpus all \\\n",
60
- "# -p 8080:80 \\\n",
61
- "# -v $volume:/data \\\n",
62
- "# --pull always \\\n",
63
- "# ghcr.io/huggingface/text-embeddings-inference:latest \\\n",
64
- "# --model-id $model \\\n",
65
- "# --revision $revision \\\n",
66
- "# --pooling cls \\\n",
67
- "# --max-batch-tokens 65536"
68
- ]
69
- },
70
- {
71
- "cell_type": "markdown",
72
- "id": "86a5ff83-1038-4880-8c90-dc3cab75cb49",
73
- "metadata": {},
74
- "source": [
75
- "## Test Endpoint"
76
- ]
77
- },
78
- {
79
- "cell_type": "code",
80
- "execution_count": 2,
81
- "id": "52edfc97-5b6f-44f9-8d89-8578cf79fae9",
82
- "metadata": {
83
- "tags": []
84
- },
85
- "outputs": [
86
- {
87
- "name": "stdout",
88
- "output_type": "stream",
89
- "text": [
90
- "passed\n"
91
- ]
92
- }
93
- ],
94
- "source": [
95
- "%%bash\n",
96
- "\n",
97
- "response_code=$(curl -s -o /dev/null -w \"%{http_code}\" 127.0.0.1:8080/embed \\\n",
98
- " -X POST \\\n",
99
- " -d '{\"inputs\":\"What is Deep Learning?\"}' \\\n",
100
- " -H 'Content-Type: application/json')\n",
101
- "\n",
102
- "if [ \"$response_code\" -eq 200 ]; then\n",
103
- " echo \"passed\"\n",
104
- "else\n",
105
- " echo \"failed\"\n",
106
- "fi"
107
- ]
108
- },
109
- {
110
- "cell_type": "markdown",
111
- "id": "b1b28232-b65d-41ce-88de-fd70b93a528d",
112
- "metadata": {},
113
- "source": [
114
- "# Imports"
115
- ]
116
- },
117
- {
118
- "cell_type": "code",
119
- "execution_count": 3,
120
- "id": "88408486-566a-4791-8ef2-5ee3e6941156",
121
- "metadata": {
122
- "tags": []
123
- },
124
- "outputs": [],
125
- "source": [
126
- "from IPython.core.interactiveshell import InteractiveShell\n",
127
- "InteractiveShell.ast_node_interactivity = 'all'"
128
- ]
129
- },
130
- {
131
- "cell_type": "code",
132
- "execution_count": 4,
133
- "id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
134
- "metadata": {
135
- "tags": []
136
- },
137
- "outputs": [],
138
- "source": [
139
- "import asyncio\n",
140
- "from pathlib import Path\n",
141
- "import pickle\n",
142
- "\n",
143
- "import aiohttp\n",
144
- "from tqdm.notebook import tqdm"
145
- ]
146
- },
147
- {
148
- "cell_type": "code",
149
- "execution_count": 5,
150
- "id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
151
- "metadata": {
152
- "tags": []
153
- },
154
- "outputs": [
155
- {
156
- "name": "stdout",
157
- "output_type": "stream",
158
- "text": [
159
- "/home/ec2-user/RAGDemo\n"
160
- ]
161
- }
162
- ],
163
- "source": [
164
- "proj_dir = Path.cwd().parent\n",
165
- "print(proj_dir)"
166
- ]
167
- },
168
- {
169
- "cell_type": "markdown",
170
- "id": "76119e74-f601-436d-a253-63c5a19d1c83",
171
- "metadata": {},
172
- "source": [
173
- "# Config"
174
- ]
175
- },
176
- {
177
- "cell_type": "markdown",
178
- "id": "0d2bcda7-b245-45e3-a347-34166f217e1e",
179
- "metadata": {},
180
- "source": [
181
- "I'm putting the documents in pickle files. The compression is nice, though its important to note pickles are known to be a security risk."
182
- ]
183
- },
184
- {
185
- "cell_type": "code",
186
- "execution_count": 6,
187
- "id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
188
- "metadata": {
189
- "tags": []
190
- },
191
- "outputs": [],
192
- "source": [
193
- "file_in = proj_dir / 'data/processed/simple_wiki_processed.pkl'\n",
194
- "file_out = proj_dir / 'data/processed/simple_wiki_embeddings.pkl'"
195
- ]
196
- },
197
- {
198
- "cell_type": "markdown",
199
- "id": "d2dd0df0-4274-45b3-9ee5-0205494e4d75",
200
- "metadata": {
201
- "tags": []
202
- },
203
- "source": [
204
- "# Setup\n",
205
- "Read in our list of documents and convert them to dictionaries for processing."
206
- ]
207
- },
208
- {
209
- "cell_type": "code",
210
- "execution_count": 7,
211
- "id": "3c08e039-3686-4eca-9f87-7c469e3f19bc",
212
- "metadata": {
213
- "tags": []
214
- },
215
- "outputs": [
216
- {
217
- "name": "stdout",
218
- "output_type": "stream",
219
- "text": [
220
- "CPU times: user 6.24 s, sys: 928 ms, total: 7.17 s\n",
221
- "Wall time: 6.61 s\n"
222
- ]
223
- }
224
- ],
225
- "source": [
226
- "%%time\n",
227
- "with open(file_in, 'rb') as handle:\n",
228
- " documents = pickle.load(handle)\n",
229
- "\n",
230
- "documents = [document.to_dict() for document in documents]"
231
- ]
232
- },
233
- {
234
- "cell_type": "markdown",
235
- "id": "5e73235d-6274-4958-9e57-977afeeb5f1b",
236
- "metadata": {},
237
- "source": [
238
- "# Embed\n",
239
- "## Strategy\n",
240
- "TEI allows multiple concurrent requests, so its important that we dont waste the potential we have. I used the default `max-concurrent-requests` value of `512`, so I want to use that many `MAX_WORKERS`.\n",
241
- "\n",
242
- "Im using an `async` way of making requests that uses `aiohttp` as well as a nice progress bar. "
243
- ]
244
- },
245
- {
246
- "cell_type": "code",
247
- "execution_count": 8,
248
- "id": "949d6bf8-804f-496b-a59a-834483cc7073",
249
- "metadata": {
250
- "tags": []
251
- },
252
- "outputs": [],
253
- "source": [
254
- "# Constants\n",
255
- "ENDPOINT = \"http://127.0.0.1:8080/embed\"\n",
256
- "HEADERS = {'Content-Type': 'application/json'}\n",
257
- "MAX_WORKERS = 512"
258
- ]
259
- },
260
- {
261
- "cell_type": "markdown",
262
- "id": "cf3da8cc-1651-4704-9091-39c2a1b835be",
263
- "metadata": {},
264
- "source": [
265
- "Note that Im using `'truncate':True` as even with our `350` word split earlier, there are always exceptions. Its important that as this scales we have as few issues as possible when embedding. "
266
- ]
267
- },
268
- {
269
- "cell_type": "code",
270
- "execution_count": 9,
271
- "id": "3353c849-a36c-4047-bb81-93dac6c49b68",
272
- "metadata": {
273
- "tags": []
274
- },
275
- "outputs": [],
276
- "source": [
277
- "async def fetch(session, url, document):\n",
278
- " payload = {\"inputs\": [document[\"content\"]], 'truncate':True}\n",
279
- " async with session.post(url, json=payload) as response:\n",
280
- " if response.status == 200:\n",
281
- " resp_json = await response.json()\n",
282
- " # Assuming the server's response contains an 'embedding' field\n",
283
- " document[\"embedding\"] = resp_json[0]\n",
284
- " else:\n",
285
- " print(f\"Error {response.status}: {await response.text()}\")\n",
286
- " # Handle error appropriately if needed\n",
287
- "\n",
288
- "async def main(documents):\n",
289
- " async with aiohttp.ClientSession(headers=HEADERS) as session:\n",
290
- " tasks = [fetch(session, ENDPOINT, doc) for doc in documents]\n",
291
- " await asyncio.gather(*tasks)"
292
- ]
293
- },
294
- {
295
- "cell_type": "code",
296
- "execution_count": 10,
297
- "id": "f0d17264-72dc-40be-aa46-17cde38c8189",
298
- "metadata": {
299
- "tags": []
300
- },
301
- "outputs": [
302
- {
303
- "data": {
304
- "application/vnd.jupyter.widget-view+json": {
305
- "model_id": "f0ff772e915f4432971317e2150b60f2",
306
- "version_major": 2,
307
- "version_minor": 0
308
- },
309
- "text/plain": [
310
- "Processing documents: 0%| | 0/526 [00:00<?, ?it/s]"
311
- ]
312
- },
313
- "metadata": {},
314
- "output_type": "display_data"
315
- }
316
- ],
317
- "source": [
318
- "%%time\n",
319
- "# Create a list of async tasks\n",
320
- "tasks = [main(documents[i:i+MAX_WORKERS]) for i in range(0, len(documents), MAX_WORKERS)]\n",
321
- "\n",
322
- "# Add a progress bar for visual feedback and run tasks\n",
323
- "for task in tqdm(tasks, desc=\"Processing documents\"):\n",
324
- " await task"
325
- ]
326
- },
327
- {
328
- "cell_type": "markdown",
329
- "id": "f90a0ed7-b5e9-4ae4-9e87-4c04875ebcc9",
330
- "metadata": {},
331
- "source": [
332
- "Lets double check that we got all the embeddings we expected!"
333
- ]
334
- },
335
- {
336
- "cell_type": "code",
337
- "execution_count": 11,
338
- "id": "3950fa88-9961-4b33-9719-d5804509d4cf",
339
- "metadata": {
340
- "tags": []
341
- },
342
- "outputs": [
343
- {
344
- "data": {
345
- "text/plain": [
346
- "268980"
347
- ]
348
- },
349
- "execution_count": 11,
350
- "metadata": {},
351
- "output_type": "execute_result"
352
- },
353
- {
354
- "data": {
355
- "text/plain": [
356
- "268980"
357
- ]
358
- },
359
- "execution_count": 11,
360
- "metadata": {},
361
- "output_type": "execute_result"
362
- }
363
- ],
364
- "source": [
365
- "count = 0\n",
366
- "for document in documents:\n",
367
- " if len(document['embedding']) == 768:\n",
368
- " count += 1\n",
369
- "count\n",
370
- "len(documents)"
371
- ]
372
- },
373
- {
374
- "cell_type": "markdown",
375
- "id": "5b78bfa4-d365-4906-a71c-f444eabf6bf8",
376
- "metadata": {
377
- "tags": []
378
- },
379
- "source": [
380
- "Great, we can see that they match.\n",
381
- "\n",
382
- "Let's write our embeddings to file"
383
- ]
384
- },
385
- {
386
- "cell_type": "code",
387
- "execution_count": 12,
388
- "id": "58d437a5-473f-4eae-9dbf-e8e6992754f6",
389
- "metadata": {
390
- "tags": []
391
- },
392
- "outputs": [
393
- {
394
- "name": "stdout",
395
- "output_type": "stream",
396
- "text": [
397
- "CPU times: user 5.68 s, sys: 640 ms, total: 6.32 s\n",
398
- "Wall time: 14.1 s\n"
399
- ]
400
- }
401
- ],
402
- "source": [
403
- "%%time\n",
404
- "with open(file_out, 'wb') as handle:\n",
405
- " pickle.dump(documents, handle, protocol=pickle.HIGHEST_PROTOCOL)"
406
- ]
407
- },
408
- {
409
- "cell_type": "markdown",
410
- "id": "fc1e7cc5-b878-42bb-9fb4-e810f3f5006a",
411
- "metadata": {
412
- "tags": []
413
- },
414
- "source": [
415
- "# Next Steps\n",
416
- "We need to import this into a vector db. "
417
- ]
418
- }
419
- ],
420
- "metadata": {
421
- "kernelspec": {
422
- "display_name": "Python 3 (ipykernel)",
423
- "language": "python",
424
- "name": "python3"
425
- },
426
- "language_info": {
427
- "codemirror_mode": {
428
- "name": "ipython",
429
- "version": 3
430
- },
431
- "file_extension": ".py",
432
- "mimetype": "text/x-python",
433
- "name": "python",
434
- "nbconvert_exporter": "python",
435
- "pygments_lexer": "ipython3",
436
- "version": "3.10.9"
437
- }
438
- },
439
- "nbformat": 4,
440
- "nbformat_minor": 5
441
- }
notebooks/04_vector_db.ipynb DELETED
@@ -1,241 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "6a151ade-7d86-4a2e-bfe7-462089f4e04c",
6
- "metadata": {},
7
- "source": [
8
- "# Approach\n",
9
- "There are a number of aspects of choosing a vector db that might be unique to your situation. You should think through your HW, utilization, latency requirements, scale, etc before choosing. \n",
10
- "\n",
11
- "Im targeting a demo (low utilization, latency can be relaxed) that will live on a huggingface space. I have a small scale that could even fit in memory. I like [Qdrant](https://qdrant.tech) for this. "
12
- ]
13
- },
14
- {
15
- "cell_type": "markdown",
16
- "id": "b1b28232-b65d-41ce-88de-fd70b93a528d",
17
- "metadata": {},
18
- "source": [
19
- "# Imports"
20
- ]
21
- },
22
- {
23
- "cell_type": "code",
24
- "execution_count": 1,
25
- "id": "88408486-566a-4791-8ef2-5ee3e6941156",
26
- "metadata": {
27
- "tags": []
28
- },
29
- "outputs": [],
30
- "source": [
31
- "from IPython.core.interactiveshell import InteractiveShell\n",
32
- "InteractiveShell.ast_node_interactivity = 'all'"
33
- ]
34
- },
35
- {
36
- "cell_type": "code",
37
- "execution_count": 2,
38
- "id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
39
- "metadata": {
40
- "tags": []
41
- },
42
- "outputs": [],
43
- "source": [
44
- "from pathlib import Path\n",
45
- "import pickle\n",
46
- "\n",
47
- "from tqdm.notebook import tqdm\n",
48
- "from haystack.schema import Document\n",
49
- "from qdrant_haystack import QdrantDocumentStore"
50
- ]
51
- },
52
- {
53
- "cell_type": "code",
54
- "execution_count": 3,
55
- "id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
56
- "metadata": {
57
- "tags": []
58
- },
59
- "outputs": [
60
- {
61
- "name": "stdout",
62
- "output_type": "stream",
63
- "text": [
64
- "/home/ec2-user/RAGDemo\n"
65
- ]
66
- }
67
- ],
68
- "source": [
69
- "proj_dir = Path.cwd().parent\n",
70
- "print(proj_dir)"
71
- ]
72
- },
73
- {
74
- "cell_type": "markdown",
75
- "id": "76119e74-f601-436d-a253-63c5a19d1c83",
76
- "metadata": {},
77
- "source": [
78
- "# Config"
79
- ]
80
- },
81
- {
82
- "cell_type": "code",
83
- "execution_count": 4,
84
- "id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
85
- "metadata": {
86
- "tags": []
87
- },
88
- "outputs": [],
89
- "source": [
90
- "file_in = proj_dir / 'data/processed/simple_wiki_embeddings.pkl'"
91
- ]
92
- },
93
- {
94
- "cell_type": "markdown",
95
- "id": "d2dd0df0-4274-45b3-9ee5-0205494e4d75",
96
- "metadata": {
97
- "tags": []
98
- },
99
- "source": [
100
- "# Setup\n",
101
- "Read in our list of dictionaries. This is the upper end for the machine Im using. This takes ~10GB of RAM. We could easily do this in batches of ~100k and be fine in most machines. "
102
- ]
103
- },
104
- {
105
- "cell_type": "code",
106
- "execution_count": 5,
107
- "id": "3c08e039-3686-4eca-9f87-7c469e3f19bc",
108
- "metadata": {
109
- "tags": []
110
- },
111
- "outputs": [
112
- {
113
- "name": "stdout",
114
- "output_type": "stream",
115
- "text": [
116
- "CPU times: user 11.6 s, sys: 2.25 s, total: 13.9 s\n",
117
- "Wall time: 18.1 s\n"
118
- ]
119
- }
120
- ],
121
- "source": [
122
- "%%time\n",
123
- "with open(file_in, 'rb') as handle:\n",
124
- " documents = pickle.load(handle)"
125
- ]
126
- },
127
- {
128
- "cell_type": "markdown",
129
- "id": "98aec715-8d97-439e-99c0-0eff63df386b",
130
- "metadata": {},
131
- "source": [
132
- "Convert the dictionaries to `Documents`"
133
- ]
134
- },
135
- {
136
- "cell_type": "code",
137
- "execution_count": 6,
138
- "id": "4821e3c1-697d-4b69-bae3-300168755df9",
139
- "metadata": {
140
- "tags": []
141
- },
142
- "outputs": [],
143
- "source": [
144
- "documents = [Document.from_dict(d) for d in documents]"
145
- ]
146
- },
147
- {
148
- "cell_type": "markdown",
149
- "id": "676f644c-fb09-4d17-89ba-30c92aad8777",
150
- "metadata": {},
151
- "source": [
152
- "Instantiate our `DocumentStore`. Note that Im saving this to disk, this is for portability which is good considering I want to move from this ec2 instance into a Hugging Face Space. \n",
153
- "\n",
154
- "Note that if you are doing this at scale, you should use a proper instance and not saving to file. You should also take a [measured ingestion](https://qdrant.tech/documentation/tutorials/bulk-upload/) approach instead of using a convenient loader. "
155
- ]
156
- },
157
- {
158
- "cell_type": "code",
159
- "execution_count": 7,
160
- "id": "e51b6e19-3be8-4cb0-8b65-9d6f6121f660",
161
- "metadata": {
162
- "tags": []
163
- },
164
- "outputs": [],
165
- "source": [
166
- "document_store = QdrantDocumentStore(\n",
167
- " path=str(proj_dir/'Qdrant'),\n",
168
- " index=\"RAGDemo\",\n",
169
- " embedding_dim=768,\n",
170
- " recreate_index=True,\n",
171
- " hnsw_config={\"m\": 16, \"ef_construct\": 64} # Optional\n",
172
- ")"
173
- ]
174
- },
175
- {
176
- "cell_type": "code",
177
- "execution_count": 9,
178
- "id": "55fbcd5d-922c-4e93-a37a-974ba84464ac",
179
- "metadata": {
180
- "tags": []
181
- },
182
- "outputs": [
183
- {
184
- "name": "stderr",
185
- "output_type": "stream",
186
- "text": [
187
- "270000it [28:43, 156.68it/s] "
188
- ]
189
- },
190
- {
191
- "name": "stdout",
192
- "output_type": "stream",
193
- "text": [
194
- "CPU times: user 13min 23s, sys: 48.6 s, total: 14min 12s\n",
195
- "Wall time: 28min 43s\n"
196
- ]
197
- },
198
- {
199
- "name": "stderr",
200
- "output_type": "stream",
201
- "text": [
202
- "\n"
203
- ]
204
- }
205
- ],
206
- "source": [
207
- "%%time\n",
208
- "document_store.write_documents(documents, batch_size=5_000)"
209
- ]
210
- },
211
- {
212
- "cell_type": "code",
213
- "execution_count": null,
214
- "id": "9a073815-0191-48f7-890f-a4e4ecc0f9f1",
215
- "metadata": {},
216
- "outputs": [],
217
- "source": []
218
- }
219
- ],
220
- "metadata": {
221
- "kernelspec": {
222
- "display_name": "Python 3 (ipykernel)",
223
- "language": "python",
224
- "name": "python3"
225
- },
226
- "language_info": {
227
- "codemirror_mode": {
228
- "name": "ipython",
229
- "version": 3
230
- },
231
- "file_extension": ".py",
232
- "mimetype": "text/x-python",
233
- "name": "python",
234
- "nbconvert_exporter": "python",
235
- "pygments_lexer": "ipython3",
236
- "version": "3.10.9"
237
- }
238
- },
239
- "nbformat": 4,
240
- "nbformat_minor": 5
241
- }
requirements.txt CHANGED
@@ -1 +1,11 @@
1
- wikiextractor==3.0.6
1
+ wikiextractor==3.0.6
2
+ sentence-transformers>2.2.0
3
+ qdrant-haystack==1.0.10
4
+ ipywidgets==8.1.1
5
+ tqdm==4.66.1
6
+ aiohttp==3.8.6
7
+ huggingface-hub==0.17.3
8
+ farm-haystack[inference]
9
+ ipywidgets==8.1.1
10
+ plotly==5.18.0
11
+ lancedb==0.3.2
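
Note: this commit only adds the lancedb dependency and LFS-tracks lancedb/**/*; no LanceDB usage appears in the diff itself. A rough, hypothetical sketch of how the embedded documents might be ingested and queried with LanceDB follows (the table name, paths, and field names are made up, and the API surface can differ between lancedb releases):

import lancedb

# Hypothetical sketch (not part of this commit): load documents that already carry
# 768-dim embeddings (e.g. from TEI with bge-base) into an on-disk LanceDB table.
db = lancedb.connect("lancedb")  # the lancedb/ directory is what .gitattributes now tracks via LFS

records = [
    {"vector": [0.0] * 768, "content": "Example passage", "title": "Example"},  # placeholder data
]
table = db.create_table("arwiki", data=records)  # "arwiki" is an assumed table name

# Retrieval: search with the query's embedding in the same 768-dim space.
hits = table.search([0.0] * 768).limit(3).to_df()  # newer lancedb releases expose .to_pandas()
print(hits[["title", "content"]])

Keeping the lancedb/ directory under git-lfs presumably lets the populated table ship with the repo, much like the on-disk Qdrant store in the deleted notebook.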