derek-thomas (HF staff) committed
Commit 6f49ee6
1 Parent(s): b8cae8a

Adding code

README.md CHANGED
@@ -11,3 +11,8 @@ license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ # Installation
16
+ 1. Download the repo
17
+ 2. Install Jupyter
18
+ 3. Run the notebooks in order
notebooks/01_get_data.ipynb ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "883a8a6a-d0b5-40ea-90a0-5b33d3332360",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Get Data\n",
9
+ "The data from wikipedia starts in XML, this is a relatively simple way to format that into a single json for our purposes."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "id": "a7d66da5-185c-409e-9568-f211ca4b725e",
15
+ "metadata": {},
16
+ "source": [
17
+ "## Initialize Variables"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "ea8ae64c-f597-4c94-b93d-1b78060d7953",
24
+ "metadata": {
25
+ "tags": []
26
+ },
27
+ "outputs": [],
28
+ "source": [
29
+ "from pathlib import Path\n",
30
+ "import sys"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "id": "2f9527f9-4756-478b-99ac-a3c8c26ab63e",
37
+ "metadata": {
38
+ "tags": []
39
+ },
40
+ "outputs": [],
41
+ "source": [
42
+ "proj_dir = str(Path.cwd().parent)\n",
43
+ "proj_dir\n",
44
+ "\n",
45
+ "# So we can import later\n",
46
+ "sys.path.append(proj_dir)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "markdown",
51
+ "id": "860da614-743b-4060-9d22-673896414cbd",
52
+ "metadata": {},
53
+ "source": [
54
+ "## Install Libraries"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "id": "8bec29e3-8434-491f-914c-13f303dc68f3",
61
+ "metadata": {
62
+ "tags": []
63
+ },
64
+ "outputs": [],
65
+ "source": [
66
+ "%pip install -q -r \"$proj_dir\"/requirements.txt"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "markdown",
71
+ "id": "b928c71f-7e34-47ee-b55e-aa12d5118ba7",
72
+ "metadata": {},
73
+ "source": [
74
+ "## Download Latest Simple Wikipedia"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "id": "f1dc5f57-c877-43e3-8131-4f351b99168d",
80
+ "metadata": {},
81
+ "source": [
82
+ "Im getting \"latest\" but its good to see what version it is nonetheless."
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "id": "fe4b357f-88fe-44b5-9fce-354404b1447f",
89
+ "metadata": {
90
+ "tags": []
91
+ },
92
+ "outputs": [],
93
+ "source": [
94
+ "!curl -I https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2 --silent | grep \"Last-Modified\""
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "markdown",
99
+ "id": "fe62d4a3-b59b-40c4-9a8c-bf0a447a9ec2",
100
+ "metadata": {},
101
+ "source": [
102
+ "Download simple wikipedia"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "id": "0f309c12-12de-4460-a03f-bd5b6fcc942c",
109
+ "metadata": {
110
+ "tags": []
111
+ },
112
+ "outputs": [],
113
+ "source": [
114
+ "!wget -P \"$proj_dir\"/data/raw https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "markdown",
119
+ "id": "46af5df6-5785-400a-986c-54a2c98768ea",
120
+ "metadata": {},
121
+ "source": [
122
+ "## Extract XML into jsonl"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "c22dedcd-73b3-4aad-8eb7-1063954967ed",
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "!wikiextractor -o \"$proj_dir\"/data/raw/output --json simplewiki-latest-pages-articles-multistream.xml.bz2 "
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "markdown",
137
+ "id": "bb8063c6-1bed-49f0-948a-eeb9a7933b4a",
138
+ "metadata": {},
139
+ "source": [
140
+ "## Consolidate into json\n",
141
+ "\n",
142
+ "Some of this is boring, and most people dont care how you format it, just that its correct. Feel free to check out the consolidate file for more details."
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "id": "0a4ce3aa-9c1e-45e4-8219-a1714f482371",
149
+ "metadata": {
150
+ "tags": []
151
+ },
152
+ "outputs": [],
153
+ "source": [
154
+ "from src.preprocessing.consolidate import folder_to_json"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "3e93da6a-e304-450c-a81e-ffecaf0d8a9a",
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "folder = proj_dir / 'data/raw/output'\n",
165
+ "file_out = proj_dir / 'data/consolidated/simple_wiki.json'\n",
166
+ "folder_to_json(folder, file_out)"
167
+ ]
168
+ }
169
+ ],
170
+ "metadata": {
171
+ "kernelspec": {
172
+ "display_name": "Python 3 (ipykernel)",
173
+ "language": "python",
174
+ "name": "python3"
175
+ },
176
+ "language_info": {
177
+ "codemirror_mode": {
178
+ "name": "ipython",
179
+ "version": 3
180
+ },
181
+ "file_extension": ".py",
182
+ "mimetype": "text/x-python",
183
+ "name": "python",
184
+ "nbconvert_exporter": "python",
185
+ "pygments_lexer": "ipython3",
186
+ "version": "3.10.13"
187
+ }
188
+ },
189
+ "nbformat": 4,
190
+ "nbformat_minor": 5
191
+ }
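The notebook above ends by calling `folder_to_json` from `src/preprocessing/consolidate.py`, which writes a single JSON list of `{'content', 'meta'}` records. As an optional sanity check (a sketch, assuming the notebook's default output path and that it is run from the repo root), you could load the consolidated file and inspect one record:

```python
import json
from pathlib import Path

# Assumes notebook 01 has been run and proj_dir points at the repo root.
proj_dir = Path.cwd()
consolidated = proj_dir / "data/consolidated/simple_wiki.json"

with open(consolidated, "r", encoding="utf-8") as f:
    articles = json.load(f)

print(f"Articles: {len(articles)}")
print(articles[0]["content"][:100])  # the article text
print(articles[0]["meta"])           # everything else: id, revid, url, title
```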
notebooks/02_preprocessing.ipynb ADDED
@@ -0,0 +1,348 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b1b28232-b65d-41ce-88de-fd70b93a528d",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Imports"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
15
+ "metadata": {
16
+ "tags": []
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "import json\n",
21
+ "from pathlib import Path\n",
22
+ "import pickle\n",
23
+ "from tqdm.auto import tqdm\n",
24
+ "\n",
25
+ "from haystack.nodes.preprocessor import PreProcessor"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 2,
31
+ "id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
32
+ "metadata": {
33
+ "tags": []
34
+ },
35
+ "outputs": [
36
+ {
37
+ "name": "stdout",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "/Users/derekthomas/projects/spaces/RAGDemo\n"
41
+ ]
42
+ }
43
+ ],
44
+ "source": [
45
+ "proj_dir = Path.cwd().parent\n",
46
+ "print(proj_dir)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "markdown",
51
+ "id": "76119e74-f601-436d-a253-63c5a19d1c83",
52
+ "metadata": {},
53
+ "source": [
54
+ "# Config"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 3,
60
+ "id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
61
+ "metadata": {
62
+ "tags": []
63
+ },
64
+ "outputs": [],
65
+ "source": [
66
+ "file_in = proj_dir / 'data/consolidated/simple_wiki.json'\n",
67
+ "file_out = proj_dir / 'data/processed/simple_wiki_processed.plk'"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "markdown",
72
+ "id": "6a643cf2-abce-48a9-b4e0-478bcbee28c3",
73
+ "metadata": {},
74
+ "source": [
75
+ "# Preprocessing"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "markdown",
80
+ "id": "a8f9630e-447e-423e-9f6c-e1dbc654f2dd",
81
+ "metadata": {},
82
+ "source": [
83
+ "Its important to choose good pre-processing options. \n",
84
+ "\n",
85
+ "Clean whitespace helps each stage of RAG. It adds noise to the embeddings, and wastes space when we prompt with it.\n",
86
+ "\n",
87
+ "I chose to split by word as it would be tedious to tokenize here, and that doesnt scale well. The context length for most embedding models ends up being 512 tokens. This is ~400 words. \n",
88
+ "\n",
89
+ "I like to respect the sentence boundary, thats why I gave a ~50 word buffer."
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 4,
95
+ "id": "18807aea-24e4-4d74-bf10-55b24f3cb52c",
96
+ "metadata": {
97
+ "tags": []
98
+ },
99
+ "outputs": [],
100
+ "source": [
101
+ "pp = PreProcessor(clean_whitespace = True,\n",
102
+ " clean_header_footer = False,\n",
103
+ " clean_empty_lines = True,\n",
104
+ " remove_substrings = None,\n",
105
+ " split_by='word',\n",
106
+ " split_length = 350,\n",
107
+ " split_overlap = 50,\n",
108
+ " split_respect_sentence_boundary = True,\n",
109
+ " tokenizer_model_folder = None,\n",
110
+ " language = \"en\",\n",
111
+ " id_hash_keys = None,\n",
112
+ " progress_bar = True,\n",
113
+ " add_page_number = False,\n",
114
+ " max_chars_check = 10_000)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 5,
120
+ "id": "dab1658a-79a7-40f2-9a8c-1798e0d124bf",
121
+ "metadata": {
122
+ "tags": []
123
+ },
124
+ "outputs": [],
125
+ "source": [
126
+ "with open(file_in, 'r', encoding='utf-8') as f:\n",
127
+ " list_of_articles = json.load(f)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 6,
133
+ "id": "4ca6e576-4b7d-4c1a-916f-41d1b82be647",
134
+ "metadata": {
135
+ "tags": []
136
+ },
137
+ "outputs": [
138
+ {
139
+ "name": "stderr",
140
+ "output_type": "stream",
141
+ "text": [
142
+ "Preprocessing: 0%|█▏ | 1510/332023 [00:01<04:12, 1308.89docs/s]We found one or more sentences whose word count is higher than the split length.\n",
143
+ "Preprocessing: 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 276351/332023 [01:06<00:10, 5510.66docs/s]Document 81972e5bc1997b1ed4fb86d17f061a41 is 21206 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.\n",
144
+ "Document 5e63e848e42966ddc747257fb7cf4092 is 11206 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.\n",
145
+ "Preprocessing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 332023/332023 [01:15<00:00, 4403.36docs/s]\n"
146
+ ]
147
+ }
148
+ ],
149
+ "source": [
150
+ "documents = pp.process(list_of_articles)"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "markdown",
155
+ "id": "f00dbdb2-906f-4d5a-a3f1-b0d84385d85a",
156
+ "metadata": {},
157
+ "source": [
158
+ "When we break a wikipedia article up, we lose some of the context. The local context is somewhat preserved by the `split_overlap`. Im trying to preserve the global context by adding a prefix that has the article's title.\n",
159
+ "\n",
160
+ "You could enhance this with the summary as well. This is mostly to help the retrieval step of RAG. Note that the way Im doing it alters some of `haystack`'s features like the hash and the lengths, but those arent too necessary. \n",
161
+ "\n",
162
+ "A more advanced way for many business applications would be to summarize the document and add that as a prefix for sub-documents.\n",
163
+ "\n",
164
+ "One last thing to note, is that it would be prudent (in some use-cases) to preserve the original document without the summary to give to the reader (retrieve with the summary but prompt without), but since this is a simple use-case I wont be doing that."
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 7,
170
+ "id": "076e115d-3e88-49d2-bc5d-f725a94e4964",
171
+ "metadata": {
172
+ "tags": []
173
+ },
174
+ "outputs": [
175
+ {
176
+ "data": {
177
+ "application/vnd.jupyter.widget-view+json": {
178
+ "model_id": "dbdb8eadba804b4485cc3e7f11a8b863",
179
+ "version_major": 2,
180
+ "version_minor": 0
181
+ },
182
+ "text/plain": [
183
+ " 0%| | 0/268980 [00:00<?, ?it/s]"
184
+ ]
185
+ },
186
+ "metadata": {},
187
+ "output_type": "display_data"
188
+ }
189
+ ],
190
+ "source": [
191
+ "# Prefix each document's content\n",
192
+ "for document in tqdm(documents):\n",
193
+ " if document.meta['_split_id'] != 0:\n",
194
+ " document.content = f'Title: {document.meta[\"title\"]}. ' + document.content"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "markdown",
199
+ "id": "72c1849c-1f4d-411f-b74b-6208b1e48217",
200
+ "metadata": {},
201
+ "source": [
202
+ "## Pre-processing Examples"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 8,
208
+ "id": "02c1c6c8-6283-49a8-9d29-c355f1b08540",
209
+ "metadata": {},
210
+ "outputs": [
211
+ {
212
+ "data": {
213
+ "text/plain": [
214
+ "<Document: {'content': \"April (Apr.) is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of the four months to have 30 days.\\nApril always begins on the same day of the week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\\nThe Month.\\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart.\\nIn common years, April starts on the same day of the week as October of the previous year, and in leap years, May of the previous year. In common years, April finishes on the same day of the week as July of the previous year, and in leap years, February and October of the previous year. In common years immediately after other common years, April starts on the same day of the week as January of the previous year, and in leap years and years immediately after that, April finishes on the same day of the week as January of the previous year.\\nIn years immediately before common years, April starts on the same day of the week as September and December of the following year, and in years immediately before leap years, June of the following year. In years immediately before common years, April finishes on the same day of the week as September of the following year, and in years immediately before leap years, March and June of the following year.\\nApril is a spring month in the Northern Hemisphere and an autumn/fall month in the Southern Hemisphere. \", 'content_type': 'text', 'score': None, 'meta': {'id': '1', 'revid': '9086769', 'url': 'https://simple.wikipedia.org/wiki?curid=1', 'title': 'April', '_split_id': 0, '_split_overlap': [{'doc_id': '79a74c1e6444dd0a1acd72840e9dd7c0', 'range': (1529, 1835)}]}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'a1c2acf337dbc3baa6f7f58403dfb95d'}>"
215
+ ]
216
+ },
217
+ "execution_count": 8,
218
+ "metadata": {},
219
+ "output_type": "execute_result"
220
+ }
221
+ ],
222
+ "source": [
223
+ "documents[0]"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 9,
229
+ "id": "b34890bf-9dba-459a-9b0d-aa4b5929cbe8",
230
+ "metadata": {
231
+ "tags": []
232
+ },
233
+ "outputs": [
234
+ {
235
+ "data": {
236
+ "text/plain": [
237
+ "<Document: {'content': 'Title: April. In years immediately before common years, April finishes on the same day of the week as September of the following year, and in years immediately before leap years, March and June of the following year.\\nApril is a spring month in the Northern Hemisphere and an autumn/fall month in the Southern Hemisphere. In each hemisphere, it is the seasonal equivalent of October in the other.\\nIt is unclear as to where April got its name. A common theory is that it comes from the Latin word \"aperire\", meaning \"to open\", referring to flowers opening in spring. Another theory is that the name could come from Aphrodite, the Greek goddess of love. It was originally the second month in the old Roman Calendar, before the start of the new year was put to January 1.\\nQuite a few festivals are held in this month. In many Southeast Asian cultures, new year is celebrated in this month (including Songkran). In Western Christianity, Easter can be celebrated on a Sunday between March 22 and April 25. In Orthodox Christianity, it can fall between April 4 and May 8. At the end of the month, Central and Northern European cultures celebrate Walpurgis Night on April 30, marking the transition from winter into summer.\\nApril in poetry.\\nPoets use \"April\" to mean the end of winter. For example: \"April showers bring May flowers.\"', 'content_type': 'text', 'score': None, 'meta': {'id': '1', 'revid': '9086769', 'url': 'https://simple.wikipedia.org/wiki?curid=1', 'title': 'April', '_split_id': 1, '_split_overlap': [{'doc_id': 'a1c2acf337dbc3baa6f7f58403dfb95d', 'range': (0, 306)}]}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '79a74c1e6444dd0a1acd72840e9dd7c0'}>"
238
+ ]
239
+ },
240
+ "execution_count": 9,
241
+ "metadata": {},
242
+ "output_type": "execute_result"
243
+ }
244
+ ],
245
+ "source": [
246
+ "documents[1]"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": 10,
252
+ "id": "e6f50c27-a486-47e9-ba60-d567f5e530db",
253
+ "metadata": {
254
+ "tags": []
255
+ },
256
+ "outputs": [
257
+ {
258
+ "data": {
259
+ "text/plain": [
260
+ "<Document: {'content': 'Title: Chief Joseph. He knew he could not trust them anymore. He was tired of being considered a savage. He felt it was not fair for people who were born on the same land to be treated differently. He delivered a lot of speeches on this subject, which are still really good examples of eloquence. But he did not feel listened to, and when he died in his reservation in 1904, the doctor said he \"died from sadness\". He was buried in Colville Native American Burial Ground, in Washington State.', 'content_type': 'text', 'score': None, 'meta': {'id': '19310', 'revid': '16695', 'url': 'https://simple.wikipedia.org/wiki?curid=19310', 'title': 'Chief Joseph', '_split_id': 1, '_split_overlap': [{'doc_id': '4bdf9cecd46c3bfac6b225aed940e798', 'range': (0, 275)}]}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '91bc8240c5d067ab24f35c11f8916fc6'}>"
261
+ ]
262
+ },
263
+ "execution_count": 10,
264
+ "metadata": {},
265
+ "output_type": "execute_result"
266
+ }
267
+ ],
268
+ "source": [
269
+ "documents[10102]"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 13,
275
+ "id": "5485cc27-3d3f-4b96-8884-accf5324da2d",
276
+ "metadata": {
277
+ "tags": []
278
+ },
279
+ "outputs": [
280
+ {
281
+ "name": "stdout",
282
+ "output_type": "stream",
283
+ "text": [
284
+ "Number of Articles: 332023\n",
285
+ "Number of processed articles: 237724\n",
286
+ "Number of processed documents: 268980\n"
287
+ ]
288
+ }
289
+ ],
290
+ "source": [
291
+ "print(f'Number of Articles: {len(list_of_articles)}')\n",
292
+ "processed_articles = len([d for d in documents if d.meta['_split_id'] == 0])\n",
293
+ "print(f'Number of processed articles: {processed_articles}')\n",
294
+ "print(f'Number of processed documents: {len(documents)}')"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "markdown",
299
+ "id": "23ce57a8-d14e-426d-abc2-0ce5cdbc881a",
300
+ "metadata": {},
301
+ "source": [
302
+ "# Write to file"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 14,
308
+ "id": "0d044870-7a30-4e09-aad2-42f24a52780d",
309
+ "metadata": {
310
+ "tags": []
311
+ },
312
+ "outputs": [],
313
+ "source": [
314
+ "with open(file_out, 'wb') as handle:\n",
315
+ " pickle.dump(documents, handle, protocol=pickle.HIGHEST_PROTOCOL)"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "id": "c5833dba-1bf6-48aa-be6f-0d70c71e54aa",
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": []
325
+ }
326
+ ],
327
+ "metadata": {
328
+ "kernelspec": {
329
+ "display_name": "Python 3 (ipykernel)",
330
+ "language": "python",
331
+ "name": "python3"
332
+ },
333
+ "language_info": {
334
+ "codemirror_mode": {
335
+ "name": "ipython",
336
+ "version": 3
337
+ },
338
+ "file_extension": ".py",
339
+ "mimetype": "text/x-python",
340
+ "name": "python",
341
+ "nbconvert_exporter": "python",
342
+ "pygments_lexer": "ipython3",
343
+ "version": "3.10.13"
344
+ }
345
+ },
346
+ "nbformat": 4,
347
+ "nbformat_minor": 5
348
+ }
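Notebook 02 ends by pickling the processed list of haystack `Document` objects. A minimal sketch of how a later indexing step could load them back (the path mirrors the notebook's `file_out`; adjust `proj_dir` to your checkout):

```python
import pickle
from pathlib import Path

# Assumes the preprocessing notebook has been run from this repo.
proj_dir = Path.cwd()
file_out = proj_dir / "data/processed/simple_wiki_processed.plk"

with open(file_out, "rb") as handle:
    documents = pickle.load(handle)  # list of haystack Document objects

print(f"Loaded {len(documents)} documents")
print(documents[0].meta["title"], documents[0].content[:80])
```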
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ wikiextractor==3.0.6
2
+ farm-haystack[inference]==1.20.1
3
+ ipywidgets==8.1.1
4
+ tqdm==4.66.1
src/preprocessing/consolidate.py ADDED
@@ -0,0 +1,83 @@
+ import json
+ from pathlib import Path
+ from time import perf_counter
+ from typing import Any, Dict
+
+ from tqdm.auto import tqdm
+
+
+ def folder_to_json(folder_in: Path, json_path: Path):
+     """
+     Process JSON lines from files in a given folder and write the processed data to a new JSON file.
+
+     Parameters:
+     folder_in (Path): Path to the input folder containing the JSON files to process.
+     json_path (Path): Path to the output JSON file where the processed data will be written.
+
+     Example:
+     folder_to_json(Path("/path/to/input/folder"), Path("/path/to/output.json"))
+     """
+
+     json_out = []  # Initialize list to hold processed JSON data from all files
+
+     process_start = perf_counter()
+     # Use rglob to get all wikiextractor output files and sort them by their full path
+     all_files = sorted(folder_in.rglob('*wiki*'), key=lambda x: str(x))
+
+     # Initialize progress bar with total file count, description, and unit of progress
+     with tqdm(total=len(all_files), desc='Processing', unit='file') as pbar:
+         # Iterate through all files in the input folder in order
+         for file_path in all_files:
+             # Update progress bar postfix to display current file and directory
+             pbar.set_postfix_str(f"File: {file_path.name} | Dir: {file_path.parent}", refresh=True)
+
+             # Open and read the current file
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 for line in f:
+                     # Load JSON data from each line and process it
+                     article = json.loads(line)
+                     # Add the restructured article to the output list
+                     json_out.append(restructure_articles(article))
+
+             # Update progress bar after processing each file
+             pbar.update(1)
+         time_taken_to_process = perf_counter() - process_start
+         pbar.write(f"Wiki processed in {round(time_taken_to_process, 2)} seconds!")
+
+         # Notify that the writing process is starting
+         pbar.write("Writing file!")
+         write_start = perf_counter()
+         # Make sure the output folder exists, then write the processed data as JSON
+         json_path.parent.mkdir(parents=True, exist_ok=True)
+         with open(json_path, "w", encoding='utf-8') as outfile:
+             json.dump(json_out, outfile)
+         time_taken_to_write = perf_counter() - write_start
+         # Notify that the writing process is complete
+         pbar.write(f"File written in {round(time_taken_to_write, 2)} seconds!")
+
+
+ def restructure_articles(article: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Restructures the given article into haystack's format, separating content and meta data.
+
+     Args:
+     - article (Dict[str, Any]): The article to restructure.
+
+     Returns:
+     - Dict[str, Any]: The restructured article.
+     """
+
+     # Extract content and separate meta data
+     article_out = {
+         'content': article['text'],
+         'meta': {k: v for k, v in article.items() if k != 'text'}
+     }
+
+     return article_out
+
+
+ if __name__ == '__main__':
+     proj_dir = Path(__file__).parents[2]
+     folder = proj_dir / 'data/raw/output'
+     file_out = proj_dir / 'data/consolidated/simple_wiki.json'
+     folder_to_json(folder, file_out)
+     print('Done!')
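For reference, a small illustration of what `restructure_articles` does to a single parsed wikiextractor record (the sample below reuses the 'April' fields shown in notebook 02; the text is truncated for brevity):

```python
from src.preprocessing.consolidate import restructure_articles

# One line of wikiextractor --json output, already parsed with json.loads.
article = {
    "id": "1",
    "revid": "9086769",
    "url": "https://simple.wikipedia.org/wiki?curid=1",
    "title": "April",
    "text": "April (Apr.) is the fourth month of the year...",
}

doc = restructure_articles(article)
print(doc["content"])  # the article text
print(doc["meta"])     # every other field: id, revid, url, title
```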