AlaFalaki commited on
Commit
e7ec6a9
β€’
1 Parent(s): 5d7ba1e

Created using Colaboratory

Browse files
notebooks/05-Improve_Prompts_+_Add_Source.ipynb ADDED
@@ -0,0 +1,1901 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "authorship_tag": "ABX9TyP+qPbomvCsKc9OqQJcj+gS",
8
+ "include_colab_link": true
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ },
17
+ "widgets": {
18
+ "application/vnd.jupyter.widget-state+json": {
19
+ "88b29c392c7d403488f81903b2395dc1": {
20
+ "model_module": "@jupyter-widgets/controls",
21
+ "model_name": "HBoxModel",
22
+ "model_module_version": "1.5.0",
23
+ "state": {
24
+ "_dom_classes": [],
25
+ "_model_module": "@jupyter-widgets/controls",
26
+ "_model_module_version": "1.5.0",
27
+ "_model_name": "HBoxModel",
28
+ "_view_count": null,
29
+ "_view_module": "@jupyter-widgets/controls",
30
+ "_view_module_version": "1.5.0",
31
+ "_view_name": "HBoxView",
32
+ "box_style": "",
33
+ "children": [
34
+ "IPY_MODEL_81d6771d5cc74dd49cfccc362d5a3c87",
35
+ "IPY_MODEL_1a021f3baf754210ac8696ecc555d968",
36
+ "IPY_MODEL_35d8b3afc94b4294ac0d5b090a49003c"
37
+ ],
38
+ "layout": "IPY_MODEL_2980851a0f0141fcb4b54aba07366ad9"
39
+ }
40
+ },
41
+ "81d6771d5cc74dd49cfccc362d5a3c87": {
42
+ "model_module": "@jupyter-widgets/controls",
43
+ "model_name": "HTMLModel",
44
+ "model_module_version": "1.5.0",
45
+ "state": {
46
+ "_dom_classes": [],
47
+ "_model_module": "@jupyter-widgets/controls",
48
+ "_model_module_version": "1.5.0",
49
+ "_model_name": "HTMLModel",
50
+ "_view_count": null,
51
+ "_view_module": "@jupyter-widgets/controls",
52
+ "_view_module_version": "1.5.0",
53
+ "_view_name": "HTMLView",
54
+ "description": "",
55
+ "description_tooltip": null,
56
+ "layout": "IPY_MODEL_553d09bfb9bc4ceeb0882d5bd6061514",
57
+ "placeholder": "​",
58
+ "style": "IPY_MODEL_13d692f33c85490aaf761d9444f6c145",
59
+ "value": "Parsing nodes: 100%"
60
+ }
61
+ },
62
+ "1a021f3baf754210ac8696ecc555d968": {
63
+ "model_module": "@jupyter-widgets/controls",
64
+ "model_name": "FloatProgressModel",
65
+ "model_module_version": "1.5.0",
66
+ "state": {
67
+ "_dom_classes": [],
68
+ "_model_module": "@jupyter-widgets/controls",
69
+ "_model_module_version": "1.5.0",
70
+ "_model_name": "FloatProgressModel",
71
+ "_view_count": null,
72
+ "_view_module": "@jupyter-widgets/controls",
73
+ "_view_module_version": "1.5.0",
74
+ "_view_name": "ProgressView",
75
+ "bar_style": "success",
76
+ "description": "",
77
+ "description_tooltip": null,
78
+ "layout": "IPY_MODEL_a3f4c5eb33a64fe895d19a5a52426000",
79
+ "max": 14,
80
+ "min": 0,
81
+ "orientation": "horizontal",
82
+ "style": "IPY_MODEL_5b0afc9227e6437ca4ca7a4929f47d7a",
83
+ "value": 14
84
+ }
85
+ },
86
+ "35d8b3afc94b4294ac0d5b090a49003c": {
87
+ "model_module": "@jupyter-widgets/controls",
88
+ "model_name": "HTMLModel",
89
+ "model_module_version": "1.5.0",
90
+ "state": {
91
+ "_dom_classes": [],
92
+ "_model_module": "@jupyter-widgets/controls",
93
+ "_model_module_version": "1.5.0",
94
+ "_model_name": "HTMLModel",
95
+ "_view_count": null,
96
+ "_view_module": "@jupyter-widgets/controls",
97
+ "_view_module_version": "1.5.0",
98
+ "_view_name": "HTMLView",
99
+ "description": "",
100
+ "description_tooltip": null,
101
+ "layout": "IPY_MODEL_39e87e82d7604abfa03879003898259f",
102
+ "placeholder": "​",
103
+ "style": "IPY_MODEL_e876a66432c24d39a4630601d0f4ce93",
104
+ "value": " 14/14 [00:00<00:00, 18.38it/s]"
105
+ }
106
+ },
107
+ "2980851a0f0141fcb4b54aba07366ad9": {
108
+ "model_module": "@jupyter-widgets/base",
109
+ "model_name": "LayoutModel",
110
+ "model_module_version": "1.2.0",
111
+ "state": {
112
+ "_model_module": "@jupyter-widgets/base",
113
+ "_model_module_version": "1.2.0",
114
+ "_model_name": "LayoutModel",
115
+ "_view_count": null,
116
+ "_view_module": "@jupyter-widgets/base",
117
+ "_view_module_version": "1.2.0",
118
+ "_view_name": "LayoutView",
119
+ "align_content": null,
120
+ "align_items": null,
121
+ "align_self": null,
122
+ "border": null,
123
+ "bottom": null,
124
+ "display": null,
125
+ "flex": null,
126
+ "flex_flow": null,
127
+ "grid_area": null,
128
+ "grid_auto_columns": null,
129
+ "grid_auto_flow": null,
130
+ "grid_auto_rows": null,
131
+ "grid_column": null,
132
+ "grid_gap": null,
133
+ "grid_row": null,
134
+ "grid_template_areas": null,
135
+ "grid_template_columns": null,
136
+ "grid_template_rows": null,
137
+ "height": null,
138
+ "justify_content": null,
139
+ "justify_items": null,
140
+ "left": null,
141
+ "margin": null,
142
+ "max_height": null,
143
+ "max_width": null,
144
+ "min_height": null,
145
+ "min_width": null,
146
+ "object_fit": null,
147
+ "object_position": null,
148
+ "order": null,
149
+ "overflow": null,
150
+ "overflow_x": null,
151
+ "overflow_y": null,
152
+ "padding": null,
153
+ "right": null,
154
+ "top": null,
155
+ "visibility": null,
156
+ "width": null
157
+ }
158
+ },
159
+ "553d09bfb9bc4ceeb0882d5bd6061514": {
160
+ "model_module": "@jupyter-widgets/base",
161
+ "model_name": "LayoutModel",
162
+ "model_module_version": "1.2.0",
163
+ "state": {
164
+ "_model_module": "@jupyter-widgets/base",
165
+ "_model_module_version": "1.2.0",
166
+ "_model_name": "LayoutModel",
167
+ "_view_count": null,
168
+ "_view_module": "@jupyter-widgets/base",
169
+ "_view_module_version": "1.2.0",
170
+ "_view_name": "LayoutView",
171
+ "align_content": null,
172
+ "align_items": null,
173
+ "align_self": null,
174
+ "border": null,
175
+ "bottom": null,
176
+ "display": null,
177
+ "flex": null,
178
+ "flex_flow": null,
179
+ "grid_area": null,
180
+ "grid_auto_columns": null,
181
+ "grid_auto_flow": null,
182
+ "grid_auto_rows": null,
183
+ "grid_column": null,
184
+ "grid_gap": null,
185
+ "grid_row": null,
186
+ "grid_template_areas": null,
187
+ "grid_template_columns": null,
188
+ "grid_template_rows": null,
189
+ "height": null,
190
+ "justify_content": null,
191
+ "justify_items": null,
192
+ "left": null,
193
+ "margin": null,
194
+ "max_height": null,
195
+ "max_width": null,
196
+ "min_height": null,
197
+ "min_width": null,
198
+ "object_fit": null,
199
+ "object_position": null,
200
+ "order": null,
201
+ "overflow": null,
202
+ "overflow_x": null,
203
+ "overflow_y": null,
204
+ "padding": null,
205
+ "right": null,
206
+ "top": null,
207
+ "visibility": null,
208
+ "width": null
209
+ }
210
+ },
211
+ "13d692f33c85490aaf761d9444f6c145": {
212
+ "model_module": "@jupyter-widgets/controls",
213
+ "model_name": "DescriptionStyleModel",
214
+ "model_module_version": "1.5.0",
215
+ "state": {
216
+ "_model_module": "@jupyter-widgets/controls",
217
+ "_model_module_version": "1.5.0",
218
+ "_model_name": "DescriptionStyleModel",
219
+ "_view_count": null,
220
+ "_view_module": "@jupyter-widgets/base",
221
+ "_view_module_version": "1.2.0",
222
+ "_view_name": "StyleView",
223
+ "description_width": ""
224
+ }
225
+ },
226
+ "a3f4c5eb33a64fe895d19a5a52426000": {
227
+ "model_module": "@jupyter-widgets/base",
228
+ "model_name": "LayoutModel",
229
+ "model_module_version": "1.2.0",
230
+ "state": {
231
+ "_model_module": "@jupyter-widgets/base",
232
+ "_model_module_version": "1.2.0",
233
+ "_model_name": "LayoutModel",
234
+ "_view_count": null,
235
+ "_view_module": "@jupyter-widgets/base",
236
+ "_view_module_version": "1.2.0",
237
+ "_view_name": "LayoutView",
238
+ "align_content": null,
239
+ "align_items": null,
240
+ "align_self": null,
241
+ "border": null,
242
+ "bottom": null,
243
+ "display": null,
244
+ "flex": null,
245
+ "flex_flow": null,
246
+ "grid_area": null,
247
+ "grid_auto_columns": null,
248
+ "grid_auto_flow": null,
249
+ "grid_auto_rows": null,
250
+ "grid_column": null,
251
+ "grid_gap": null,
252
+ "grid_row": null,
253
+ "grid_template_areas": null,
254
+ "grid_template_columns": null,
255
+ "grid_template_rows": null,
256
+ "height": null,
257
+ "justify_content": null,
258
+ "justify_items": null,
259
+ "left": null,
260
+ "margin": null,
261
+ "max_height": null,
262
+ "max_width": null,
263
+ "min_height": null,
264
+ "min_width": null,
265
+ "object_fit": null,
266
+ "object_position": null,
267
+ "order": null,
268
+ "overflow": null,
269
+ "overflow_x": null,
270
+ "overflow_y": null,
271
+ "padding": null,
272
+ "right": null,
273
+ "top": null,
274
+ "visibility": null,
275
+ "width": null
276
+ }
277
+ },
278
+ "5b0afc9227e6437ca4ca7a4929f47d7a": {
279
+ "model_module": "@jupyter-widgets/controls",
280
+ "model_name": "ProgressStyleModel",
281
+ "model_module_version": "1.5.0",
282
+ "state": {
283
+ "_model_module": "@jupyter-widgets/controls",
284
+ "_model_module_version": "1.5.0",
285
+ "_model_name": "ProgressStyleModel",
286
+ "_view_count": null,
287
+ "_view_module": "@jupyter-widgets/base",
288
+ "_view_module_version": "1.2.0",
289
+ "_view_name": "StyleView",
290
+ "bar_color": null,
291
+ "description_width": ""
292
+ }
293
+ },
294
+ "39e87e82d7604abfa03879003898259f": {
295
+ "model_module": "@jupyter-widgets/base",
296
+ "model_name": "LayoutModel",
297
+ "model_module_version": "1.2.0",
298
+ "state": {
299
+ "_model_module": "@jupyter-widgets/base",
300
+ "_model_module_version": "1.2.0",
301
+ "_model_name": "LayoutModel",
302
+ "_view_count": null,
303
+ "_view_module": "@jupyter-widgets/base",
304
+ "_view_module_version": "1.2.0",
305
+ "_view_name": "LayoutView",
306
+ "align_content": null,
307
+ "align_items": null,
308
+ "align_self": null,
309
+ "border": null,
310
+ "bottom": null,
311
+ "display": null,
312
+ "flex": null,
313
+ "flex_flow": null,
314
+ "grid_area": null,
315
+ "grid_auto_columns": null,
316
+ "grid_auto_flow": null,
317
+ "grid_auto_rows": null,
318
+ "grid_column": null,
319
+ "grid_gap": null,
320
+ "grid_row": null,
321
+ "grid_template_areas": null,
322
+ "grid_template_columns": null,
323
+ "grid_template_rows": null,
324
+ "height": null,
325
+ "justify_content": null,
326
+ "justify_items": null,
327
+ "left": null,
328
+ "margin": null,
329
+ "max_height": null,
330
+ "max_width": null,
331
+ "min_height": null,
332
+ "min_width": null,
333
+ "object_fit": null,
334
+ "object_position": null,
335
+ "order": null,
336
+ "overflow": null,
337
+ "overflow_x": null,
338
+ "overflow_y": null,
339
+ "padding": null,
340
+ "right": null,
341
+ "top": null,
342
+ "visibility": null,
343
+ "width": null
344
+ }
345
+ },
346
+ "e876a66432c24d39a4630601d0f4ce93": {
347
+ "model_module": "@jupyter-widgets/controls",
348
+ "model_name": "DescriptionStyleModel",
349
+ "model_module_version": "1.5.0",
350
+ "state": {
351
+ "_model_module": "@jupyter-widgets/controls",
352
+ "_model_module_version": "1.5.0",
353
+ "_model_name": "DescriptionStyleModel",
354
+ "_view_count": null,
355
+ "_view_module": "@jupyter-widgets/base",
356
+ "_view_module_version": "1.2.0",
357
+ "_view_name": "StyleView",
358
+ "description_width": ""
359
+ }
360
+ },
361
+ "f67382c8ddf248c4b4eeb1b596284917": {
362
+ "model_module": "@jupyter-widgets/controls",
363
+ "model_name": "HBoxModel",
364
+ "model_module_version": "1.5.0",
365
+ "state": {
366
+ "_dom_classes": [],
367
+ "_model_module": "@jupyter-widgets/controls",
368
+ "_model_module_version": "1.5.0",
369
+ "_model_name": "HBoxModel",
370
+ "_view_count": null,
371
+ "_view_module": "@jupyter-widgets/controls",
372
+ "_view_module_version": "1.5.0",
373
+ "_view_name": "HBoxView",
374
+ "box_style": "",
375
+ "children": [
376
+ "IPY_MODEL_61ebc39888444d448f624f1ae848646a",
377
+ "IPY_MODEL_a692807d09ba4ea89bfdae50821ee518",
378
+ "IPY_MODEL_7fbeb4bc3ea743168a1816b0021e092b"
379
+ ],
380
+ "layout": "IPY_MODEL_ac3c4e3c9b0c4703b5a9148a70c23e21"
381
+ }
382
+ },
383
+ "61ebc39888444d448f624f1ae848646a": {
384
+ "model_module": "@jupyter-widgets/controls",
385
+ "model_name": "HTMLModel",
386
+ "model_module_version": "1.5.0",
387
+ "state": {
388
+ "_dom_classes": [],
389
+ "_model_module": "@jupyter-widgets/controls",
390
+ "_model_module_version": "1.5.0",
391
+ "_model_name": "HTMLModel",
392
+ "_view_count": null,
393
+ "_view_module": "@jupyter-widgets/controls",
394
+ "_view_module_version": "1.5.0",
395
+ "_view_name": "HTMLView",
396
+ "description": "",
397
+ "description_tooltip": null,
398
+ "layout": "IPY_MODEL_165ccb061bb843a4b44896df7b4d15b0",
399
+ "placeholder": "​",
400
+ "style": "IPY_MODEL_f1a9031e7c3445ee80308888d28f2d66",
401
+ "value": "Generating embeddings: 100%"
402
+ }
403
+ },
404
+ "a692807d09ba4ea89bfdae50821ee518": {
405
+ "model_module": "@jupyter-widgets/controls",
406
+ "model_name": "FloatProgressModel",
407
+ "model_module_version": "1.5.0",
408
+ "state": {
409
+ "_dom_classes": [],
410
+ "_model_module": "@jupyter-widgets/controls",
411
+ "_model_module_version": "1.5.0",
412
+ "_model_name": "FloatProgressModel",
413
+ "_view_count": null,
414
+ "_view_module": "@jupyter-widgets/controls",
415
+ "_view_module_version": "1.5.0",
416
+ "_view_name": "ProgressView",
417
+ "bar_style": "success",
418
+ "description": "",
419
+ "description_tooltip": null,
420
+ "layout": "IPY_MODEL_b15acb0416594d44a41df804e42b4cdf",
421
+ "max": 108,
422
+ "min": 0,
423
+ "orientation": "horizontal",
424
+ "style": "IPY_MODEL_4aeb5362822f490aa6cff491dded8111",
425
+ "value": 108
426
+ }
427
+ },
428
+ "7fbeb4bc3ea743168a1816b0021e092b": {
429
+ "model_module": "@jupyter-widgets/controls",
430
+ "model_name": "HTMLModel",
431
+ "model_module_version": "1.5.0",
432
+ "state": {
433
+ "_dom_classes": [],
434
+ "_model_module": "@jupyter-widgets/controls",
435
+ "_model_module_version": "1.5.0",
436
+ "_model_name": "HTMLModel",
437
+ "_view_count": null,
438
+ "_view_module": "@jupyter-widgets/controls",
439
+ "_view_module_version": "1.5.0",
440
+ "_view_name": "HTMLView",
441
+ "description": "",
442
+ "description_tooltip": null,
443
+ "layout": "IPY_MODEL_faa803d16fc74ddcbb003485a506569c",
444
+ "placeholder": "​",
445
+ "style": "IPY_MODEL_4f09405744b04ba79ecb18398b49a389",
446
+ "value": " 108/108 [00:06<00:00, 23.93it/s]"
447
+ }
448
+ },
449
+ "ac3c4e3c9b0c4703b5a9148a70c23e21": {
450
+ "model_module": "@jupyter-widgets/base",
451
+ "model_name": "LayoutModel",
452
+ "model_module_version": "1.2.0",
453
+ "state": {
454
+ "_model_module": "@jupyter-widgets/base",
455
+ "_model_module_version": "1.2.0",
456
+ "_model_name": "LayoutModel",
457
+ "_view_count": null,
458
+ "_view_module": "@jupyter-widgets/base",
459
+ "_view_module_version": "1.2.0",
460
+ "_view_name": "LayoutView",
461
+ "align_content": null,
462
+ "align_items": null,
463
+ "align_self": null,
464
+ "border": null,
465
+ "bottom": null,
466
+ "display": null,
467
+ "flex": null,
468
+ "flex_flow": null,
469
+ "grid_area": null,
470
+ "grid_auto_columns": null,
471
+ "grid_auto_flow": null,
472
+ "grid_auto_rows": null,
473
+ "grid_column": null,
474
+ "grid_gap": null,
475
+ "grid_row": null,
476
+ "grid_template_areas": null,
477
+ "grid_template_columns": null,
478
+ "grid_template_rows": null,
479
+ "height": null,
480
+ "justify_content": null,
481
+ "justify_items": null,
482
+ "left": null,
483
+ "margin": null,
484
+ "max_height": null,
485
+ "max_width": null,
486
+ "min_height": null,
487
+ "min_width": null,
488
+ "object_fit": null,
489
+ "object_position": null,
490
+ "order": null,
491
+ "overflow": null,
492
+ "overflow_x": null,
493
+ "overflow_y": null,
494
+ "padding": null,
495
+ "right": null,
496
+ "top": null,
497
+ "visibility": null,
498
+ "width": null
499
+ }
500
+ },
501
+ "165ccb061bb843a4b44896df7b4d15b0": {
502
+ "model_module": "@jupyter-widgets/base",
503
+ "model_name": "LayoutModel",
504
+ "model_module_version": "1.2.0",
505
+ "state": {
506
+ "_model_module": "@jupyter-widgets/base",
507
+ "_model_module_version": "1.2.0",
508
+ "_model_name": "LayoutModel",
509
+ "_view_count": null,
510
+ "_view_module": "@jupyter-widgets/base",
511
+ "_view_module_version": "1.2.0",
512
+ "_view_name": "LayoutView",
513
+ "align_content": null,
514
+ "align_items": null,
515
+ "align_self": null,
516
+ "border": null,
517
+ "bottom": null,
518
+ "display": null,
519
+ "flex": null,
520
+ "flex_flow": null,
521
+ "grid_area": null,
522
+ "grid_auto_columns": null,
523
+ "grid_auto_flow": null,
524
+ "grid_auto_rows": null,
525
+ "grid_column": null,
526
+ "grid_gap": null,
527
+ "grid_row": null,
528
+ "grid_template_areas": null,
529
+ "grid_template_columns": null,
530
+ "grid_template_rows": null,
531
+ "height": null,
532
+ "justify_content": null,
533
+ "justify_items": null,
534
+ "left": null,
535
+ "margin": null,
536
+ "max_height": null,
537
+ "max_width": null,
538
+ "min_height": null,
539
+ "min_width": null,
540
+ "object_fit": null,
541
+ "object_position": null,
542
+ "order": null,
543
+ "overflow": null,
544
+ "overflow_x": null,
545
+ "overflow_y": null,
546
+ "padding": null,
547
+ "right": null,
548
+ "top": null,
549
+ "visibility": null,
550
+ "width": null
551
+ }
552
+ },
553
+ "f1a9031e7c3445ee80308888d28f2d66": {
554
+ "model_module": "@jupyter-widgets/controls",
555
+ "model_name": "DescriptionStyleModel",
556
+ "model_module_version": "1.5.0",
557
+ "state": {
558
+ "_model_module": "@jupyter-widgets/controls",
559
+ "_model_module_version": "1.5.0",
560
+ "_model_name": "DescriptionStyleModel",
561
+ "_view_count": null,
562
+ "_view_module": "@jupyter-widgets/base",
563
+ "_view_module_version": "1.2.0",
564
+ "_view_name": "StyleView",
565
+ "description_width": ""
566
+ }
567
+ },
568
+ "b15acb0416594d44a41df804e42b4cdf": {
569
+ "model_module": "@jupyter-widgets/base",
570
+ "model_name": "LayoutModel",
571
+ "model_module_version": "1.2.0",
572
+ "state": {
573
+ "_model_module": "@jupyter-widgets/base",
574
+ "_model_module_version": "1.2.0",
575
+ "_model_name": "LayoutModel",
576
+ "_view_count": null,
577
+ "_view_module": "@jupyter-widgets/base",
578
+ "_view_module_version": "1.2.0",
579
+ "_view_name": "LayoutView",
580
+ "align_content": null,
581
+ "align_items": null,
582
+ "align_self": null,
583
+ "border": null,
584
+ "bottom": null,
585
+ "display": null,
586
+ "flex": null,
587
+ "flex_flow": null,
588
+ "grid_area": null,
589
+ "grid_auto_columns": null,
590
+ "grid_auto_flow": null,
591
+ "grid_auto_rows": null,
592
+ "grid_column": null,
593
+ "grid_gap": null,
594
+ "grid_row": null,
595
+ "grid_template_areas": null,
596
+ "grid_template_columns": null,
597
+ "grid_template_rows": null,
598
+ "height": null,
599
+ "justify_content": null,
600
+ "justify_items": null,
601
+ "left": null,
602
+ "margin": null,
603
+ "max_height": null,
604
+ "max_width": null,
605
+ "min_height": null,
606
+ "min_width": null,
607
+ "object_fit": null,
608
+ "object_position": null,
609
+ "order": null,
610
+ "overflow": null,
611
+ "overflow_x": null,
612
+ "overflow_y": null,
613
+ "padding": null,
614
+ "right": null,
615
+ "top": null,
616
+ "visibility": null,
617
+ "width": null
618
+ }
619
+ },
620
+ "4aeb5362822f490aa6cff491dded8111": {
621
+ "model_module": "@jupyter-widgets/controls",
622
+ "model_name": "ProgressStyleModel",
623
+ "model_module_version": "1.5.0",
624
+ "state": {
625
+ "_model_module": "@jupyter-widgets/controls",
626
+ "_model_module_version": "1.5.0",
627
+ "_model_name": "ProgressStyleModel",
628
+ "_view_count": null,
629
+ "_view_module": "@jupyter-widgets/base",
630
+ "_view_module_version": "1.2.0",
631
+ "_view_name": "StyleView",
632
+ "bar_color": null,
633
+ "description_width": ""
634
+ }
635
+ },
636
+ "faa803d16fc74ddcbb003485a506569c": {
637
+ "model_module": "@jupyter-widgets/base",
638
+ "model_name": "LayoutModel",
639
+ "model_module_version": "1.2.0",
640
+ "state": {
641
+ "_model_module": "@jupyter-widgets/base",
642
+ "_model_module_version": "1.2.0",
643
+ "_model_name": "LayoutModel",
644
+ "_view_count": null,
645
+ "_view_module": "@jupyter-widgets/base",
646
+ "_view_module_version": "1.2.0",
647
+ "_view_name": "LayoutView",
648
+ "align_content": null,
649
+ "align_items": null,
650
+ "align_self": null,
651
+ "border": null,
652
+ "bottom": null,
653
+ "display": null,
654
+ "flex": null,
655
+ "flex_flow": null,
656
+ "grid_area": null,
657
+ "grid_auto_columns": null,
658
+ "grid_auto_flow": null,
659
+ "grid_auto_rows": null,
660
+ "grid_column": null,
661
+ "grid_gap": null,
662
+ "grid_row": null,
663
+ "grid_template_areas": null,
664
+ "grid_template_columns": null,
665
+ "grid_template_rows": null,
666
+ "height": null,
667
+ "justify_content": null,
668
+ "justify_items": null,
669
+ "left": null,
670
+ "margin": null,
671
+ "max_height": null,
672
+ "max_width": null,
673
+ "min_height": null,
674
+ "min_width": null,
675
+ "object_fit": null,
676
+ "object_position": null,
677
+ "order": null,
678
+ "overflow": null,
679
+ "overflow_x": null,
680
+ "overflow_y": null,
681
+ "padding": null,
682
+ "right": null,
683
+ "top": null,
684
+ "visibility": null,
685
+ "width": null
686
+ }
687
+ },
688
+ "4f09405744b04ba79ecb18398b49a389": {
689
+ "model_module": "@jupyter-widgets/controls",
690
+ "model_name": "DescriptionStyleModel",
691
+ "model_module_version": "1.5.0",
692
+ "state": {
693
+ "_model_module": "@jupyter-widgets/controls",
694
+ "_model_module_version": "1.5.0",
695
+ "_model_name": "DescriptionStyleModel",
696
+ "_view_count": null,
697
+ "_view_module": "@jupyter-widgets/base",
698
+ "_view_module_version": "1.2.0",
699
+ "_view_name": "StyleView",
700
+ "description_width": ""
701
+ }
702
+ },
703
+ "685e146910634868b154ba03885d8b4c": {
704
+ "model_module": "@jupyter-widgets/controls",
705
+ "model_name": "HBoxModel",
706
+ "model_module_version": "1.5.0",
707
+ "state": {
708
+ "_dom_classes": [],
709
+ "_model_module": "@jupyter-widgets/controls",
710
+ "_model_module_version": "1.5.0",
711
+ "_model_name": "HBoxModel",
712
+ "_view_count": null,
713
+ "_view_module": "@jupyter-widgets/controls",
714
+ "_view_module_version": "1.5.0",
715
+ "_view_name": "HBoxView",
716
+ "box_style": "",
717
+ "children": [
718
+ "IPY_MODEL_fefe61069a1a416cbb512e1f006c82b0",
719
+ "IPY_MODEL_3cac12c147134eb4b71561f17345712d",
720
+ "IPY_MODEL_810b0e9e274a433892f87040283b4db9"
721
+ ],
722
+ "layout": "IPY_MODEL_11629aeef5a146e79869fded9f603d6d"
723
+ }
724
+ },
725
+ "fefe61069a1a416cbb512e1f006c82b0": {
726
+ "model_module": "@jupyter-widgets/controls",
727
+ "model_name": "HTMLModel",
728
+ "model_module_version": "1.5.0",
729
+ "state": {
730
+ "_dom_classes": [],
731
+ "_model_module": "@jupyter-widgets/controls",
732
+ "_model_module_version": "1.5.0",
733
+ "_model_name": "HTMLModel",
734
+ "_view_count": null,
735
+ "_view_module": "@jupyter-widgets/controls",
736
+ "_view_module_version": "1.5.0",
737
+ "_view_name": "HTMLView",
738
+ "description": "",
739
+ "description_tooltip": null,
740
+ "layout": "IPY_MODEL_dfe6494357d040ac8d51d03069822e41",
741
+ "placeholder": "​",
742
+ "style": "IPY_MODEL_190de89f93d048658ee4788ee4af4418",
743
+ "value": "Parsing nodes: 100%"
744
+ }
745
+ },
746
+ "3cac12c147134eb4b71561f17345712d": {
747
+ "model_module": "@jupyter-widgets/controls",
748
+ "model_name": "FloatProgressModel",
749
+ "model_module_version": "1.5.0",
750
+ "state": {
751
+ "_dom_classes": [],
752
+ "_model_module": "@jupyter-widgets/controls",
753
+ "_model_module_version": "1.5.0",
754
+ "_model_name": "FloatProgressModel",
755
+ "_view_count": null,
756
+ "_view_module": "@jupyter-widgets/controls",
757
+ "_view_module_version": "1.5.0",
758
+ "_view_name": "ProgressView",
759
+ "bar_style": "success",
760
+ "description": "",
761
+ "description_tooltip": null,
762
+ "layout": "IPY_MODEL_835fd209022c4086b509bf42084243b2",
763
+ "max": 14,
764
+ "min": 0,
765
+ "orientation": "horizontal",
766
+ "style": "IPY_MODEL_4b27da15dad34e839a4a02dab06d3e5a",
767
+ "value": 14
768
+ }
769
+ },
770
+ "810b0e9e274a433892f87040283b4db9": {
771
+ "model_module": "@jupyter-widgets/controls",
772
+ "model_name": "HTMLModel",
773
+ "model_module_version": "1.5.0",
774
+ "state": {
775
+ "_dom_classes": [],
776
+ "_model_module": "@jupyter-widgets/controls",
777
+ "_model_module_version": "1.5.0",
778
+ "_model_name": "HTMLModel",
779
+ "_view_count": null,
780
+ "_view_module": "@jupyter-widgets/controls",
781
+ "_view_module_version": "1.5.0",
782
+ "_view_name": "HTMLView",
783
+ "description": "",
784
+ "description_tooltip": null,
785
+ "layout": "IPY_MODEL_afa339155f4b4ffaa5fc70457b6b7a69",
786
+ "placeholder": "​",
787
+ "style": "IPY_MODEL_e8d0668d65dd4743b25c1bc74e1d8057",
788
+ "value": " 14/14 [00:00<00:00, 20.68it/s]"
789
+ }
790
+ },
791
+ "11629aeef5a146e79869fded9f603d6d": {
792
+ "model_module": "@jupyter-widgets/base",
793
+ "model_name": "LayoutModel",
794
+ "model_module_version": "1.2.0",
795
+ "state": {
796
+ "_model_module": "@jupyter-widgets/base",
797
+ "_model_module_version": "1.2.0",
798
+ "_model_name": "LayoutModel",
799
+ "_view_count": null,
800
+ "_view_module": "@jupyter-widgets/base",
801
+ "_view_module_version": "1.2.0",
802
+ "_view_name": "LayoutView",
803
+ "align_content": null,
804
+ "align_items": null,
805
+ "align_self": null,
806
+ "border": null,
807
+ "bottom": null,
808
+ "display": null,
809
+ "flex": null,
810
+ "flex_flow": null,
811
+ "grid_area": null,
812
+ "grid_auto_columns": null,
813
+ "grid_auto_flow": null,
814
+ "grid_auto_rows": null,
815
+ "grid_column": null,
816
+ "grid_gap": null,
817
+ "grid_row": null,
818
+ "grid_template_areas": null,
819
+ "grid_template_columns": null,
820
+ "grid_template_rows": null,
821
+ "height": null,
822
+ "justify_content": null,
823
+ "justify_items": null,
824
+ "left": null,
825
+ "margin": null,
826
+ "max_height": null,
827
+ "max_width": null,
828
+ "min_height": null,
829
+ "min_width": null,
830
+ "object_fit": null,
831
+ "object_position": null,
832
+ "order": null,
833
+ "overflow": null,
834
+ "overflow_x": null,
835
+ "overflow_y": null,
836
+ "padding": null,
837
+ "right": null,
838
+ "top": null,
839
+ "visibility": null,
840
+ "width": null
841
+ }
842
+ },
843
+ "dfe6494357d040ac8d51d03069822e41": {
844
+ "model_module": "@jupyter-widgets/base",
845
+ "model_name": "LayoutModel",
846
+ "model_module_version": "1.2.0",
847
+ "state": {
848
+ "_model_module": "@jupyter-widgets/base",
849
+ "_model_module_version": "1.2.0",
850
+ "_model_name": "LayoutModel",
851
+ "_view_count": null,
852
+ "_view_module": "@jupyter-widgets/base",
853
+ "_view_module_version": "1.2.0",
854
+ "_view_name": "LayoutView",
855
+ "align_content": null,
856
+ "align_items": null,
857
+ "align_self": null,
858
+ "border": null,
859
+ "bottom": null,
860
+ "display": null,
861
+ "flex": null,
862
+ "flex_flow": null,
863
+ "grid_area": null,
864
+ "grid_auto_columns": null,
865
+ "grid_auto_flow": null,
866
+ "grid_auto_rows": null,
867
+ "grid_column": null,
868
+ "grid_gap": null,
869
+ "grid_row": null,
870
+ "grid_template_areas": null,
871
+ "grid_template_columns": null,
872
+ "grid_template_rows": null,
873
+ "height": null,
874
+ "justify_content": null,
875
+ "justify_items": null,
876
+ "left": null,
877
+ "margin": null,
878
+ "max_height": null,
879
+ "max_width": null,
880
+ "min_height": null,
881
+ "min_width": null,
882
+ "object_fit": null,
883
+ "object_position": null,
884
+ "order": null,
885
+ "overflow": null,
886
+ "overflow_x": null,
887
+ "overflow_y": null,
888
+ "padding": null,
889
+ "right": null,
890
+ "top": null,
891
+ "visibility": null,
892
+ "width": null
893
+ }
894
+ },
895
+ "190de89f93d048658ee4788ee4af4418": {
896
+ "model_module": "@jupyter-widgets/controls",
897
+ "model_name": "DescriptionStyleModel",
898
+ "model_module_version": "1.5.0",
899
+ "state": {
900
+ "_model_module": "@jupyter-widgets/controls",
901
+ "_model_module_version": "1.5.0",
902
+ "_model_name": "DescriptionStyleModel",
903
+ "_view_count": null,
904
+ "_view_module": "@jupyter-widgets/base",
905
+ "_view_module_version": "1.2.0",
906
+ "_view_name": "StyleView",
907
+ "description_width": ""
908
+ }
909
+ },
910
+ "835fd209022c4086b509bf42084243b2": {
911
+ "model_module": "@jupyter-widgets/base",
912
+ "model_name": "LayoutModel",
913
+ "model_module_version": "1.2.0",
914
+ "state": {
915
+ "_model_module": "@jupyter-widgets/base",
916
+ "_model_module_version": "1.2.0",
917
+ "_model_name": "LayoutModel",
918
+ "_view_count": null,
919
+ "_view_module": "@jupyter-widgets/base",
920
+ "_view_module_version": "1.2.0",
921
+ "_view_name": "LayoutView",
922
+ "align_content": null,
923
+ "align_items": null,
924
+ "align_self": null,
925
+ "border": null,
926
+ "bottom": null,
927
+ "display": null,
928
+ "flex": null,
929
+ "flex_flow": null,
930
+ "grid_area": null,
931
+ "grid_auto_columns": null,
932
+ "grid_auto_flow": null,
933
+ "grid_auto_rows": null,
934
+ "grid_column": null,
935
+ "grid_gap": null,
936
+ "grid_row": null,
937
+ "grid_template_areas": null,
938
+ "grid_template_columns": null,
939
+ "grid_template_rows": null,
940
+ "height": null,
941
+ "justify_content": null,
942
+ "justify_items": null,
943
+ "left": null,
944
+ "margin": null,
945
+ "max_height": null,
946
+ "max_width": null,
947
+ "min_height": null,
948
+ "min_width": null,
949
+ "object_fit": null,
950
+ "object_position": null,
951
+ "order": null,
952
+ "overflow": null,
953
+ "overflow_x": null,
954
+ "overflow_y": null,
955
+ "padding": null,
956
+ "right": null,
957
+ "top": null,
958
+ "visibility": null,
959
+ "width": null
960
+ }
961
+ },
962
+ "4b27da15dad34e839a4a02dab06d3e5a": {
963
+ "model_module": "@jupyter-widgets/controls",
964
+ "model_name": "ProgressStyleModel",
965
+ "model_module_version": "1.5.0",
966
+ "state": {
967
+ "_model_module": "@jupyter-widgets/controls",
968
+ "_model_module_version": "1.5.0",
969
+ "_model_name": "ProgressStyleModel",
970
+ "_view_count": null,
971
+ "_view_module": "@jupyter-widgets/base",
972
+ "_view_module_version": "1.2.0",
973
+ "_view_name": "StyleView",
974
+ "bar_color": null,
975
+ "description_width": ""
976
+ }
977
+ },
978
+ "afa339155f4b4ffaa5fc70457b6b7a69": {
979
+ "model_module": "@jupyter-widgets/base",
980
+ "model_name": "LayoutModel",
981
+ "model_module_version": "1.2.0",
982
+ "state": {
983
+ "_model_module": "@jupyter-widgets/base",
984
+ "_model_module_version": "1.2.0",
985
+ "_model_name": "LayoutModel",
986
+ "_view_count": null,
987
+ "_view_module": "@jupyter-widgets/base",
988
+ "_view_module_version": "1.2.0",
989
+ "_view_name": "LayoutView",
990
+ "align_content": null,
991
+ "align_items": null,
992
+ "align_self": null,
993
+ "border": null,
994
+ "bottom": null,
995
+ "display": null,
996
+ "flex": null,
997
+ "flex_flow": null,
998
+ "grid_area": null,
999
+ "grid_auto_columns": null,
1000
+ "grid_auto_flow": null,
1001
+ "grid_auto_rows": null,
1002
+ "grid_column": null,
1003
+ "grid_gap": null,
1004
+ "grid_row": null,
1005
+ "grid_template_areas": null,
1006
+ "grid_template_columns": null,
1007
+ "grid_template_rows": null,
1008
+ "height": null,
1009
+ "justify_content": null,
1010
+ "justify_items": null,
1011
+ "left": null,
1012
+ "margin": null,
1013
+ "max_height": null,
1014
+ "max_width": null,
1015
+ "min_height": null,
1016
+ "min_width": null,
1017
+ "object_fit": null,
1018
+ "object_position": null,
1019
+ "order": null,
1020
+ "overflow": null,
1021
+ "overflow_x": null,
1022
+ "overflow_y": null,
1023
+ "padding": null,
1024
+ "right": null,
1025
+ "top": null,
1026
+ "visibility": null,
1027
+ "width": null
1028
+ }
1029
+ },
1030
+ "e8d0668d65dd4743b25c1bc74e1d8057": {
1031
+ "model_module": "@jupyter-widgets/controls",
1032
+ "model_name": "DescriptionStyleModel",
1033
+ "model_module_version": "1.5.0",
1034
+ "state": {
1035
+ "_model_module": "@jupyter-widgets/controls",
1036
+ "_model_module_version": "1.5.0",
1037
+ "_model_name": "DescriptionStyleModel",
1038
+ "_view_count": null,
1039
+ "_view_module": "@jupyter-widgets/base",
1040
+ "_view_module_version": "1.2.0",
1041
+ "_view_name": "StyleView",
1042
+ "description_width": ""
1043
+ }
1044
+ }
1045
+ }
1046
+ }
1047
+ },
1048
+ "cells": [
1049
+ {
1050
+ "cell_type": "markdown",
1051
+ "metadata": {
1052
+ "id": "view-in-github",
1053
+ "colab_type": "text"
1054
+ },
1055
+ "source": [
1056
+ "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/05-Improve_Prompts_%2B_Add_Source.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
1057
+ ]
1058
+ },
1059
+ {
1060
+ "cell_type": "markdown",
1061
+ "source": [
1062
+ "# Install Packages and Setup Variables"
1063
+ ],
1064
+ "metadata": {
1065
+ "id": "5BGJ3fxhOk2V"
1066
+ }
1067
+ },
1068
+ {
1069
+ "cell_type": "code",
1070
+ "execution_count": null,
1071
+ "metadata": {
1072
+ "id": "QPJzr-I9XQ7l",
1073
+ "colab": {
1074
+ "base_uri": "https://localhost:8080/"
1075
+ },
1076
+ "outputId": "c31cde74-f2a8-4c1b-adce-a8cce4268ec2"
1077
+ },
1078
+ "outputs": [
1079
+ {
1080
+ "output_type": "stream",
1081
+ "name": "stdout",
1082
+ "text": [
1083
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m52.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1084
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1085
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.7/51.7 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1086
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m47.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1087
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.6/508.6 kB\u001b[0m \u001b[31m23.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1088
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.9/79.9 MB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1089
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1090
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.9/147.9 kB\u001b[0m \u001b[31m15.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1091
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1092
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m79.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1093
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m71.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1094
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1095
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.7/60.7 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1096
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1097
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m61.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1098
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m61.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1099
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1100
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.6/105.6 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1101
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1102
+ "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
1103
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
1104
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
1105
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m698.9/698.9 kB\u001b[0m \u001b[31m52.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1106
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m64.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1107
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.6/67.6 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1108
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1109
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1110
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1111
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1112
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1113
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1114
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m59.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1115
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m70.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1116
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1117
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1118
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1119
+ "\u001b[?25h Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
1120
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
1121
+ "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
1122
+ "\u001b[0m"
1123
+ ]
1124
+ }
1125
+ ],
1126
+ "source": [
1127
+ "!pip install -q llama-index==0.9.21 openai==1.6.0 cohere==4.39 tiktoken==0.5.2 chromadb==0.4.21 kaleido==0.2.1 python-multipart==0.0.6 html2text==2020.1.16"
1128
+ ]
1129
+ },
1130
+ {
1131
+ "cell_type": "code",
1132
+ "source": [
1133
+ "import os\n",
1134
+ "\n",
1135
+ "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
1136
+ "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
1137
+ ],
1138
+ "metadata": {
1139
+ "id": "riuXwpSPcvWC"
1140
+ },
1141
+ "execution_count": null,
1142
+ "outputs": []
1143
+ },
1144
+ {
1145
+ "cell_type": "code",
1146
+ "source": [
1147
+ "import nest_asyncio\n",
1148
+ "\n",
1149
+ "nest_asyncio.apply()"
1150
+ ],
1151
+ "metadata": {
1152
+ "id": "km-KQOrgr3VB"
1153
+ },
1154
+ "execution_count": null,
1155
+ "outputs": []
1156
+ },
1157
+ {
1158
+ "cell_type": "markdown",
1159
+ "source": [
1160
+ "# Load a Model"
1161
+ ],
1162
+ "metadata": {
1163
+ "id": "Bkgi2OrYzF7q"
1164
+ }
1165
+ },
1166
+ {
1167
+ "cell_type": "code",
1168
+ "source": [
1169
+ "from llama_index.llms import OpenAI\n",
1170
+ "\n",
1171
+ "llm = OpenAI(temperature=0.9, model=\"gpt-3.5-turbo\", max_tokens=512)"
1172
+ ],
1173
+ "metadata": {
1174
+ "id": "9oGT6crooSSj"
1175
+ },
1176
+ "execution_count": null,
1177
+ "outputs": []
1178
+ },
1179
+ {
1180
+ "cell_type": "markdown",
1181
+ "source": [
1182
+ "# Create a Vector Store"
1183
+ ],
1184
+ "metadata": {
1185
+ "id": "0BwVuJXlzHVL"
1186
+ }
1187
+ },
1188
+ {
1189
+ "cell_type": "code",
1190
+ "source": [
1191
+ "import chromadb\n",
1192
+ "\n",
1193
+ "# create client and a new collection\n",
1194
+ "# chromadb.PersistentClient saves data on disk; chromadb.EphemeralClient keeps it in memory only.\n",
1195
+ "chroma_client = chromadb.PersistentClient(path=\"./mini-llama-articles\")\n",
1196
+ "chroma_collection = chroma_client.create_collection(\"mini-llama-articles\")"
1197
+ ],
1198
+ "metadata": {
1199
+ "id": "SQP87lHczHKc"
1200
+ },
1201
+ "execution_count": null,
1202
+ "outputs": []
1203
+ },
1204
+ {
1205
+ "cell_type": "code",
1206
+ "source": [
1207
+ "from llama_index.vector_stores import ChromaVectorStore\n",
1208
+ "\n",
1209
+ "# Define a vector store object backed by the created Chroma collection.\n",
1210
+ "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
1211
+ ],
1212
+ "metadata": {
1213
+ "id": "zAaGcYMJzHAN"
1214
+ },
1215
+ "execution_count": null,
1216
+ "outputs": []
1217
+ },
1218
+ {
1219
+ "cell_type": "markdown",
1220
+ "source": [
1221
+ "# Load the Dataset (CSV)"
1222
+ ],
1223
+ "metadata": {
1224
+ "id": "I9JbAzFcjkpn"
1225
+ }
1226
+ },
1227
+ {
1228
+ "cell_type": "markdown",
1229
+ "source": [
1230
+ "## Download"
1231
+ ],
1232
+ "metadata": {
1233
+ "id": "_Tif8-JoRH68"
1234
+ }
1235
+ },
1236
+ {
1237
+ "cell_type": "markdown",
1238
+ "source": [
1239
+ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model."
1240
+ ],
1241
+ "metadata": {
1242
+ "id": "4fQaa1LN1mXL"
1243
+ }
1244
+ },
1245
+ {
1246
+ "cell_type": "code",
1247
+ "source": [
1248
+ "!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
1249
+ ],
1250
+ "metadata": {
1251
+ "colab": {
1252
+ "base_uri": "https://localhost:8080/"
1253
+ },
1254
+ "id": "fQtpDvUzKNzI",
1255
+ "outputId": "9a62a730-6fe0-4542-cfd1-cbb1f84d889e"
1256
+ },
1257
+ "execution_count": null,
1258
+ "outputs": [
1259
+ {
1260
+ "output_type": "stream",
1261
+ "name": "stdout",
1262
+ "text": [
1263
+ "--2024-02-01 14:53:37-- https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv\n",
1264
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
1265
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
1266
+ "HTTP request sent, awaiting response... 200 OK\n",
1267
+ "Length: 173646 (170K) [text/plain]\n",
1268
+ "Saving to: ‘mini-llama-articles.csv’\n",
1269
+ "\n",
1270
+ "\rmini-llama-articles 0%[ ] 0 --.-KB/s \rmini-llama-articles 100%[===================>] 169.58K --.-KB/s in 0.02s \n",
1271
+ "\n",
1272
+ "2024-02-01 14:53:37 (6.64 MB/s) - ‘mini-llama-articles.csv’ saved [173646/173646]\n",
1273
+ "\n"
1274
+ ]
1275
+ }
1276
+ ]
1277
+ },
1278
+ {
1279
+ "cell_type": "markdown",
1280
+ "source": [
1281
+ "## Load the Articles"
1282
+ ],
1283
+ "metadata": {
1284
+ "id": "zk-4alIxROo8"
1285
+ }
1286
+ },
1287
+ {
1288
+ "cell_type": "code",
1289
+ "source": [
1290
+ "import csv\n",
1291
+ "\n",
1292
+ "rows = []\n",
1293
+ "\n",
1294
+ "# Load the file as a CSV\n",
1295
+ "with open(\"./mini-llama-articles.csv\", mode=\"r\", encoding=\"utf-8\") as file:\n",
1296
+ " csv_reader = csv.reader(file)\n",
1297
+ "\n",
1298
+ " for idx, row in enumerate( csv_reader ):\n",
1299
+ " if idx == 0: continue; # Skip header row\n",
1300
+ " rows.append( row )\n",
1301
+ "\n",
1302
+ "# The number of rows (articles) in the dataset.\n",
1303
+ "len( rows )"
1304
+ ],
1305
+ "metadata": {
1306
+ "colab": {
1307
+ "base_uri": "https://localhost:8080/"
1308
+ },
1309
+ "id": "_WER5lt0N7c5",
1310
+ "outputId": "3abbb956-12ed-4663-ade9-eb45d0784ee1"
1311
+ },
1312
+ "execution_count": null,
1313
+ "outputs": [
1314
+ {
1315
+ "output_type": "execute_result",
1316
+ "data": {
1317
+ "text/plain": [
1318
+ "14"
1319
+ ]
1320
+ },
1321
+ "metadata": {},
1322
+ "execution_count": 27
1323
+ }
1324
+ ]
1325
+ },
1326
+ {
1327
+ "cell_type": "code",
1328
+ "source": [
1329
+ "rows[0][3]"
1330
+ ],
1331
+ "metadata": {
1332
+ "colab": {
1333
+ "base_uri": "https://localhost:8080/",
1334
+ "height": 35
1335
+ },
1336
+ "id": "NonYMN-Ihx1O",
1337
+ "outputId": "f6fc2829-8a5b-493e-ec97-e1b8d6b66c03"
1338
+ },
1339
+ "execution_count": null,
1340
+ "outputs": [
1341
+ {
1342
+ "output_type": "execute_result",
1343
+ "data": {
1344
+ "text/plain": [
1345
+ "'towards_ai'"
1346
+ ],
1347
+ "application/vnd.google.colaboratory.intrinsic+json": {
1348
+ "type": "string"
1349
+ }
1350
+ },
1351
+ "metadata": {},
1352
+ "execution_count": 34
1353
+ }
1354
+ ]
1355
+ },
1356
+ {
1357
+ "cell_type": "markdown",
1358
+ "source": [
1359
+ "# Convert to Document Objects"
1360
+ ],
1361
+ "metadata": {
1362
+ "id": "wxEStggPdxYs"
1363
+ }
1364
+ },
1365
+ {
1366
+ "cell_type": "code",
1367
+ "source": [
1368
+ "from llama_index import Document\n",
1369
+ "\n",
1370
+ "# Convert the article rows to Document objects so the LlamaIndex framework can process them.\n",
1371
+ "documents = [Document(text=row[1], metadata={\"title\": row[0], \"url\": row[2], \"source_name\": row[3]}) for row in rows]"
1372
+ ],
1373
+ "metadata": {
1374
+ "id": "lFvW_886dxKX"
1375
+ },
1376
+ "execution_count": null,
1377
+ "outputs": []
1378
+ },
1379
+ {
1380
+ "cell_type": "code",
1381
+ "source": [
1382
+ "len( documents )"
1383
+ ],
1384
+ "metadata": {
1385
+ "colab": {
1386
+ "base_uri": "https://localhost:8080/"
1387
+ },
1388
+ "id": "Njoc3XEVkKkf",
1389
+ "outputId": "02f05737-9dbf-4398-fe21-1f385eacbd13"
1390
+ },
1391
+ "execution_count": null,
1392
+ "outputs": [
1393
+ {
1394
+ "output_type": "execute_result",
1395
+ "data": {
1396
+ "text/plain": [
1397
+ "14"
1398
+ ]
1399
+ },
1400
+ "metadata": {},
1401
+ "execution_count": 44
1402
+ }
1403
+ ]
1404
+ },
1405
+ {
1406
+ "cell_type": "markdown",
1407
+ "source": [
1408
+ "# Transforming"
1409
+ ],
1410
+ "metadata": {
1411
+ "id": "S17g2RYOjmf2"
1412
+ }
1413
+ },
1414
+ {
1415
+ "cell_type": "code",
1416
+ "source": [
1417
+ "from llama_index.text_splitter import TokenTextSplitter\n",
1418
+ "\n",
1419
+ "text_splitter = TokenTextSplitter(\n",
1420
+ " separator=\" \", chunk_size=512, chunk_overlap=128\n",
1421
+ ")"
1422
+ ],
1423
+ "metadata": {
1424
+ "id": "STACTMUR1z9N"
1425
+ },
1426
+ "execution_count": null,
1427
+ "outputs": []
1428
+ },
1429
+ {
1430
+ "cell_type": "code",
1431
+ "source": [
1432
+ "from llama_index.extractors import (\n",
1433
+ " SummaryExtractor,\n",
1434
+ " QuestionsAnsweredExtractor,\n",
1435
+ " KeywordExtractor,\n",
1436
+ ")\n",
1437
+ "from llama_index.embeddings import OpenAIEmbedding\n",
1438
+ "from llama_index.ingestion import IngestionPipeline\n",
1439
+ "\n",
1440
+ "pipeline = IngestionPipeline(\n",
1441
+ " transformations=[\n",
1442
+ " text_splitter,\n",
1443
+ " QuestionsAnsweredExtractor(questions=3, llm=llm),\n",
1444
+ " SummaryExtractor(summaries=[\"prev\", \"self\"], llm=llm),\n",
1445
+ " KeywordExtractor(keywords=10, llm=llm),\n",
1446
+ " OpenAIEmbedding(),\n",
1447
+ " ],\n",
1448
+ " vector_store=vector_store\n",
1449
+ ")\n",
1450
+ "\n",
1451
+ "pipeline.run(documents=documents, show_progress=True)"
1452
+ ],
1453
+ "metadata": {
1454
+ "id": "CtdsIUQ81_hT",
1455
+ "colab": {
1456
+ "base_uri": "https://localhost:8080/",
1457
+ "height": 385,
1458
+ "referenced_widgets": [
1459
+ "88b29c392c7d403488f81903b2395dc1",
1460
+ "81d6771d5cc74dd49cfccc362d5a3c87",
1461
+ "1a021f3baf754210ac8696ecc555d968",
1462
+ "35d8b3afc94b4294ac0d5b090a49003c",
1463
+ "2980851a0f0141fcb4b54aba07366ad9",
1464
+ "553d09bfb9bc4ceeb0882d5bd6061514",
1465
+ "13d692f33c85490aaf761d9444f6c145",
1466
+ "a3f4c5eb33a64fe895d19a5a52426000",
1467
+ "5b0afc9227e6437ca4ca7a4929f47d7a",
1468
+ "39e87e82d7604abfa03879003898259f",
1469
+ "e876a66432c24d39a4630601d0f4ce93",
1470
+ "f67382c8ddf248c4b4eeb1b596284917",
1471
+ "61ebc39888444d448f624f1ae848646a",
1472
+ "a692807d09ba4ea89bfdae50821ee518",
1473
+ "7fbeb4bc3ea743168a1816b0021e092b",
1474
+ "ac3c4e3c9b0c4703b5a9148a70c23e21",
1475
+ "165ccb061bb843a4b44896df7b4d15b0",
1476
+ "f1a9031e7c3445ee80308888d28f2d66",
1477
+ "b15acb0416594d44a41df804e42b4cdf",
1478
+ "4aeb5362822f490aa6cff491dded8111",
1479
+ "faa803d16fc74ddcbb003485a506569c",
1480
+ "4f09405744b04ba79ecb18398b49a389"
1481
+ ]
1482
+ },
1483
+ "outputId": "b29154c5-8209-4d0e-b546-2e546e6ceeeb"
1484
+ },
1485
+ "execution_count": null,
1486
+ "outputs": [
1487
+ {
1488
+ "output_type": "display_data",
1489
+ "data": {
1490
+ "text/plain": [
1491
+ "Parsing nodes: 0%| | 0/14 [00:00<?, ?it/s]"
1492
+ ],
1493
+ "application/vnd.jupyter.widget-view+json": {
1494
+ "version_major": 2,
1495
+ "version_minor": 0,
1496
+ "model_id": "88b29c392c7d403488f81903b2395dc1"
1497
+ }
1498
+ },
1499
+ "metadata": {}
1500
+ },
1501
+ {
1502
+ "output_type": "stream",
1503
+ "name": "stdout",
1504
+ "text": [
1505
+ "464\n",
1506
+ "452\n",
1507
+ "457\n",
1508
+ "465\n",
1509
+ "448\n",
1510
+ "468\n",
1511
+ "434\n",
1512
+ "447\n",
1513
+ "455\n",
1514
+ "445\n",
1515
+ "449\n",
1516
+ "455\n",
1517
+ "431\n",
1518
+ "453\n"
1519
+ ]
1520
+ },
1521
+ {
1522
+ "output_type": "stream",
1523
+ "name": "stderr",
1524
+ "text": [
1525
+ "100%|██████████| 108/108 [00:48<00:00, 2.22it/s]\n",
1526
+ "100%|██████████| 108/108 [01:05<00:00, 1.65it/s]\n",
1527
+ "100%|██████████| 108/108 [00:48<00:00, 2.22it/s]\n"
1528
+ ]
1529
+ },
1530
+ {
1531
+ "output_type": "display_data",
1532
+ "data": {
1533
+ "text/plain": [
1534
+ "Generating embeddings: 0%| | 0/108 [00:00<?, ?it/s]"
1535
+ ],
1536
+ "application/vnd.jupyter.widget-view+json": {
1537
+ "version_major": 2,
1538
+ "version_minor": 0,
1539
+ "model_id": "f67382c8ddf248c4b4eeb1b596284917"
1540
+ }
1541
+ },
1542
+ "metadata": {}
1543
+ }
1544
+ ]
1545
+ },
1546
+ {
1547
+ "cell_type": "markdown",
1548
+ "source": [
1549
+ "# Load Indexes"
1550
+ ],
1551
+ "metadata": {
1552
+ "id": "EV0ll57p46Dc"
1553
+ }
1554
+ },
1555
+ {
1556
+ "cell_type": "code",
1557
+ "source": [
1558
+ "# Load the persisted Chroma collection as a vector store\n",
1559
+ "db = chromadb.PersistentClient(path=\"./mini-llama-articles\")\n",
1560
+ "chroma_collection = db.get_or_create_collection(\"mini-llama-articles\")\n",
1561
+ "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
1562
+ ],
1563
+ "metadata": {
1564
+ "id": "PS215gCGkGD-"
1565
+ },
1566
+ "execution_count": null,
1567
+ "outputs": []
1568
+ },
1569
+ {
1570
+ "cell_type": "code",
1571
+ "source": [
1572
+ "# Create your index\n",
1573
+ "from llama_index import VectorStoreIndex\n",
1574
+ "\n",
1575
+ "index = VectorStoreIndex.from_vector_store(vector_store)"
1576
+ ],
1577
+ "metadata": {
1578
+ "id": "HbT3-kRO4Qpt"
1579
+ },
1580
+ "execution_count": null,
1581
+ "outputs": []
1582
+ },
1583
+ {
1584
+ "cell_type": "code",
1585
+ "source": [
1586
+ "query_engine = index.as_query_engine()"
1587
+ ],
1588
+ "metadata": {
1589
+ "id": "sb61DWU84bHP"
1590
+ },
1591
+ "execution_count": null,
1592
+ "outputs": []
1593
+ },
1594
+ {
1595
+ "cell_type": "code",
1596
+ "source": [
1597
+ "res = query_engine.query(\"How many parameters LLaMA2 model has?\")"
1598
+ ],
1599
+ "metadata": {
1600
+ "id": "G32W2LMMCmnv"
1601
+ },
1602
+ "execution_count": null,
1603
+ "outputs": []
1604
+ },
1605
+ {
1606
+ "cell_type": "code",
1607
+ "source": [
1608
+ "res.response"
1609
+ ],
1610
+ "metadata": {
1611
+ "colab": {
1612
+ "base_uri": "https://localhost:8080/",
1613
+ "height": 35
1614
+ },
1615
+ "id": "obc20cU5Cxf2",
1616
+ "outputId": "fafff42a-b10d-47e9-8b7f-b0b8d256e31c"
1617
+ },
1618
+ "execution_count": null,
1619
+ "outputs": [
1620
+ {
1621
+ "output_type": "execute_result",
1622
+ "data": {
1623
+ "text/plain": [
1624
+ "'The LLaMA2 model has four different parameter sizes: 7 billion, 13 billion, 34 billion, and 70 billion.'"
1625
+ ],
1626
+ "application/vnd.google.colaboratory.intrinsic+json": {
1627
+ "type": "string"
1628
+ }
1629
+ },
1630
+ "metadata": {},
1631
+ "execution_count": 131
1632
+ }
1633
+ ]
1634
+ },
1635
+ {
1636
+ "cell_type": "code",
1637
+ "source": [
1638
+ "for src in res.source_nodes:\n",
1639
+ " print(\"Node ID\\t\", src.node_id)\n",
1640
+ " print(\"Title\\t\", src.metadata['title'])\n",
1641
+ " print(\"Text\\t\", src.text)\n",
1642
+ " print(\"Score\\t\", src.score)\n",
1643
+ " print(\"-_\"*20)"
1644
+ ],
1645
+ "metadata": {
1646
+ "colab": {
1647
+ "base_uri": "https://localhost:8080/"
1648
+ },
1649
+ "id": "oIAO-saJCzYe",
1650
+ "outputId": "7756d876-a474-42c3-e1a6-a4c042b8fbab"
1651
+ },
1652
+ "execution_count": null,
1653
+ "outputs": [
1654
+ {
1655
+ "output_type": "stream",
1656
+ "name": "stdout",
1657
+ "text": [
1658
+ "Node ID\t cccc8c69-1648-469b-8a2a-ea7f003dbe27\n",
1659
+ "Title\t Fine-Tuning a Llama-2 7B Model for Python Code Generation\n",
1660
+ "Text\t New Llama-2 model In mid-July, Meta released its new family of pre-trained and finetuned models called Llama-2, with an open source and commercial character to facilitate its use and expansion. The base model was released with a chat version and sizes 7B, 13B, and 70B. Together with the models, the corresponding papers were published describing their characteristics and relevant points of the learning process, which provide very interesting information on the subject. For pre-training, 40% more tokens were used, reaching 2T, the context length was doubled and the grouped-query attention (GQA) technique was applied to speed up inference on the heavier 70B model. On the standard transformer architecture, RMSNorm normalization, SwiGLU activation, and rotatory positional embedding are used, the context length reaches 4096 tokens, and an Adam optimizer is applied with a cosine learning rate schedule, a weight decay of 0.1 and gradient clipping. The dataset for tuning For our tuning process, we will take a dataset containing about 18,000 examples where the model is asked to build a Python code that solves a given task. This is an extraction of the original dataset [2], where only the Python language examples are selected. Each row contains the description of the task to be solved, an example of data input to the task if applicable, and the generated code fragment that solves the task is provided [3]. Creating the prompt To carry out an instruction fine-tuning, we must transform each one of our data examples as if it were an instruction, outlining its main sections as follows: Output: Fine-tuning the model To carry out this stage, we have used the Google Colab environment, where we have developed a notebook that allows us to run the training in an interactive way and also a Python script to run the training in unattended mode. 
For the first test runs, a T4 instance with a high RAM capacity is enough, but when it comes to running the whole dataset and epochs, we have opted to use an A100 instance in order to speed up the training and ensure that its execution time is reasonable. In order to be able to\n",
1661
+ "Score\t 0.768167879324151\n",
1662
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
1663
+ "Node ID\t 2b138886-4ff7-4c96-9d01-ea883d4f34ed\n",
1664
+ "Title\t Fine-Tuning a Llama-2 7B Model for Python Code Generation\n",
1665
+ "Text\t weights As we mention, we have trained \"modification weights\" on the base model, our final model requires merging the pretrained model and the adapters in a single model. You can find and download the model in my Hugging Face account edumunozsala/llama-27b-int4-python-code-20k. Give it a try! Inferencing or generating Python code And finally, we will show you how you can download the model from the Hugging Face Hub and call the model to generate an accurate result: Thanks to Maxime Labonne for an excellent article [9] and Philipp Schmid who provides an inspiring code [8]. Their articles are a must-read for everyone interested in Llama 2 and model fine-tuning. And it is all I have to mention, I hope you find useful this article and claps are welcome!! You can Follow me and Subscribe to my articles, or even connect to me via Linkedin. The code is available in my Github Repository. References [1] Llama-2 paper [2] Link to the original dataset in the Huggingface hub [3] Link to the used dataset in the Huggingface hub [4] Fine-tuning a GPT - LoRA by Chris Kuo/Dr. Dataman [5] Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, & Weizhu Chen. (2021). LoRA: Low-Rank Adaptation of Large Language Models. arXiv:2106.09685 [6]. QLoRa: Efficient Finetuning of QuantizedLLMs [7] Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning [8] Extended Guide: Instruction-tune Llama 2 by Philipp Schmid. [9] Fine-Tune Your Own Llama 2 Model in a Colab Notebook by Maxime Labonne [10]. My Github Repository\n",
1666
+ "Score\t 0.7586440479430622\n",
1667
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
1668
+ ]
1669
+ }
1670
+ ]
1671
+ },
1672
+ {
1673
+ "cell_type": "markdown",
1674
+ "source": [
1675
+ "# No Metadata"
1676
+ ],
1677
+ "metadata": {
1678
+ "id": "wvOhbZvl95di"
1679
+ }
1680
+ },
1681
+ {
1682
+ "cell_type": "code",
1683
+ "source": [
1684
+ "from llama_index import Document\n",
1685
+ "\n",
1686
+ "# Convert the article rows to Document objects so the LlamaIndex framework can process them.\n",
1687
+ "documents = [Document(text=row[1]) for row in rows]"
1688
+ ],
1689
+ "metadata": {
1690
+ "id": "y5w5ZPbR97iK"
1691
+ },
1692
+ "execution_count": null,
1693
+ "outputs": []
1694
+ },
1695
+ {
1696
+ "cell_type": "code",
1697
+ "source": [
1698
+ "from llama_index.text_splitter import TokenTextSplitter\n",
1699
+ "\n",
1700
+ "text_splitter = TokenTextSplitter(\n",
1701
+ " separator=\" \", chunk_size=512, chunk_overlap=128\n",
1702
+ ")"
1703
+ ],
1704
+ "metadata": {
1705
+ "id": "WzF8LYgH9-o0"
1706
+ },
1707
+ "execution_count": null,
1708
+ "outputs": []
1709
+ },
1710
+ {
1711
+ "cell_type": "code",
1712
+ "source": [
1713
+ "from llama_index.extractors import (\n",
1714
+ " SummaryExtractor,\n",
1715
+ " QuestionsAnsweredExtractor,\n",
1716
+ " KeywordExtractor,\n",
1717
+ ")\n",
1718
+ "from llama_index.embeddings import OpenAIEmbedding\n",
1719
+ "from llama_index.ingestion import IngestionPipeline\n",
1720
+ "\n",
1721
+ "pipeline = IngestionPipeline(\n",
1722
+ " transformations=[\n",
1723
+ " text_splitter\n",
1724
+ " ]\n",
1725
+ ")\n",
1726
+ "\n",
1727
+ "nodes = pipeline.run(documents=documents, show_progress=True)"
1728
+ ],
1729
+ "metadata": {
1730
+ "colab": {
1731
+ "base_uri": "https://localhost:8080/",
1732
+ "height": 299,
1733
+ "referenced_widgets": [
1734
+ "685e146910634868b154ba03885d8b4c",
1735
+ "fefe61069a1a416cbb512e1f006c82b0",
1736
+ "3cac12c147134eb4b71561f17345712d",
1737
+ "810b0e9e274a433892f87040283b4db9",
1738
+ "11629aeef5a146e79869fded9f603d6d",
1739
+ "dfe6494357d040ac8d51d03069822e41",
1740
+ "190de89f93d048658ee4788ee4af4418",
1741
+ "835fd209022c4086b509bf42084243b2",
1742
+ "4b27da15dad34e839a4a02dab06d3e5a",
1743
+ "afa339155f4b4ffaa5fc70457b6b7a69",
1744
+ "e8d0668d65dd4743b25c1bc74e1d8057"
1745
+ ]
1746
+ },
1747
+ "id": "hYGkf-Rf-DKd",
1748
+ "outputId": "a1117309-317c-42c0-9b66-8efe3af493f4"
1749
+ },
1750
+ "execution_count": null,
1751
+ "outputs": [
1752
+ {
1753
+ "output_type": "display_data",
1754
+ "data": {
1755
+ "text/plain": [
1756
+ "Parsing nodes: 0%| | 0/14 [00:00<?, ?it/s]"
1757
+ ],
1758
+ "application/vnd.jupyter.widget-view+json": {
1759
+ "version_major": 2,
1760
+ "version_minor": 0,
1761
+ "model_id": "685e146910634868b154ba03885d8b4c"
1762
+ }
1763
+ },
1764
+ "metadata": {}
1765
+ },
1766
+ {
1767
+ "output_type": "stream",
1768
+ "name": "stdout",
1769
+ "text": [
1770
+ "510\n",
1771
+ "510\n",
1772
+ "510\n",
1773
+ "510\n",
1774
+ "510\n",
1775
+ "510\n",
1776
+ "510\n",
1777
+ "510\n",
1778
+ "510\n",
1779
+ "510\n",
1780
+ "510\n",
1781
+ "510\n",
1782
+ "510\n",
1783
+ "510\n"
1784
+ ]
1785
+ }
1786
+ ]
1787
+ },
1788
+ {
1789
+ "cell_type": "code",
1790
+ "source": [
1791
+ "from llama_index import ServiceContext\n",
1792
+ "\n",
1793
+ "index_no_metadata = VectorStoreIndex(\n",
1794
+ " nodes=nodes,\n",
1795
+ " service_context=ServiceContext.from_defaults(llm=OpenAI(model=\"gpt-3.5-turbo\")),\n",
1796
+ ")"
1797
+ ],
1798
+ "metadata": {
1799
+ "id": "2FR8rgOd-Jt2"
1800
+ },
1801
+ "execution_count": null,
1802
+ "outputs": []
1803
+ },
1804
+ {
1805
+ "cell_type": "code",
1806
+ "source": [
1807
+ "query_engine_no_metadata = index_no_metadata.as_query_engine()"
1808
+ ],
1809
+ "metadata": {
1810
+ "id": "HfzZ7Xyx-mbX"
1811
+ },
1812
+ "execution_count": null,
1813
+ "outputs": []
1814
+ },
1815
+ {
1816
+ "cell_type": "code",
1817
+ "source": [
1818
+ "res = query_engine_no_metadata.query(\"How many parameters LLaMA2 model has?\")"
1819
+ ],
1820
+ "metadata": {
1821
+ "id": "n8WQJFuLD4FW"
1822
+ },
1823
+ "execution_count": null,
1824
+ "outputs": []
1825
+ },
1826
+ {
1827
+ "cell_type": "code",
1828
+ "source": [
1829
+ "res.response"
1830
+ ],
1831
+ "metadata": {
1832
+ "colab": {
1833
+ "base_uri": "https://localhost:8080/",
1834
+ "height": 35
1835
+ },
1836
+ "id": "uZw_S9gNGS17",
1837
+ "outputId": "b7ce3cd0-296d-400a-d12a-6b39dd2c008a"
1838
+ },
1839
+ "execution_count": null,
1840
+ "outputs": [
1841
+ {
1842
+ "output_type": "execute_result",
1843
+ "data": {
1844
+ "text/plain": [
1845
+ "'The context information does not provide any information about the number of parameters in the LLaMA2 model.'"
1846
+ ],
1847
+ "application/vnd.google.colaboratory.intrinsic+json": {
1848
+ "type": "string"
1849
+ }
1850
+ },
1851
+ "metadata": {},
1852
+ "execution_count": 134
1853
+ }
1854
+ ]
1855
+ },
1856
+ {
1857
+ "cell_type": "code",
1858
+ "source": [
1859
+ "for src in res.source_nodes:\n",
1860
+ " print( src )\n",
1861
+ " print(\"-_\"*20)"
1862
+ ],
1863
+ "metadata": {
1864
+ "colab": {
1865
+ "base_uri": "https://localhost:8080/"
1866
+ },
1867
+ "id": "V6Rm7v8eD3xh",
1868
+ "outputId": "88ede9b3-c322-4968-f99d-b27e482ca050"
1869
+ },
1870
+ "execution_count": null,
1871
+ "outputs": [
1872
+ {
1873
+ "output_type": "stream",
1874
+ "name": "stdout",
1875
+ "text": [
1876
+ "Node ID: 895debf4-60ad-4156-8f52-cdddf03b1138\n",
1877
+ "Text: I. Llama 2: Revolutionizing Commercial Use Unlike its\n",
1878
+ "predecessor Llama 1, which was limited to research use, Llama 2\n",
1879
+ "represents a major advancement as an open-source commercial model.\n",
1880
+ "Businesses can now integrate Llama 2 into products to create AI-\n",
1881
+ "powered applications. Availability on Azure and AWS facilitates fine-\n",
1882
+ "tuning and adoption. However,...\n",
1883
+ "Score: 0.852\n",
1884
+ "\n",
1885
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
1886
+ "Node ID: a919ca6a-bdda-4d92-a7b2-ab4048cdc0d8\n",
1887
+ "Text: basis. The sharing of codes and weights allows other researchers\n",
1888
+ "to test new approaches in LLMs. The LLaMA models have a range of 7\n",
1889
+ "billion to 65 billion parameters. LLaMA-65B can be compared to\n",
1890
+ "DeepMind's Chinchilla and Google's PaLM. Publicly available unlabeled\n",
1891
+ "data was used to train these models, and training smaller foundational\n",
1892
+ "models requ...\n",
1893
+ "Score: 0.830\n",
1894
+ "\n",
1895
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
1896
+ ]
1897
+ }
1898
+ ]
1899
+ }
1900
+ ]
1901
+ }