Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
•
57b9989
1
Parent(s):
dbb343d
add notebook to test app
Browse files
- tests/notebook.ipynb +58 -20
tests/notebook.ipynb
CHANGED
@@ -36,10 +36,10 @@
|
|
36 |
" \"normalize_bullet_points\",\n",
|
37 |
" \"normalize_hyphenated_words\",\n",
|
38 |
" \"normalize_quotation_marks\",\n",
|
|
|
39 |
" \"normalize_repeating_words\",\n",
|
40 |
" \"normalize_repeating_chars\",\n",
|
41 |
" \"normalize_whitespaces\",\n",
|
42 |
-
" \"normalize_useless_spaces\",\n",
|
43 |
" # \"replace_currency_symbols\",\n",
|
44 |
" # \"replace_emails\",\n",
|
45 |
" # \"replace_emojis\",\n",
|
@@ -66,20 +66,20 @@
|
|
66 |
"source": [
|
67 |
"post_steps = [\n",
|
68 |
" \"lowercase\",\n",
|
69 |
-
" \"replace_currency_symbols\",\n",
|
70 |
-
" \"replace_urls\",\n",
|
71 |
-
" \"replace_emails\",\n",
|
72 |
-
" \"replace_user_handles\",\n",
|
73 |
-
" \"replace_hashtags\",\n",
|
74 |
-
" \"replace_emojis\",\n",
|
75 |
" # \"replace_phone_numbers\",\n",
|
76 |
" # \"replace_numbers\",\n",
|
77 |
-
" \"remove_accents\",\n",
|
78 |
-
" \"remove_brackets\",\n",
|
79 |
" \"remove_html_tags\",\n",
|
|
|
|
|
80 |
" \"remove_non_words\",\n",
|
81 |
-
" \"remove_numbers\",\n",
|
82 |
-
" \"remove_punctuation\",\n",
|
83 |
" \"normalize_repeating_words\",\n",
|
84 |
" \"normalize_repeating_chars\",\n",
|
85 |
" \"normalize_useless_spaces\",\n",
|
@@ -172,7 +172,7 @@
|
|
172 |
},
|
173 |
{
|
174 |
"cell_type": "code",
|
175 |
-
"execution_count":
|
176 |
"metadata": {},
|
177 |
"outputs": [
|
178 |
{
|
@@ -303,7 +303,7 @@
|
|
303 |
"[5000 rows x 3 columns]"
|
304 |
]
|
305 |
},
|
306 |
-
"execution_count":
|
307 |
"metadata": {},
|
308 |
"output_type": "execute_result"
|
309 |
}
|
@@ -314,7 +314,7 @@
|
|
314 |
},
|
315 |
{
|
316 |
"cell_type": "code",
|
317 |
-
"execution_count":
|
318 |
"metadata": {},
|
319 |
"outputs": [],
|
320 |
"source": [
|
@@ -325,16 +325,16 @@
|
|
325 |
},
|
326 |
{
|
327 |
"cell_type": "code",
|
328 |
-
"execution_count":
|
329 |
"metadata": {},
|
330 |
"outputs": [
|
331 |
{
|
332 |
"data": {
|
333 |
"text/plain": [
|
334 |
-
"[1, 14, 2, 3, 4, 23, 22, 5,
|
335 |
]
|
336 |
},
|
337 |
-
"execution_count":
|
338 |
"metadata": {},
|
339 |
"output_type": "execute_result"
|
340 |
}
|
@@ -345,16 +345,16 @@
|
|
345 |
},
|
346 |
{
|
347 |
"cell_type": "code",
|
348 |
-
"execution_count":
|
349 |
"metadata": {},
|
350 |
"outputs": [
|
351 |
{
|
352 |
"data": {
|
353 |
"text/plain": [
|
354 |
-
"[0,
|
355 |
]
|
356 |
},
|
357 |
-
"execution_count":
|
358 |
"metadata": {},
|
359 |
"output_type": "execute_result"
|
360 |
}
|
@@ -381,6 +381,44 @@
|
|
381 |
"list(PreprocessingPipeline.lemmatization_component().keys())"
|
382 |
]
|
383 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
{
|
385 |
"cell_type": "code",
|
386 |
"execution_count": null,
|
|
|
36 |
" \"normalize_bullet_points\",\n",
|
37 |
" \"normalize_hyphenated_words\",\n",
|
38 |
" \"normalize_quotation_marks\",\n",
|
39 |
+
" \"normalize_useless_spaces\",\n",
|
40 |
" \"normalize_repeating_words\",\n",
|
41 |
" \"normalize_repeating_chars\",\n",
|
42 |
" \"normalize_whitespaces\",\n",
|
|
|
43 |
" # \"replace_currency_symbols\",\n",
|
44 |
" # \"replace_emails\",\n",
|
45 |
" # \"replace_emojis\",\n",
|
|
|
66 |
"source": [
|
67 |
"post_steps = [\n",
|
68 |
" \"lowercase\",\n",
|
69 |
+
" # \"replace_currency_symbols\",\n",
|
70 |
+
" # \"replace_urls\",\n",
|
71 |
+
" # \"replace_emails\",\n",
|
72 |
+
" # \"replace_user_handles\",\n",
|
73 |
+
" # \"replace_hashtags\",\n",
|
74 |
+
" # \"replace_emojis\",\n",
|
75 |
" # \"replace_phone_numbers\",\n",
|
76 |
" # \"replace_numbers\",\n",
|
|
|
|
|
77 |
" \"remove_html_tags\",\n",
|
78 |
+
" \"remove_accents\",\n",
|
79 |
+
" # \"remove_brackets\",\n",
|
80 |
" \"remove_non_words\",\n",
|
81 |
+
" # \"remove_numbers\",\n",
|
82 |
+
" # \"remove_punctuation\",\n",
|
83 |
" \"normalize_repeating_words\",\n",
|
84 |
" \"normalize_repeating_chars\",\n",
|
85 |
" \"normalize_useless_spaces\",\n",
|
|
|
172 |
},
|
173 |
{
|
174 |
"cell_type": "code",
|
175 |
+
"execution_count": 14,
|
176 |
"metadata": {},
|
177 |
"outputs": [
|
178 |
{
|
|
|
303 |
"[5000 rows x 3 columns]"
|
304 |
]
|
305 |
},
|
306 |
+
"execution_count": 14,
|
307 |
"metadata": {},
|
308 |
"output_type": "execute_result"
|
309 |
}
|
|
|
314 |
},
|
315 |
{
|
316 |
"cell_type": "code",
|
317 |
+
"execution_count": 15,
|
318 |
"metadata": {},
|
319 |
"outputs": [],
|
320 |
"source": [
|
|
|
325 |
},
|
326 |
{
|
327 |
"cell_type": "code",
|
328 |
+
"execution_count": 16,
|
329 |
"metadata": {},
|
330 |
"outputs": [
|
331 |
{
|
332 |
"data": {
|
333 |
"text/plain": [
|
334 |
+
"[1, 14, 2, 3, 4, 21, 23, 22, 5, 24]"
|
335 |
]
|
336 |
},
|
337 |
+
"execution_count": 16,
|
338 |
"metadata": {},
|
339 |
"output_type": "execute_result"
|
340 |
}
|
|
|
345 |
},
|
346 |
{
|
347 |
"cell_type": "code",
|
348 |
+
"execution_count": 17,
|
349 |
"metadata": {},
|
350 |
"outputs": [
|
351 |
{
|
352 |
"data": {
|
353 |
"text/plain": [
|
354 |
+
"[0, 17, 15, 19, 23, 22, 21, 24]"
|
355 |
]
|
356 |
},
|
357 |
+
"execution_count": 17,
|
358 |
"metadata": {},
|
359 |
"output_type": "execute_result"
|
360 |
}
|
|
|
381 |
"list(PreprocessingPipeline.lemmatization_component().keys())"
|
382 |
]
|
383 |
},
|
384 |
+
{
|
385 |
+
"cell_type": "code",
|
386 |
+
"execution_count": 14,
|
387 |
+
"metadata": {},
|
388 |
+
"outputs": [],
|
389 |
+
"source": [
|
390 |
+
"import re"
|
391 |
+
]
|
392 |
+
},
|
393 |
+
{
|
394 |
+
"cell_type": "code",
|
395 |
+
"execution_count": 27,
|
396 |
+
"metadata": {},
|
397 |
+
"outputs": [],
|
398 |
+
"source": [
|
399 |
+
"_re_non_words = re.compile(\"[^A-Za-z]+\")"
|
400 |
+
]
|
401 |
+
},
|
402 |
+
{
|
403 |
+
"cell_type": "code",
|
404 |
+
"execution_count": 28,
|
405 |
+
"metadata": {},
|
406 |
+
"outputs": [
|
407 |
+
{
|
408 |
+
"data": {
|
409 |
+
"text/plain": [
|
410 |
+
"'Mimmo '"
|
411 |
+
]
|
412 |
+
},
|
413 |
+
"execution_count": 28,
|
414 |
+
"metadata": {},
|
415 |
+
"output_type": "execute_result"
|
416 |
+
}
|
417 |
+
],
|
418 |
+
"source": [
|
419 |
+
"_re_non_words.sub(\" \", \"Mimmo23\")"
|
420 |
+
]
|
421 |
+
},
|
422 |
{
|
423 |
"cell_type": "code",
|
424 |
"execution_count": null,
|