Pietro Lesci committed on
Commit
57b9989
1 Parent(s): dbb343d

add notebook to test app

Browse files
Files changed (1) hide show
  1. tests/notebook.ipynb +58 -20
tests/notebook.ipynb CHANGED
@@ -36,10 +36,10 @@
36
  " \"normalize_bullet_points\",\n",
37
  " \"normalize_hyphenated_words\",\n",
38
  " \"normalize_quotation_marks\",\n",
 
39
  " \"normalize_repeating_words\",\n",
40
  " \"normalize_repeating_chars\",\n",
41
  " \"normalize_whitespaces\",\n",
42
- " \"normalize_useless_spaces\",\n",
43
  " # \"replace_currency_symbols\",\n",
44
  " # \"replace_emails\",\n",
45
  " # \"replace_emojis\",\n",
@@ -66,20 +66,20 @@
66
  "source": [
67
  "post_steps = [\n",
68
  " \"lowercase\",\n",
69
- " \"replace_currency_symbols\",\n",
70
- " \"replace_urls\",\n",
71
- " \"replace_emails\",\n",
72
- " \"replace_user_handles\",\n",
73
- " \"replace_hashtags\",\n",
74
- " \"replace_emojis\",\n",
75
  " # \"replace_phone_numbers\",\n",
76
  " # \"replace_numbers\",\n",
77
- " \"remove_accents\",\n",
78
- " \"remove_brackets\",\n",
79
  " \"remove_html_tags\",\n",
 
 
80
  " \"remove_non_words\",\n",
81
- " \"remove_numbers\",\n",
82
- " \"remove_punctuation\",\n",
83
  " \"normalize_repeating_words\",\n",
84
  " \"normalize_repeating_chars\",\n",
85
  " \"normalize_useless_spaces\",\n",
@@ -172,7 +172,7 @@
172
  },
173
  {
174
  "cell_type": "code",
175
- "execution_count": 10,
176
  "metadata": {},
177
  "outputs": [
178
  {
@@ -303,7 +303,7 @@
303
  "[5000 rows x 3 columns]"
304
  ]
305
  },
306
- "execution_count": 10,
307
  "metadata": {},
308
  "output_type": "execute_result"
309
  }
@@ -314,7 +314,7 @@
314
  },
315
  {
316
  "cell_type": "code",
317
- "execution_count": 11,
318
  "metadata": {},
319
  "outputs": [],
320
  "source": [
@@ -325,16 +325,16 @@
325
  },
326
  {
327
  "cell_type": "code",
328
- "execution_count": 12,
329
  "metadata": {},
330
  "outputs": [
331
  {
332
  "data": {
333
  "text/plain": [
334
- "[1, 14, 2, 3, 4, 23, 22, 5, 21, 24]"
335
  ]
336
  },
337
- "execution_count": 12,
338
  "metadata": {},
339
  "output_type": "execute_result"
340
  }
@@ -345,16 +345,16 @@
345
  },
346
  {
347
  "cell_type": "code",
348
- "execution_count": 13,
349
  "metadata": {},
350
  "outputs": [
351
  {
352
  "data": {
353
  "text/plain": [
354
- "[0, 7, 6, 8, 13, 10, 9, 15, 16, 17, 19, 20, 18, 23, 22, 21, 24]"
355
  ]
356
  },
357
- "execution_count": 13,
358
  "metadata": {},
359
  "output_type": "execute_result"
360
  }
@@ -381,6 +381,44 @@
381
  "list(PreprocessingPipeline.lemmatization_component().keys())"
382
  ]
383
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  {
385
  "cell_type": "code",
386
  "execution_count": null,
 
36
  " \"normalize_bullet_points\",\n",
37
  " \"normalize_hyphenated_words\",\n",
38
  " \"normalize_quotation_marks\",\n",
39
+ " \"normalize_useless_spaces\",\n",
40
  " \"normalize_repeating_words\",\n",
41
  " \"normalize_repeating_chars\",\n",
42
  " \"normalize_whitespaces\",\n",
 
43
  " # \"replace_currency_symbols\",\n",
44
  " # \"replace_emails\",\n",
45
  " # \"replace_emojis\",\n",
 
66
  "source": [
67
  "post_steps = [\n",
68
  " \"lowercase\",\n",
69
+ " # \"replace_currency_symbols\",\n",
70
+ " # \"replace_urls\",\n",
71
+ " # \"replace_emails\",\n",
72
+ " # \"replace_user_handles\",\n",
73
+ " # \"replace_hashtags\",\n",
74
+ " # \"replace_emojis\",\n",
75
  " # \"replace_phone_numbers\",\n",
76
  " # \"replace_numbers\",\n",
 
 
77
  " \"remove_html_tags\",\n",
78
+ " \"remove_accents\",\n",
79
+ " # \"remove_brackets\",\n",
80
  " \"remove_non_words\",\n",
81
+ " # \"remove_numbers\",\n",
82
+ " # \"remove_punctuation\",\n",
83
  " \"normalize_repeating_words\",\n",
84
  " \"normalize_repeating_chars\",\n",
85
  " \"normalize_useless_spaces\",\n",
 
172
  },
173
  {
174
  "cell_type": "code",
175
+ "execution_count": 14,
176
  "metadata": {},
177
  "outputs": [
178
  {
 
303
  "[5000 rows x 3 columns]"
304
  ]
305
  },
306
+ "execution_count": 14,
307
  "metadata": {},
308
  "output_type": "execute_result"
309
  }
 
314
  },
315
  {
316
  "cell_type": "code",
317
+ "execution_count": 15,
318
  "metadata": {},
319
  "outputs": [],
320
  "source": [
 
325
  },
326
  {
327
  "cell_type": "code",
328
+ "execution_count": 16,
329
  "metadata": {},
330
  "outputs": [
331
  {
332
  "data": {
333
  "text/plain": [
334
+ "[1, 14, 2, 3, 4, 21, 23, 22, 5, 24]"
335
  ]
336
  },
337
+ "execution_count": 16,
338
  "metadata": {},
339
  "output_type": "execute_result"
340
  }
 
345
  },
346
  {
347
  "cell_type": "code",
348
+ "execution_count": 17,
349
  "metadata": {},
350
  "outputs": [
351
  {
352
  "data": {
353
  "text/plain": [
354
+ "[0, 17, 15, 19, 23, 22, 21, 24]"
355
  ]
356
  },
357
+ "execution_count": 17,
358
  "metadata": {},
359
  "output_type": "execute_result"
360
  }
 
381
  "list(PreprocessingPipeline.lemmatization_component().keys())"
382
  ]
383
  },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": 14,
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": [
390
+ "import re"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 27,
396
+ "metadata": {},
397
+ "outputs": [],
398
+ "source": [
399
+ "_re_non_words = re.compile(\"[^A-Za-z]+\")"
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": 28,
405
+ "metadata": {},
406
+ "outputs": [
407
+ {
408
+ "data": {
409
+ "text/plain": [
410
+ "'Mimmo '"
411
+ ]
412
+ },
413
+ "execution_count": 28,
414
+ "metadata": {},
415
+ "output_type": "execute_result"
416
+ }
417
+ ],
418
+ "source": [
419
+ "_re_non_words.sub(\" \", \"Mimmo23\")"
420
+ ]
421
+ },
422
  {
423
  "cell_type": "code",
424
  "execution_count": null,