Spaces:
Runtime error
Runtime error
HugoLaurencon
committed on
Commit
•
150f094
1
Parent(s):
78a727e
Update parameters for Indonesian (and default ones)
Browse files - parameters_filtering.py +59 -59
parameters_filtering.py
CHANGED
@@ -26,34 +26,34 @@ parameters_filtering_default = {
|
|
26 |
"cond_check_number_words": True,
|
27 |
"tokenization": False,
|
28 |
"strip_characters": special_characters_default,
|
29 |
-
"number_words_min_cutoff":
|
30 |
"number_words_max_cutoff": 100000,
|
31 |
"cond_check_character_repetition_removal": True,
|
32 |
"character_repetition_length": 10,
|
33 |
-
"character_repetition_max_cutoff": 0.
|
34 |
"cond_check_word_repetition_removal": True,
|
35 |
"word_repetition_length": 5,
|
36 |
-
"word_repetition_max_cutoff": 0.
|
37 |
"cond_check_special_characters": True,
|
38 |
"special_characters": special_characters_default,
|
39 |
"special_characters_max_cutoff": 0.4,
|
40 |
"cond_words_augmentation": False,
|
41 |
"words_augmentation_group_sizes": [],
|
42 |
"words_augmentation_join_char": "",
|
43 |
-
"cond_check_stopwords":
|
44 |
-
"stopwords_min_cutoff": 0,
|
45 |
-
"cond_check_flagged_words":
|
46 |
-
"flagged_words_max_cutoff": 0.
|
47 |
"cond_check_lang_id": True,
|
48 |
"lang_id_min_cutoff": 0.70,
|
49 |
"cond_check_perplexity": False,
|
50 |
-
"perplexity_max_cutoff":
|
51 |
}
|
52 |
|
53 |
parameters_filtering_af = {
|
54 |
"cond_uniform_whitespace": True,
|
55 |
"cond_replace_unicode_punctuation": False,
|
56 |
-
"cond_remove_words_with_incorrect_substrings":
|
57 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
58 |
"cond_remove_long_words": True,
|
59 |
"length_word_max_cutoff": 25,
|
@@ -76,7 +76,7 @@ parameters_filtering_af = {
|
|
76 |
"words_augmentation_join_char": "",
|
77 |
"cond_check_stopwords": True,
|
78 |
"stopwords_min_cutoff": 0,
|
79 |
-
"cond_check_flagged_words":
|
80 |
"flagged_words_max_cutoff": 0.2,
|
81 |
"cond_check_lang_id": True,
|
82 |
"lang_id_min_cutoff": 0.6,
|
@@ -87,7 +87,7 @@ parameters_filtering_af = {
|
|
87 |
parameters_filtering_ar = {
|
88 |
"cond_uniform_whitespace": True,
|
89 |
"cond_replace_unicode_punctuation": False,
|
90 |
-
"cond_remove_words_with_incorrect_substrings":
|
91 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
92 |
"cond_remove_long_words": True,
|
93 |
"length_word_max_cutoff": 25,
|
@@ -110,7 +110,7 @@ parameters_filtering_ar = {
|
|
110 |
"words_augmentation_join_char": "",
|
111 |
"cond_check_stopwords": True,
|
112 |
"stopwords_min_cutoff": 0,
|
113 |
-
"cond_check_flagged_words":
|
114 |
"flagged_words_max_cutoff": 0.2,
|
115 |
"cond_check_lang_id": True,
|
116 |
"lang_id_min_cutoff": 0.75,
|
@@ -121,7 +121,7 @@ parameters_filtering_ar = {
|
|
121 |
parameters_filtering_arz = {
|
122 |
"cond_uniform_whitespace": True,
|
123 |
"cond_replace_unicode_punctuation": False,
|
124 |
-
"cond_remove_words_with_incorrect_substrings":
|
125 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
126 |
"cond_remove_long_words": True,
|
127 |
"length_word_max_cutoff": 25,
|
@@ -144,7 +144,7 @@ parameters_filtering_arz = {
|
|
144 |
"words_augmentation_join_char": "",
|
145 |
"cond_check_stopwords": True,
|
146 |
"stopwords_min_cutoff": 0,
|
147 |
-
"cond_check_flagged_words":
|
148 |
"flagged_words_max_cutoff": 0.2,
|
149 |
"cond_check_lang_id": True,
|
150 |
"lang_id_min_cutoff": 0.75,
|
@@ -155,7 +155,7 @@ parameters_filtering_arz = {
|
|
155 |
parameters_filtering_as = {
|
156 |
"cond_uniform_whitespace": True,
|
157 |
"cond_replace_unicode_punctuation": False,
|
158 |
-
"cond_remove_words_with_incorrect_substrings":
|
159 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
160 |
"cond_remove_long_words": True,
|
161 |
"length_word_max_cutoff": 25,
|
@@ -178,7 +178,7 @@ parameters_filtering_as = {
|
|
178 |
"words_augmentation_join_char": "",
|
179 |
"cond_check_stopwords": True,
|
180 |
"stopwords_min_cutoff": 0,
|
181 |
-
"cond_check_flagged_words":
|
182 |
"flagged_words_max_cutoff": 0.2,
|
183 |
"cond_check_lang_id": True,
|
184 |
"lang_id_min_cutoff": 0.75,
|
@@ -189,7 +189,7 @@ parameters_filtering_as = {
|
|
189 |
parameters_filtering_bn = {
|
190 |
"cond_uniform_whitespace": True,
|
191 |
"cond_replace_unicode_punctuation": False,
|
192 |
-
"cond_remove_words_with_incorrect_substrings":
|
193 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
194 |
"cond_remove_long_words": True,
|
195 |
"length_word_max_cutoff": 30,
|
@@ -212,7 +212,7 @@ parameters_filtering_bn = {
|
|
212 |
"words_augmentation_join_char": "",
|
213 |
"cond_check_stopwords": True,
|
214 |
"stopwords_min_cutoff": 0.05,
|
215 |
-
"cond_check_flagged_words":
|
216 |
"flagged_words_max_cutoff": 0.2,
|
217 |
"cond_check_lang_id": True,
|
218 |
"lang_id_min_cutoff": 0.75,
|
@@ -246,7 +246,7 @@ parameters_filtering_ca = {
|
|
246 |
"words_augmentation_join_char": "",
|
247 |
"cond_check_stopwords": True,
|
248 |
"stopwords_min_cutoff": 0.25,
|
249 |
-
"cond_check_flagged_words":
|
250 |
"flagged_words_max_cutoff": 0.1,
|
251 |
"cond_check_lang_id": True,
|
252 |
"lang_id_min_cutoff": 0.8,
|
@@ -291,7 +291,7 @@ parameters_filtering_en = {
|
|
291 |
parameters_filtering_es = {
|
292 |
"cond_uniform_whitespace": True,
|
293 |
"cond_replace_unicode_punctuation": False,
|
294 |
-
"cond_remove_words_with_incorrect_substrings":
|
295 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
296 |
"cond_remove_long_words": True,
|
297 |
"length_word_max_cutoff": 30,
|
@@ -314,7 +314,7 @@ parameters_filtering_es = {
|
|
314 |
"words_augmentation_join_char": "",
|
315 |
"cond_check_stopwords": True,
|
316 |
"stopwords_min_cutoff": 0.2,
|
317 |
-
"cond_check_flagged_words":
|
318 |
"flagged_words_max_cutoff": 0.2,
|
319 |
"cond_check_lang_id": True,
|
320 |
"lang_id_min_cutoff": 0.75,
|
@@ -325,7 +325,7 @@ parameters_filtering_es = {
|
|
325 |
parameters_filtering_eu = {
|
326 |
"cond_uniform_whitespace": True,
|
327 |
"cond_replace_unicode_punctuation": False,
|
328 |
-
"cond_remove_words_with_incorrect_substrings":
|
329 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
330 |
"cond_remove_long_words": True,
|
331 |
"length_word_max_cutoff": 35,
|
@@ -348,7 +348,7 @@ parameters_filtering_eu = {
|
|
348 |
"words_augmentation_join_char": "",
|
349 |
"cond_check_stopwords": True,
|
350 |
"stopwords_min_cutoff": 0,
|
351 |
-
"cond_check_flagged_words":
|
352 |
"flagged_words_max_cutoff": 0.2,
|
353 |
"cond_check_lang_id": True,
|
354 |
"lang_id_min_cutoff": 0.75,
|
@@ -382,7 +382,7 @@ parameters_filtering_fr = {
|
|
382 |
"words_augmentation_join_char": "",
|
383 |
"cond_check_stopwords": True,
|
384 |
"stopwords_min_cutoff": 0.27,
|
385 |
-
"cond_check_flagged_words":
|
386 |
"flagged_words_max_cutoff": 0.008,
|
387 |
"cond_check_lang_id": True,
|
388 |
"lang_id_min_cutoff": 0.8,
|
@@ -393,7 +393,7 @@ parameters_filtering_fr = {
|
|
393 |
parameters_filtering_gu = {
|
394 |
"cond_uniform_whitespace": True,
|
395 |
"cond_replace_unicode_punctuation": False,
|
396 |
-
"cond_remove_words_with_incorrect_substrings":
|
397 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
398 |
"cond_remove_long_words": True,
|
399 |
"length_word_max_cutoff": 30,
|
@@ -416,7 +416,7 @@ parameters_filtering_gu = {
|
|
416 |
"words_augmentation_join_char": "",
|
417 |
"cond_check_stopwords": True,
|
418 |
"stopwords_min_cutoff": 0,
|
419 |
-
"cond_check_flagged_words":
|
420 |
"flagged_words_max_cutoff": 0.2,
|
421 |
"cond_check_lang_id": True,
|
422 |
"lang_id_min_cutoff": 0.75,
|
@@ -427,7 +427,7 @@ parameters_filtering_gu = {
|
|
427 |
parameters_filtering_hi = {
|
428 |
"cond_uniform_whitespace": True,
|
429 |
"cond_replace_unicode_punctuation": False,
|
430 |
-
"cond_remove_words_with_incorrect_substrings":
|
431 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
432 |
"cond_remove_long_words": True,
|
433 |
"length_word_max_cutoff": 25,
|
@@ -450,7 +450,7 @@ parameters_filtering_hi = {
|
|
450 |
"words_augmentation_join_char": "",
|
451 |
"cond_check_stopwords": True,
|
452 |
"stopwords_min_cutoff": 0,
|
453 |
-
"cond_check_flagged_words":
|
454 |
"flagged_words_max_cutoff": 0.2,
|
455 |
"cond_check_lang_id": True,
|
456 |
"lang_id_min_cutoff": 0.75,
|
@@ -461,41 +461,41 @@ parameters_filtering_hi = {
|
|
461 |
parameters_filtering_id = {
|
462 |
"cond_uniform_whitespace": True,
|
463 |
"cond_replace_unicode_punctuation": False,
|
464 |
-
"cond_remove_words_with_incorrect_substrings":
|
465 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
466 |
"cond_remove_long_words": True,
|
467 |
"length_word_max_cutoff": 30,
|
468 |
"cond_check_number_words": True,
|
469 |
"tokenization": False,
|
470 |
"strip_characters": special_characters_default,
|
471 |
-
"number_words_min_cutoff":
|
472 |
"number_words_max_cutoff": 100000,
|
473 |
"cond_check_character_repetition_removal": True,
|
474 |
"character_repetition_length": 10,
|
475 |
-
"character_repetition_max_cutoff": 0.
|
476 |
"cond_check_word_repetition_removal": True,
|
477 |
"word_repetition_length": 5,
|
478 |
-
"word_repetition_max_cutoff": 0.
|
479 |
"cond_check_special_characters": True,
|
480 |
"special_characters": special_characters_default,
|
481 |
-
"special_characters_max_cutoff": 0.
|
482 |
"cond_words_augmentation": False,
|
483 |
"words_augmentation_group_sizes": [],
|
484 |
"words_augmentation_join_char": "",
|
485 |
"cond_check_stopwords": True,
|
486 |
-
"stopwords_min_cutoff": 0.
|
487 |
-
"cond_check_flagged_words":
|
488 |
-
"flagged_words_max_cutoff": 0.
|
489 |
"cond_check_lang_id": True,
|
490 |
-
"lang_id_min_cutoff": 0.
|
491 |
"cond_check_perplexity": True,
|
492 |
-
"perplexity_max_cutoff":
|
493 |
}
|
494 |
|
495 |
parameters_filtering_kn = {
|
496 |
"cond_uniform_whitespace": True,
|
497 |
"cond_replace_unicode_punctuation": False,
|
498 |
-
"cond_remove_words_with_incorrect_substrings":
|
499 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
500 |
"cond_remove_long_words": True,
|
501 |
"length_word_max_cutoff": 50,
|
@@ -518,7 +518,7 @@ parameters_filtering_kn = {
|
|
518 |
"words_augmentation_join_char": "",
|
519 |
"cond_check_stopwords": True,
|
520 |
"stopwords_min_cutoff": 0,
|
521 |
-
"cond_check_flagged_words":
|
522 |
"flagged_words_max_cutoff": 0.2,
|
523 |
"cond_check_lang_id": True,
|
524 |
"lang_id_min_cutoff": 0.75,
|
@@ -529,7 +529,7 @@ parameters_filtering_kn = {
|
|
529 |
parameters_filtering_ml = {
|
530 |
"cond_uniform_whitespace": True,
|
531 |
"cond_replace_unicode_punctuation": False,
|
532 |
-
"cond_remove_words_with_incorrect_substrings":
|
533 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
534 |
"cond_remove_long_words": True,
|
535 |
"length_word_max_cutoff": 50,
|
@@ -552,7 +552,7 @@ parameters_filtering_ml = {
|
|
552 |
"words_augmentation_join_char": "",
|
553 |
"cond_check_stopwords": True,
|
554 |
"stopwords_min_cutoff": 0,
|
555 |
-
"cond_check_flagged_words":
|
556 |
"flagged_words_max_cutoff": 0.2,
|
557 |
"cond_check_lang_id": True,
|
558 |
"lang_id_min_cutoff": 0.75,
|
@@ -563,7 +563,7 @@ parameters_filtering_ml = {
|
|
563 |
parameters_filtering_mr = {
|
564 |
"cond_uniform_whitespace": True,
|
565 |
"cond_replace_unicode_punctuation": False,
|
566 |
-
"cond_remove_words_with_incorrect_substrings":
|
567 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
568 |
"cond_remove_long_words": True,
|
569 |
"length_word_max_cutoff": 30,
|
@@ -586,7 +586,7 @@ parameters_filtering_mr = {
|
|
586 |
"words_augmentation_join_char": "",
|
587 |
"cond_check_stopwords": True,
|
588 |
"stopwords_min_cutoff": 0,
|
589 |
-
"cond_check_flagged_words":
|
590 |
"flagged_words_max_cutoff": 0.2,
|
591 |
"cond_check_lang_id": True,
|
592 |
"lang_id_min_cutoff": 0.75,
|
@@ -620,7 +620,7 @@ parameters_filtering_pt = {
|
|
620 |
"words_augmentation_join_char": "",
|
621 |
"cond_check_stopwords": True,
|
622 |
"stopwords_min_cutoff": 0.2,
|
623 |
-
"cond_check_flagged_words":
|
624 |
"flagged_words_max_cutoff": 0.007,
|
625 |
"cond_check_lang_id": True,
|
626 |
"lang_id_min_cutoff": 0.6,
|
@@ -631,7 +631,7 @@ parameters_filtering_pt = {
|
|
631 |
parameters_filtering_sw = {
|
632 |
"cond_uniform_whitespace": True,
|
633 |
"cond_replace_unicode_punctuation": False,
|
634 |
-
"cond_remove_words_with_incorrect_substrings":
|
635 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
636 |
"cond_remove_long_words": True,
|
637 |
"length_word_max_cutoff": 30,
|
@@ -654,7 +654,7 @@ parameters_filtering_sw = {
|
|
654 |
"words_augmentation_join_char": "",
|
655 |
"cond_check_stopwords": True,
|
656 |
"stopwords_min_cutoff": 0,
|
657 |
-
"cond_check_flagged_words":
|
658 |
"flagged_words_max_cutoff": 0.2,
|
659 |
"cond_check_lang_id": True,
|
660 |
"lang_id_min_cutoff": 0.75,
|
@@ -665,7 +665,7 @@ parameters_filtering_sw = {
|
|
665 |
parameters_filtering_ta = {
|
666 |
"cond_uniform_whitespace": True,
|
667 |
"cond_replace_unicode_punctuation": False,
|
668 |
-
"cond_remove_words_with_incorrect_substrings":
|
669 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
670 |
"cond_remove_long_words": True,
|
671 |
"length_word_max_cutoff": 50,
|
@@ -688,7 +688,7 @@ parameters_filtering_ta = {
|
|
688 |
"words_augmentation_join_char": "",
|
689 |
"cond_check_stopwords": True,
|
690 |
"stopwords_min_cutoff": 0,
|
691 |
-
"cond_check_flagged_words":
|
692 |
"flagged_words_max_cutoff": 0.2,
|
693 |
"cond_check_lang_id": True,
|
694 |
"lang_id_min_cutoff": 0.75,
|
@@ -699,7 +699,7 @@ parameters_filtering_ta = {
|
|
699 |
parameters_filtering_te = {
|
700 |
"cond_uniform_whitespace": True,
|
701 |
"cond_replace_unicode_punctuation": False,
|
702 |
-
"cond_remove_words_with_incorrect_substrings":
|
703 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
704 |
"cond_remove_long_words": True,
|
705 |
"length_word_max_cutoff": 35,
|
@@ -722,7 +722,7 @@ parameters_filtering_te = {
|
|
722 |
"words_augmentation_join_char": "",
|
723 |
"cond_check_stopwords": True,
|
724 |
"stopwords_min_cutoff": 0,
|
725 |
-
"cond_check_flagged_words":
|
726 |
"flagged_words_max_cutoff": 0.2,
|
727 |
"cond_check_lang_id": True,
|
728 |
"lang_id_min_cutoff": 0.75,
|
@@ -733,7 +733,7 @@ parameters_filtering_te = {
|
|
733 |
parameters_filtering_ur = {
|
734 |
"cond_uniform_whitespace": True,
|
735 |
"cond_replace_unicode_punctuation": False,
|
736 |
-
"cond_remove_words_with_incorrect_substrings":
|
737 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
738 |
"cond_remove_long_words": True,
|
739 |
"length_word_max_cutoff": 30,
|
@@ -756,7 +756,7 @@ parameters_filtering_ur = {
|
|
756 |
"words_augmentation_join_char": "",
|
757 |
"cond_check_stopwords": True,
|
758 |
"stopwords_min_cutoff": 0,
|
759 |
-
"cond_check_flagged_words":
|
760 |
"flagged_words_max_cutoff": 0.2,
|
761 |
"cond_check_lang_id": True,
|
762 |
"lang_id_min_cutoff": 0.75,
|
@@ -767,7 +767,7 @@ parameters_filtering_ur = {
|
|
767 |
parameters_filtering_vi = {
|
768 |
"cond_uniform_whitespace": True,
|
769 |
"cond_replace_unicode_punctuation": False,
|
770 |
-
"cond_remove_words_with_incorrect_substrings":
|
771 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
772 |
"cond_remove_long_words": True,
|
773 |
"length_word_max_cutoff": 30,
|
@@ -790,7 +790,7 @@ parameters_filtering_vi = {
|
|
790 |
"words_augmentation_join_char": " ",
|
791 |
"cond_check_stopwords": True,
|
792 |
"stopwords_min_cutoff": 0,
|
793 |
-
"cond_check_flagged_words":
|
794 |
"flagged_words_max_cutoff": 0.2,
|
795 |
"cond_check_lang_id": True,
|
796 |
"lang_id_min_cutoff": 0.75,
|
@@ -801,7 +801,7 @@ parameters_filtering_vi = {
|
|
801 |
parameters_filtering_yo = {
|
802 |
"cond_uniform_whitespace": True,
|
803 |
"cond_replace_unicode_punctuation": False,
|
804 |
-
"cond_remove_words_with_incorrect_substrings":
|
805 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
806 |
"cond_remove_long_words": True,
|
807 |
"length_word_max_cutoff": 30,
|
@@ -824,7 +824,7 @@ parameters_filtering_yo = {
|
|
824 |
"words_augmentation_join_char": "",
|
825 |
"cond_check_stopwords": True,
|
826 |
"stopwords_min_cutoff": 0,
|
827 |
-
"cond_check_flagged_words":
|
828 |
"flagged_words_max_cutoff": 0.2,
|
829 |
"cond_check_lang_id": True,
|
830 |
"lang_id_min_cutoff": 0.75,
|
@@ -856,9 +856,9 @@ parameters_filtering_zh = {
|
|
856 |
"cond_words_augmentation": True,
|
857 |
"words_augmentation_group_sizes": [2],
|
858 |
"words_augmentation_join_char": "",
|
859 |
-
"cond_check_stopwords":
|
860 |
"stopwords_min_cutoff": 0,
|
861 |
-
"cond_check_flagged_words":
|
862 |
"flagged_words_max_cutoff": 0.2,
|
863 |
"cond_check_lang_id": True,
|
864 |
"lang_id_min_cutoff": 0.75,
|
|
|
26 |
"cond_check_number_words": True,
|
27 |
"tokenization": False,
|
28 |
"strip_characters": special_characters_default,
|
29 |
+
"number_words_min_cutoff": 10,
|
30 |
"number_words_max_cutoff": 100000,
|
31 |
"cond_check_character_repetition_removal": True,
|
32 |
"character_repetition_length": 10,
|
33 |
+
"character_repetition_max_cutoff": 0.2,
|
34 |
"cond_check_word_repetition_removal": True,
|
35 |
"word_repetition_length": 5,
|
36 |
+
"word_repetition_max_cutoff": 0.3,
|
37 |
"cond_check_special_characters": True,
|
38 |
"special_characters": special_characters_default,
|
39 |
"special_characters_max_cutoff": 0.4,
|
40 |
"cond_words_augmentation": False,
|
41 |
"words_augmentation_group_sizes": [],
|
42 |
"words_augmentation_join_char": "",
|
43 |
+
"cond_check_stopwords": True,
|
44 |
+
"stopwords_min_cutoff": 0.1,
|
45 |
+
"cond_check_flagged_words": True,
|
46 |
+
"flagged_words_max_cutoff": 0.1,
|
47 |
"cond_check_lang_id": True,
|
48 |
"lang_id_min_cutoff": 0.70,
|
49 |
"cond_check_perplexity": False,
|
50 |
+
"perplexity_max_cutoff": 10000,
|
51 |
}
|
52 |
|
53 |
parameters_filtering_af = {
|
54 |
"cond_uniform_whitespace": True,
|
55 |
"cond_replace_unicode_punctuation": False,
|
56 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
57 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
58 |
"cond_remove_long_words": True,
|
59 |
"length_word_max_cutoff": 25,
|
|
|
76 |
"words_augmentation_join_char": "",
|
77 |
"cond_check_stopwords": True,
|
78 |
"stopwords_min_cutoff": 0,
|
79 |
+
"cond_check_flagged_words": True,
|
80 |
"flagged_words_max_cutoff": 0.2,
|
81 |
"cond_check_lang_id": True,
|
82 |
"lang_id_min_cutoff": 0.6,
|
|
|
87 |
parameters_filtering_ar = {
|
88 |
"cond_uniform_whitespace": True,
|
89 |
"cond_replace_unicode_punctuation": False,
|
90 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
91 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
92 |
"cond_remove_long_words": True,
|
93 |
"length_word_max_cutoff": 25,
|
|
|
110 |
"words_augmentation_join_char": "",
|
111 |
"cond_check_stopwords": True,
|
112 |
"stopwords_min_cutoff": 0,
|
113 |
+
"cond_check_flagged_words": True,
|
114 |
"flagged_words_max_cutoff": 0.2,
|
115 |
"cond_check_lang_id": True,
|
116 |
"lang_id_min_cutoff": 0.75,
|
|
|
121 |
parameters_filtering_arz = {
|
122 |
"cond_uniform_whitespace": True,
|
123 |
"cond_replace_unicode_punctuation": False,
|
124 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
125 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
126 |
"cond_remove_long_words": True,
|
127 |
"length_word_max_cutoff": 25,
|
|
|
144 |
"words_augmentation_join_char": "",
|
145 |
"cond_check_stopwords": True,
|
146 |
"stopwords_min_cutoff": 0,
|
147 |
+
"cond_check_flagged_words": True,
|
148 |
"flagged_words_max_cutoff": 0.2,
|
149 |
"cond_check_lang_id": True,
|
150 |
"lang_id_min_cutoff": 0.75,
|
|
|
155 |
parameters_filtering_as = {
|
156 |
"cond_uniform_whitespace": True,
|
157 |
"cond_replace_unicode_punctuation": False,
|
158 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
159 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
160 |
"cond_remove_long_words": True,
|
161 |
"length_word_max_cutoff": 25,
|
|
|
178 |
"words_augmentation_join_char": "",
|
179 |
"cond_check_stopwords": True,
|
180 |
"stopwords_min_cutoff": 0,
|
181 |
+
"cond_check_flagged_words": True,
|
182 |
"flagged_words_max_cutoff": 0.2,
|
183 |
"cond_check_lang_id": True,
|
184 |
"lang_id_min_cutoff": 0.75,
|
|
|
189 |
parameters_filtering_bn = {
|
190 |
"cond_uniform_whitespace": True,
|
191 |
"cond_replace_unicode_punctuation": False,
|
192 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
193 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
194 |
"cond_remove_long_words": True,
|
195 |
"length_word_max_cutoff": 30,
|
|
|
212 |
"words_augmentation_join_char": "",
|
213 |
"cond_check_stopwords": True,
|
214 |
"stopwords_min_cutoff": 0.05,
|
215 |
+
"cond_check_flagged_words": True,
|
216 |
"flagged_words_max_cutoff": 0.2,
|
217 |
"cond_check_lang_id": True,
|
218 |
"lang_id_min_cutoff": 0.75,
|
|
|
246 |
"words_augmentation_join_char": "",
|
247 |
"cond_check_stopwords": True,
|
248 |
"stopwords_min_cutoff": 0.25,
|
249 |
+
"cond_check_flagged_words": True,
|
250 |
"flagged_words_max_cutoff": 0.1,
|
251 |
"cond_check_lang_id": True,
|
252 |
"lang_id_min_cutoff": 0.8,
|
|
|
291 |
parameters_filtering_es = {
|
292 |
"cond_uniform_whitespace": True,
|
293 |
"cond_replace_unicode_punctuation": False,
|
294 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
295 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
296 |
"cond_remove_long_words": True,
|
297 |
"length_word_max_cutoff": 30,
|
|
|
314 |
"words_augmentation_join_char": "",
|
315 |
"cond_check_stopwords": True,
|
316 |
"stopwords_min_cutoff": 0.2,
|
317 |
+
"cond_check_flagged_words": True,
|
318 |
"flagged_words_max_cutoff": 0.2,
|
319 |
"cond_check_lang_id": True,
|
320 |
"lang_id_min_cutoff": 0.75,
|
|
|
325 |
parameters_filtering_eu = {
|
326 |
"cond_uniform_whitespace": True,
|
327 |
"cond_replace_unicode_punctuation": False,
|
328 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
329 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
330 |
"cond_remove_long_words": True,
|
331 |
"length_word_max_cutoff": 35,
|
|
|
348 |
"words_augmentation_join_char": "",
|
349 |
"cond_check_stopwords": True,
|
350 |
"stopwords_min_cutoff": 0,
|
351 |
+
"cond_check_flagged_words": True,
|
352 |
"flagged_words_max_cutoff": 0.2,
|
353 |
"cond_check_lang_id": True,
|
354 |
"lang_id_min_cutoff": 0.75,
|
|
|
382 |
"words_augmentation_join_char": "",
|
383 |
"cond_check_stopwords": True,
|
384 |
"stopwords_min_cutoff": 0.27,
|
385 |
+
"cond_check_flagged_words": True,
|
386 |
"flagged_words_max_cutoff": 0.008,
|
387 |
"cond_check_lang_id": True,
|
388 |
"lang_id_min_cutoff": 0.8,
|
|
|
393 |
parameters_filtering_gu = {
|
394 |
"cond_uniform_whitespace": True,
|
395 |
"cond_replace_unicode_punctuation": False,
|
396 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
397 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
398 |
"cond_remove_long_words": True,
|
399 |
"length_word_max_cutoff": 30,
|
|
|
416 |
"words_augmentation_join_char": "",
|
417 |
"cond_check_stopwords": True,
|
418 |
"stopwords_min_cutoff": 0,
|
419 |
+
"cond_check_flagged_words": True,
|
420 |
"flagged_words_max_cutoff": 0.2,
|
421 |
"cond_check_lang_id": True,
|
422 |
"lang_id_min_cutoff": 0.75,
|
|
|
427 |
parameters_filtering_hi = {
|
428 |
"cond_uniform_whitespace": True,
|
429 |
"cond_replace_unicode_punctuation": False,
|
430 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
431 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
432 |
"cond_remove_long_words": True,
|
433 |
"length_word_max_cutoff": 25,
|
|
|
450 |
"words_augmentation_join_char": "",
|
451 |
"cond_check_stopwords": True,
|
452 |
"stopwords_min_cutoff": 0,
|
453 |
+
"cond_check_flagged_words": True,
|
454 |
"flagged_words_max_cutoff": 0.2,
|
455 |
"cond_check_lang_id": True,
|
456 |
"lang_id_min_cutoff": 0.75,
|
|
|
461 |
parameters_filtering_id = {
|
462 |
"cond_uniform_whitespace": True,
|
463 |
"cond_replace_unicode_punctuation": False,
|
464 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
465 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
466 |
"cond_remove_long_words": True,
|
467 |
"length_word_max_cutoff": 30,
|
468 |
"cond_check_number_words": True,
|
469 |
"tokenization": False,
|
470 |
"strip_characters": special_characters_default,
|
471 |
+
"number_words_min_cutoff": 15,
|
472 |
"number_words_max_cutoff": 100000,
|
473 |
"cond_check_character_repetition_removal": True,
|
474 |
"character_repetition_length": 10,
|
475 |
+
"character_repetition_max_cutoff": 0.15,
|
476 |
"cond_check_word_repetition_removal": True,
|
477 |
"word_repetition_length": 5,
|
478 |
+
"word_repetition_max_cutoff": 0.20,
|
479 |
"cond_check_special_characters": True,
|
480 |
"special_characters": special_characters_default,
|
481 |
+
"special_characters_max_cutoff": 0.34,
|
482 |
"cond_words_augmentation": False,
|
483 |
"words_augmentation_group_sizes": [],
|
484 |
"words_augmentation_join_char": "",
|
485 |
"cond_check_stopwords": True,
|
486 |
+
"stopwords_min_cutoff": 0.15,
|
487 |
+
"cond_check_flagged_words": True,
|
488 |
+
"flagged_words_max_cutoff": 0.01,
|
489 |
"cond_check_lang_id": True,
|
490 |
+
"lang_id_min_cutoff": 0.7,
|
491 |
"cond_check_perplexity": True,
|
492 |
+
"perplexity_max_cutoff": 5000,
|
493 |
}
|
494 |
|
495 |
parameters_filtering_kn = {
|
496 |
"cond_uniform_whitespace": True,
|
497 |
"cond_replace_unicode_punctuation": False,
|
498 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
499 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
500 |
"cond_remove_long_words": True,
|
501 |
"length_word_max_cutoff": 50,
|
|
|
518 |
"words_augmentation_join_char": "",
|
519 |
"cond_check_stopwords": True,
|
520 |
"stopwords_min_cutoff": 0,
|
521 |
+
"cond_check_flagged_words": True,
|
522 |
"flagged_words_max_cutoff": 0.2,
|
523 |
"cond_check_lang_id": True,
|
524 |
"lang_id_min_cutoff": 0.75,
|
|
|
529 |
parameters_filtering_ml = {
|
530 |
"cond_uniform_whitespace": True,
|
531 |
"cond_replace_unicode_punctuation": False,
|
532 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
533 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
534 |
"cond_remove_long_words": True,
|
535 |
"length_word_max_cutoff": 50,
|
|
|
552 |
"words_augmentation_join_char": "",
|
553 |
"cond_check_stopwords": True,
|
554 |
"stopwords_min_cutoff": 0,
|
555 |
+
"cond_check_flagged_words": True,
|
556 |
"flagged_words_max_cutoff": 0.2,
|
557 |
"cond_check_lang_id": True,
|
558 |
"lang_id_min_cutoff": 0.75,
|
|
|
563 |
parameters_filtering_mr = {
|
564 |
"cond_uniform_whitespace": True,
|
565 |
"cond_replace_unicode_punctuation": False,
|
566 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
567 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
568 |
"cond_remove_long_words": True,
|
569 |
"length_word_max_cutoff": 30,
|
|
|
586 |
"words_augmentation_join_char": "",
|
587 |
"cond_check_stopwords": True,
|
588 |
"stopwords_min_cutoff": 0,
|
589 |
+
"cond_check_flagged_words": True,
|
590 |
"flagged_words_max_cutoff": 0.2,
|
591 |
"cond_check_lang_id": True,
|
592 |
"lang_id_min_cutoff": 0.75,
|
|
|
620 |
"words_augmentation_join_char": "",
|
621 |
"cond_check_stopwords": True,
|
622 |
"stopwords_min_cutoff": 0.2,
|
623 |
+
"cond_check_flagged_words": True,
|
624 |
"flagged_words_max_cutoff": 0.007,
|
625 |
"cond_check_lang_id": True,
|
626 |
"lang_id_min_cutoff": 0.6,
|
|
|
631 |
parameters_filtering_sw = {
|
632 |
"cond_uniform_whitespace": True,
|
633 |
"cond_replace_unicode_punctuation": False,
|
634 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
635 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
636 |
"cond_remove_long_words": True,
|
637 |
"length_word_max_cutoff": 30,
|
|
|
654 |
"words_augmentation_join_char": "",
|
655 |
"cond_check_stopwords": True,
|
656 |
"stopwords_min_cutoff": 0,
|
657 |
+
"cond_check_flagged_words": True,
|
658 |
"flagged_words_max_cutoff": 0.2,
|
659 |
"cond_check_lang_id": True,
|
660 |
"lang_id_min_cutoff": 0.75,
|
|
|
665 |
parameters_filtering_ta = {
|
666 |
"cond_uniform_whitespace": True,
|
667 |
"cond_replace_unicode_punctuation": False,
|
668 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
669 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
670 |
"cond_remove_long_words": True,
|
671 |
"length_word_max_cutoff": 50,
|
|
|
688 |
"words_augmentation_join_char": "",
|
689 |
"cond_check_stopwords": True,
|
690 |
"stopwords_min_cutoff": 0,
|
691 |
+
"cond_check_flagged_words": True,
|
692 |
"flagged_words_max_cutoff": 0.2,
|
693 |
"cond_check_lang_id": True,
|
694 |
"lang_id_min_cutoff": 0.75,
|
|
|
699 |
parameters_filtering_te = {
|
700 |
"cond_uniform_whitespace": True,
|
701 |
"cond_replace_unicode_punctuation": False,
|
702 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
703 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
704 |
"cond_remove_long_words": True,
|
705 |
"length_word_max_cutoff": 35,
|
|
|
722 |
"words_augmentation_join_char": "",
|
723 |
"cond_check_stopwords": True,
|
724 |
"stopwords_min_cutoff": 0,
|
725 |
+
"cond_check_flagged_words": True,
|
726 |
"flagged_words_max_cutoff": 0.2,
|
727 |
"cond_check_lang_id": True,
|
728 |
"lang_id_min_cutoff": 0.75,
|
|
|
733 |
parameters_filtering_ur = {
|
734 |
"cond_uniform_whitespace": True,
|
735 |
"cond_replace_unicode_punctuation": False,
|
736 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
737 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
738 |
"cond_remove_long_words": True,
|
739 |
"length_word_max_cutoff": 30,
|
|
|
756 |
"words_augmentation_join_char": "",
|
757 |
"cond_check_stopwords": True,
|
758 |
"stopwords_min_cutoff": 0,
|
759 |
+
"cond_check_flagged_words": True,
|
760 |
"flagged_words_max_cutoff": 0.2,
|
761 |
"cond_check_lang_id": True,
|
762 |
"lang_id_min_cutoff": 0.75,
|
|
|
767 |
parameters_filtering_vi = {
|
768 |
"cond_uniform_whitespace": True,
|
769 |
"cond_replace_unicode_punctuation": False,
|
770 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
771 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
772 |
"cond_remove_long_words": True,
|
773 |
"length_word_max_cutoff": 30,
|
|
|
790 |
"words_augmentation_join_char": " ",
|
791 |
"cond_check_stopwords": True,
|
792 |
"stopwords_min_cutoff": 0,
|
793 |
+
"cond_check_flagged_words": True,
|
794 |
"flagged_words_max_cutoff": 0.2,
|
795 |
"cond_check_lang_id": True,
|
796 |
"lang_id_min_cutoff": 0.75,
|
|
|
801 |
parameters_filtering_yo = {
|
802 |
"cond_uniform_whitespace": True,
|
803 |
"cond_replace_unicode_punctuation": False,
|
804 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
805 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
806 |
"cond_remove_long_words": True,
|
807 |
"length_word_max_cutoff": 30,
|
|
|
824 |
"words_augmentation_join_char": "",
|
825 |
"cond_check_stopwords": True,
|
826 |
"stopwords_min_cutoff": 0,
|
827 |
+
"cond_check_flagged_words": True,
|
828 |
"flagged_words_max_cutoff": 0.2,
|
829 |
"cond_check_lang_id": True,
|
830 |
"lang_id_min_cutoff": 0.75,
|
|
|
856 |
"cond_words_augmentation": True,
|
857 |
"words_augmentation_group_sizes": [2],
|
858 |
"words_augmentation_join_char": "",
|
859 |
+
"cond_check_stopwords": True,
|
860 |
"stopwords_min_cutoff": 0,
|
861 |
+
"cond_check_flagged_words": True,
|
862 |
"flagged_words_max_cutoff": 0.2,
|
863 |
"cond_check_lang_id": True,
|
864 |
"lang_id_min_cutoff": 0.75,
|