HugoLaurencon HF staff committed on
Commit
150f094
1 Parent(s): 78a727e

Update parameters for Indonesian (and default ones)

Browse files
Files changed (1) hide show
  1. parameters_filtering.py +59 -59
parameters_filtering.py CHANGED
@@ -26,34 +26,34 @@ parameters_filtering_default = {
26
  "cond_check_number_words": True,
27
  "tokenization": False,
28
  "strip_characters": special_characters_default,
29
- "number_words_min_cutoff": 1,
30
  "number_words_max_cutoff": 100000,
31
  "cond_check_character_repetition_removal": True,
32
  "character_repetition_length": 10,
33
- "character_repetition_max_cutoff": 0.106,
34
  "cond_check_word_repetition_removal": True,
35
  "word_repetition_length": 5,
36
- "word_repetition_max_cutoff": 0.19,
37
  "cond_check_special_characters": True,
38
  "special_characters": special_characters_default,
39
  "special_characters_max_cutoff": 0.4,
40
  "cond_words_augmentation": False,
41
  "words_augmentation_group_sizes": [],
42
  "words_augmentation_join_char": "",
43
- "cond_check_stopwords": False,
44
- "stopwords_min_cutoff": 0,
45
- "cond_check_flagged_words": False,
46
- "flagged_words_max_cutoff": 0.2,
47
  "cond_check_lang_id": True,
48
  "lang_id_min_cutoff": 0.70,
49
  "cond_check_perplexity": False,
50
- "perplexity_max_cutoff": 3000000,
51
  }
52
 
53
  parameters_filtering_af = {
54
  "cond_uniform_whitespace": True,
55
  "cond_replace_unicode_punctuation": False,
56
- "cond_remove_words_with_incorrect_substrings": False,
57
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
58
  "cond_remove_long_words": True,
59
  "length_word_max_cutoff": 25,
@@ -76,7 +76,7 @@ parameters_filtering_af = {
76
  "words_augmentation_join_char": "",
77
  "cond_check_stopwords": True,
78
  "stopwords_min_cutoff": 0,
79
- "cond_check_flagged_words": False,
80
  "flagged_words_max_cutoff": 0.2,
81
  "cond_check_lang_id": True,
82
  "lang_id_min_cutoff": 0.6,
@@ -87,7 +87,7 @@ parameters_filtering_af = {
87
  parameters_filtering_ar = {
88
  "cond_uniform_whitespace": True,
89
  "cond_replace_unicode_punctuation": False,
90
- "cond_remove_words_with_incorrect_substrings": False,
91
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
92
  "cond_remove_long_words": True,
93
  "length_word_max_cutoff": 25,
@@ -110,7 +110,7 @@ parameters_filtering_ar = {
110
  "words_augmentation_join_char": "",
111
  "cond_check_stopwords": True,
112
  "stopwords_min_cutoff": 0,
113
- "cond_check_flagged_words": False,
114
  "flagged_words_max_cutoff": 0.2,
115
  "cond_check_lang_id": True,
116
  "lang_id_min_cutoff": 0.75,
@@ -121,7 +121,7 @@ parameters_filtering_ar = {
121
  parameters_filtering_arz = {
122
  "cond_uniform_whitespace": True,
123
  "cond_replace_unicode_punctuation": False,
124
- "cond_remove_words_with_incorrect_substrings": False,
125
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
126
  "cond_remove_long_words": True,
127
  "length_word_max_cutoff": 25,
@@ -144,7 +144,7 @@ parameters_filtering_arz = {
144
  "words_augmentation_join_char": "",
145
  "cond_check_stopwords": True,
146
  "stopwords_min_cutoff": 0,
147
- "cond_check_flagged_words": False,
148
  "flagged_words_max_cutoff": 0.2,
149
  "cond_check_lang_id": True,
150
  "lang_id_min_cutoff": 0.75,
@@ -155,7 +155,7 @@ parameters_filtering_arz = {
155
  parameters_filtering_as = {
156
  "cond_uniform_whitespace": True,
157
  "cond_replace_unicode_punctuation": False,
158
- "cond_remove_words_with_incorrect_substrings": False,
159
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
160
  "cond_remove_long_words": True,
161
  "length_word_max_cutoff": 25,
@@ -178,7 +178,7 @@ parameters_filtering_as = {
178
  "words_augmentation_join_char": "",
179
  "cond_check_stopwords": True,
180
  "stopwords_min_cutoff": 0,
181
- "cond_check_flagged_words": False,
182
  "flagged_words_max_cutoff": 0.2,
183
  "cond_check_lang_id": True,
184
  "lang_id_min_cutoff": 0.75,
@@ -189,7 +189,7 @@ parameters_filtering_as = {
189
  parameters_filtering_bn = {
190
  "cond_uniform_whitespace": True,
191
  "cond_replace_unicode_punctuation": False,
192
- "cond_remove_words_with_incorrect_substrings": False,
193
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
194
  "cond_remove_long_words": True,
195
  "length_word_max_cutoff": 30,
@@ -212,7 +212,7 @@ parameters_filtering_bn = {
212
  "words_augmentation_join_char": "",
213
  "cond_check_stopwords": True,
214
  "stopwords_min_cutoff": 0.05,
215
- "cond_check_flagged_words": False,
216
  "flagged_words_max_cutoff": 0.2,
217
  "cond_check_lang_id": True,
218
  "lang_id_min_cutoff": 0.75,
@@ -246,7 +246,7 @@ parameters_filtering_ca = {
246
  "words_augmentation_join_char": "",
247
  "cond_check_stopwords": True,
248
  "stopwords_min_cutoff": 0.25,
249
- "cond_check_flagged_words": False,
250
  "flagged_words_max_cutoff": 0.1,
251
  "cond_check_lang_id": True,
252
  "lang_id_min_cutoff": 0.8,
@@ -291,7 +291,7 @@ parameters_filtering_en = {
291
  parameters_filtering_es = {
292
  "cond_uniform_whitespace": True,
293
  "cond_replace_unicode_punctuation": False,
294
- "cond_remove_words_with_incorrect_substrings": False,
295
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
296
  "cond_remove_long_words": True,
297
  "length_word_max_cutoff": 30,
@@ -314,7 +314,7 @@ parameters_filtering_es = {
314
  "words_augmentation_join_char": "",
315
  "cond_check_stopwords": True,
316
  "stopwords_min_cutoff": 0.2,
317
- "cond_check_flagged_words": False,
318
  "flagged_words_max_cutoff": 0.2,
319
  "cond_check_lang_id": True,
320
  "lang_id_min_cutoff": 0.75,
@@ -325,7 +325,7 @@ parameters_filtering_es = {
325
  parameters_filtering_eu = {
326
  "cond_uniform_whitespace": True,
327
  "cond_replace_unicode_punctuation": False,
328
- "cond_remove_words_with_incorrect_substrings": False,
329
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
330
  "cond_remove_long_words": True,
331
  "length_word_max_cutoff": 35,
@@ -348,7 +348,7 @@ parameters_filtering_eu = {
348
  "words_augmentation_join_char": "",
349
  "cond_check_stopwords": True,
350
  "stopwords_min_cutoff": 0,
351
- "cond_check_flagged_words": False,
352
  "flagged_words_max_cutoff": 0.2,
353
  "cond_check_lang_id": True,
354
  "lang_id_min_cutoff": 0.75,
@@ -382,7 +382,7 @@ parameters_filtering_fr = {
382
  "words_augmentation_join_char": "",
383
  "cond_check_stopwords": True,
384
  "stopwords_min_cutoff": 0.27,
385
- "cond_check_flagged_words": False,
386
  "flagged_words_max_cutoff": 0.008,
387
  "cond_check_lang_id": True,
388
  "lang_id_min_cutoff": 0.8,
@@ -393,7 +393,7 @@ parameters_filtering_fr = {
393
  parameters_filtering_gu = {
394
  "cond_uniform_whitespace": True,
395
  "cond_replace_unicode_punctuation": False,
396
- "cond_remove_words_with_incorrect_substrings": False,
397
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
398
  "cond_remove_long_words": True,
399
  "length_word_max_cutoff": 30,
@@ -416,7 +416,7 @@ parameters_filtering_gu = {
416
  "words_augmentation_join_char": "",
417
  "cond_check_stopwords": True,
418
  "stopwords_min_cutoff": 0,
419
- "cond_check_flagged_words": False,
420
  "flagged_words_max_cutoff": 0.2,
421
  "cond_check_lang_id": True,
422
  "lang_id_min_cutoff": 0.75,
@@ -427,7 +427,7 @@ parameters_filtering_gu = {
427
  parameters_filtering_hi = {
428
  "cond_uniform_whitespace": True,
429
  "cond_replace_unicode_punctuation": False,
430
- "cond_remove_words_with_incorrect_substrings": False,
431
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
432
  "cond_remove_long_words": True,
433
  "length_word_max_cutoff": 25,
@@ -450,7 +450,7 @@ parameters_filtering_hi = {
450
  "words_augmentation_join_char": "",
451
  "cond_check_stopwords": True,
452
  "stopwords_min_cutoff": 0,
453
- "cond_check_flagged_words": False,
454
  "flagged_words_max_cutoff": 0.2,
455
  "cond_check_lang_id": True,
456
  "lang_id_min_cutoff": 0.75,
@@ -461,41 +461,41 @@ parameters_filtering_hi = {
461
  parameters_filtering_id = {
462
  "cond_uniform_whitespace": True,
463
  "cond_replace_unicode_punctuation": False,
464
- "cond_remove_words_with_incorrect_substrings": False,
465
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
466
  "cond_remove_long_words": True,
467
  "length_word_max_cutoff": 30,
468
  "cond_check_number_words": True,
469
  "tokenization": False,
470
  "strip_characters": special_characters_default,
471
- "number_words_min_cutoff": 1,
472
  "number_words_max_cutoff": 100000,
473
  "cond_check_character_repetition_removal": True,
474
  "character_repetition_length": 10,
475
- "character_repetition_max_cutoff": 0.106,
476
  "cond_check_word_repetition_removal": True,
477
  "word_repetition_length": 5,
478
- "word_repetition_max_cutoff": 0.19,
479
  "cond_check_special_characters": True,
480
  "special_characters": special_characters_default,
481
- "special_characters_max_cutoff": 0.25,
482
  "cond_words_augmentation": False,
483
  "words_augmentation_group_sizes": [],
484
  "words_augmentation_join_char": "",
485
  "cond_check_stopwords": True,
486
- "stopwords_min_cutoff": 0.25,
487
- "cond_check_flagged_words": False,
488
- "flagged_words_max_cutoff": 0.2,
489
  "cond_check_lang_id": True,
490
- "lang_id_min_cutoff": 0.75,
491
  "cond_check_perplexity": True,
492
- "perplexity_max_cutoff": 2500000,
493
  }
494
 
495
  parameters_filtering_kn = {
496
  "cond_uniform_whitespace": True,
497
  "cond_replace_unicode_punctuation": False,
498
- "cond_remove_words_with_incorrect_substrings": False,
499
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
500
  "cond_remove_long_words": True,
501
  "length_word_max_cutoff": 50,
@@ -518,7 +518,7 @@ parameters_filtering_kn = {
518
  "words_augmentation_join_char": "",
519
  "cond_check_stopwords": True,
520
  "stopwords_min_cutoff": 0,
521
- "cond_check_flagged_words": False,
522
  "flagged_words_max_cutoff": 0.2,
523
  "cond_check_lang_id": True,
524
  "lang_id_min_cutoff": 0.75,
@@ -529,7 +529,7 @@ parameters_filtering_kn = {
529
  parameters_filtering_ml = {
530
  "cond_uniform_whitespace": True,
531
  "cond_replace_unicode_punctuation": False,
532
- "cond_remove_words_with_incorrect_substrings": False,
533
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
534
  "cond_remove_long_words": True,
535
  "length_word_max_cutoff": 50,
@@ -552,7 +552,7 @@ parameters_filtering_ml = {
552
  "words_augmentation_join_char": "",
553
  "cond_check_stopwords": True,
554
  "stopwords_min_cutoff": 0,
555
- "cond_check_flagged_words": False,
556
  "flagged_words_max_cutoff": 0.2,
557
  "cond_check_lang_id": True,
558
  "lang_id_min_cutoff": 0.75,
@@ -563,7 +563,7 @@ parameters_filtering_ml = {
563
  parameters_filtering_mr = {
564
  "cond_uniform_whitespace": True,
565
  "cond_replace_unicode_punctuation": False,
566
- "cond_remove_words_with_incorrect_substrings": False,
567
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
568
  "cond_remove_long_words": True,
569
  "length_word_max_cutoff": 30,
@@ -586,7 +586,7 @@ parameters_filtering_mr = {
586
  "words_augmentation_join_char": "",
587
  "cond_check_stopwords": True,
588
  "stopwords_min_cutoff": 0,
589
- "cond_check_flagged_words": False,
590
  "flagged_words_max_cutoff": 0.2,
591
  "cond_check_lang_id": True,
592
  "lang_id_min_cutoff": 0.75,
@@ -620,7 +620,7 @@ parameters_filtering_pt = {
620
  "words_augmentation_join_char": "",
621
  "cond_check_stopwords": True,
622
  "stopwords_min_cutoff": 0.2,
623
- "cond_check_flagged_words": False,
624
  "flagged_words_max_cutoff": 0.007,
625
  "cond_check_lang_id": True,
626
  "lang_id_min_cutoff": 0.6,
@@ -631,7 +631,7 @@ parameters_filtering_pt = {
631
  parameters_filtering_sw = {
632
  "cond_uniform_whitespace": True,
633
  "cond_replace_unicode_punctuation": False,
634
- "cond_remove_words_with_incorrect_substrings": False,
635
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
636
  "cond_remove_long_words": True,
637
  "length_word_max_cutoff": 30,
@@ -654,7 +654,7 @@ parameters_filtering_sw = {
654
  "words_augmentation_join_char": "",
655
  "cond_check_stopwords": True,
656
  "stopwords_min_cutoff": 0,
657
- "cond_check_flagged_words": False,
658
  "flagged_words_max_cutoff": 0.2,
659
  "cond_check_lang_id": True,
660
  "lang_id_min_cutoff": 0.75,
@@ -665,7 +665,7 @@ parameters_filtering_sw = {
665
  parameters_filtering_ta = {
666
  "cond_uniform_whitespace": True,
667
  "cond_replace_unicode_punctuation": False,
668
- "cond_remove_words_with_incorrect_substrings": False,
669
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
670
  "cond_remove_long_words": True,
671
  "length_word_max_cutoff": 50,
@@ -688,7 +688,7 @@ parameters_filtering_ta = {
688
  "words_augmentation_join_char": "",
689
  "cond_check_stopwords": True,
690
  "stopwords_min_cutoff": 0,
691
- "cond_check_flagged_words": False,
692
  "flagged_words_max_cutoff": 0.2,
693
  "cond_check_lang_id": True,
694
  "lang_id_min_cutoff": 0.75,
@@ -699,7 +699,7 @@ parameters_filtering_ta = {
699
  parameters_filtering_te = {
700
  "cond_uniform_whitespace": True,
701
  "cond_replace_unicode_punctuation": False,
702
- "cond_remove_words_with_incorrect_substrings": False,
703
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
704
  "cond_remove_long_words": True,
705
  "length_word_max_cutoff": 35,
@@ -722,7 +722,7 @@ parameters_filtering_te = {
722
  "words_augmentation_join_char": "",
723
  "cond_check_stopwords": True,
724
  "stopwords_min_cutoff": 0,
725
- "cond_check_flagged_words": False,
726
  "flagged_words_max_cutoff": 0.2,
727
  "cond_check_lang_id": True,
728
  "lang_id_min_cutoff": 0.75,
@@ -733,7 +733,7 @@ parameters_filtering_te = {
733
  parameters_filtering_ur = {
734
  "cond_uniform_whitespace": True,
735
  "cond_replace_unicode_punctuation": False,
736
- "cond_remove_words_with_incorrect_substrings": False,
737
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
738
  "cond_remove_long_words": True,
739
  "length_word_max_cutoff": 30,
@@ -756,7 +756,7 @@ parameters_filtering_ur = {
756
  "words_augmentation_join_char": "",
757
  "cond_check_stopwords": True,
758
  "stopwords_min_cutoff": 0,
759
- "cond_check_flagged_words": False,
760
  "flagged_words_max_cutoff": 0.2,
761
  "cond_check_lang_id": True,
762
  "lang_id_min_cutoff": 0.75,
@@ -767,7 +767,7 @@ parameters_filtering_ur = {
767
  parameters_filtering_vi = {
768
  "cond_uniform_whitespace": True,
769
  "cond_replace_unicode_punctuation": False,
770
- "cond_remove_words_with_incorrect_substrings": False,
771
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
772
  "cond_remove_long_words": True,
773
  "length_word_max_cutoff": 30,
@@ -790,7 +790,7 @@ parameters_filtering_vi = {
790
  "words_augmentation_join_char": " ",
791
  "cond_check_stopwords": True,
792
  "stopwords_min_cutoff": 0,
793
- "cond_check_flagged_words": False,
794
  "flagged_words_max_cutoff": 0.2,
795
  "cond_check_lang_id": True,
796
  "lang_id_min_cutoff": 0.75,
@@ -801,7 +801,7 @@ parameters_filtering_vi = {
801
  parameters_filtering_yo = {
802
  "cond_uniform_whitespace": True,
803
  "cond_replace_unicode_punctuation": False,
804
- "cond_remove_words_with_incorrect_substrings": False,
805
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
806
  "cond_remove_long_words": True,
807
  "length_word_max_cutoff": 30,
@@ -824,7 +824,7 @@ parameters_filtering_yo = {
824
  "words_augmentation_join_char": "",
825
  "cond_check_stopwords": True,
826
  "stopwords_min_cutoff": 0,
827
- "cond_check_flagged_words": False,
828
  "flagged_words_max_cutoff": 0.2,
829
  "cond_check_lang_id": True,
830
  "lang_id_min_cutoff": 0.75,
@@ -856,9 +856,9 @@ parameters_filtering_zh = {
856
  "cond_words_augmentation": True,
857
  "words_augmentation_group_sizes": [2],
858
  "words_augmentation_join_char": "",
859
- "cond_check_stopwords": False,
860
  "stopwords_min_cutoff": 0,
861
- "cond_check_flagged_words": False,
862
  "flagged_words_max_cutoff": 0.2,
863
  "cond_check_lang_id": True,
864
  "lang_id_min_cutoff": 0.75,
26
  "cond_check_number_words": True,
27
  "tokenization": False,
28
  "strip_characters": special_characters_default,
29
+ "number_words_min_cutoff": 10,
30
  "number_words_max_cutoff": 100000,
31
  "cond_check_character_repetition_removal": True,
32
  "character_repetition_length": 10,
33
+ "character_repetition_max_cutoff": 0.2,
34
  "cond_check_word_repetition_removal": True,
35
  "word_repetition_length": 5,
36
+ "word_repetition_max_cutoff": 0.3,
37
  "cond_check_special_characters": True,
38
  "special_characters": special_characters_default,
39
  "special_characters_max_cutoff": 0.4,
40
  "cond_words_augmentation": False,
41
  "words_augmentation_group_sizes": [],
42
  "words_augmentation_join_char": "",
43
+ "cond_check_stopwords": True,
44
+ "stopwords_min_cutoff": 0.1,
45
+ "cond_check_flagged_words": True,
46
+ "flagged_words_max_cutoff": 0.1,
47
  "cond_check_lang_id": True,
48
  "lang_id_min_cutoff": 0.70,
49
  "cond_check_perplexity": False,
50
+ "perplexity_max_cutoff": 10000,
51
  }
52
 
53
  parameters_filtering_af = {
54
  "cond_uniform_whitespace": True,
55
  "cond_replace_unicode_punctuation": False,
56
+ "cond_remove_words_with_incorrect_substrings": True,
57
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
58
  "cond_remove_long_words": True,
59
  "length_word_max_cutoff": 25,
76
  "words_augmentation_join_char": "",
77
  "cond_check_stopwords": True,
78
  "stopwords_min_cutoff": 0,
79
+ "cond_check_flagged_words": True,
80
  "flagged_words_max_cutoff": 0.2,
81
  "cond_check_lang_id": True,
82
  "lang_id_min_cutoff": 0.6,
87
  parameters_filtering_ar = {
88
  "cond_uniform_whitespace": True,
89
  "cond_replace_unicode_punctuation": False,
90
+ "cond_remove_words_with_incorrect_substrings": True,
91
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
92
  "cond_remove_long_words": True,
93
  "length_word_max_cutoff": 25,
110
  "words_augmentation_join_char": "",
111
  "cond_check_stopwords": True,
112
  "stopwords_min_cutoff": 0,
113
+ "cond_check_flagged_words": True,
114
  "flagged_words_max_cutoff": 0.2,
115
  "cond_check_lang_id": True,
116
  "lang_id_min_cutoff": 0.75,
121
  parameters_filtering_arz = {
122
  "cond_uniform_whitespace": True,
123
  "cond_replace_unicode_punctuation": False,
124
+ "cond_remove_words_with_incorrect_substrings": True,
125
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
126
  "cond_remove_long_words": True,
127
  "length_word_max_cutoff": 25,
144
  "words_augmentation_join_char": "",
145
  "cond_check_stopwords": True,
146
  "stopwords_min_cutoff": 0,
147
+ "cond_check_flagged_words": True,
148
  "flagged_words_max_cutoff": 0.2,
149
  "cond_check_lang_id": True,
150
  "lang_id_min_cutoff": 0.75,
155
  parameters_filtering_as = {
156
  "cond_uniform_whitespace": True,
157
  "cond_replace_unicode_punctuation": False,
158
+ "cond_remove_words_with_incorrect_substrings": True,
159
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
160
  "cond_remove_long_words": True,
161
  "length_word_max_cutoff": 25,
178
  "words_augmentation_join_char": "",
179
  "cond_check_stopwords": True,
180
  "stopwords_min_cutoff": 0,
181
+ "cond_check_flagged_words": True,
182
  "flagged_words_max_cutoff": 0.2,
183
  "cond_check_lang_id": True,
184
  "lang_id_min_cutoff": 0.75,
189
  parameters_filtering_bn = {
190
  "cond_uniform_whitespace": True,
191
  "cond_replace_unicode_punctuation": False,
192
+ "cond_remove_words_with_incorrect_substrings": True,
193
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
194
  "cond_remove_long_words": True,
195
  "length_word_max_cutoff": 30,
212
  "words_augmentation_join_char": "",
213
  "cond_check_stopwords": True,
214
  "stopwords_min_cutoff": 0.05,
215
+ "cond_check_flagged_words": True,
216
  "flagged_words_max_cutoff": 0.2,
217
  "cond_check_lang_id": True,
218
  "lang_id_min_cutoff": 0.75,
246
  "words_augmentation_join_char": "",
247
  "cond_check_stopwords": True,
248
  "stopwords_min_cutoff": 0.25,
249
+ "cond_check_flagged_words": True,
250
  "flagged_words_max_cutoff": 0.1,
251
  "cond_check_lang_id": True,
252
  "lang_id_min_cutoff": 0.8,
291
  parameters_filtering_es = {
292
  "cond_uniform_whitespace": True,
293
  "cond_replace_unicode_punctuation": False,
294
+ "cond_remove_words_with_incorrect_substrings": True,
295
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
296
  "cond_remove_long_words": True,
297
  "length_word_max_cutoff": 30,
314
  "words_augmentation_join_char": "",
315
  "cond_check_stopwords": True,
316
  "stopwords_min_cutoff": 0.2,
317
+ "cond_check_flagged_words": True,
318
  "flagged_words_max_cutoff": 0.2,
319
  "cond_check_lang_id": True,
320
  "lang_id_min_cutoff": 0.75,
325
  parameters_filtering_eu = {
326
  "cond_uniform_whitespace": True,
327
  "cond_replace_unicode_punctuation": False,
328
+ "cond_remove_words_with_incorrect_substrings": True,
329
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
330
  "cond_remove_long_words": True,
331
  "length_word_max_cutoff": 35,
348
  "words_augmentation_join_char": "",
349
  "cond_check_stopwords": True,
350
  "stopwords_min_cutoff": 0,
351
+ "cond_check_flagged_words": True,
352
  "flagged_words_max_cutoff": 0.2,
353
  "cond_check_lang_id": True,
354
  "lang_id_min_cutoff": 0.75,
382
  "words_augmentation_join_char": "",
383
  "cond_check_stopwords": True,
384
  "stopwords_min_cutoff": 0.27,
385
+ "cond_check_flagged_words": True,
386
  "flagged_words_max_cutoff": 0.008,
387
  "cond_check_lang_id": True,
388
  "lang_id_min_cutoff": 0.8,
393
  parameters_filtering_gu = {
394
  "cond_uniform_whitespace": True,
395
  "cond_replace_unicode_punctuation": False,
396
+ "cond_remove_words_with_incorrect_substrings": True,
397
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
398
  "cond_remove_long_words": True,
399
  "length_word_max_cutoff": 30,
416
  "words_augmentation_join_char": "",
417
  "cond_check_stopwords": True,
418
  "stopwords_min_cutoff": 0,
419
+ "cond_check_flagged_words": True,
420
  "flagged_words_max_cutoff": 0.2,
421
  "cond_check_lang_id": True,
422
  "lang_id_min_cutoff": 0.75,
427
  parameters_filtering_hi = {
428
  "cond_uniform_whitespace": True,
429
  "cond_replace_unicode_punctuation": False,
430
+ "cond_remove_words_with_incorrect_substrings": True,
431
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
432
  "cond_remove_long_words": True,
433
  "length_word_max_cutoff": 25,
450
  "words_augmentation_join_char": "",
451
  "cond_check_stopwords": True,
452
  "stopwords_min_cutoff": 0,
453
+ "cond_check_flagged_words": True,
454
  "flagged_words_max_cutoff": 0.2,
455
  "cond_check_lang_id": True,
456
  "lang_id_min_cutoff": 0.75,
461
  parameters_filtering_id = {
462
  "cond_uniform_whitespace": True,
463
  "cond_replace_unicode_punctuation": False,
464
+ "cond_remove_words_with_incorrect_substrings": True,
465
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
466
  "cond_remove_long_words": True,
467
  "length_word_max_cutoff": 30,
468
  "cond_check_number_words": True,
469
  "tokenization": False,
470
  "strip_characters": special_characters_default,
471
+ "number_words_min_cutoff": 15,
472
  "number_words_max_cutoff": 100000,
473
  "cond_check_character_repetition_removal": True,
474
  "character_repetition_length": 10,
475
+ "character_repetition_max_cutoff": 0.15,
476
  "cond_check_word_repetition_removal": True,
477
  "word_repetition_length": 5,
478
+ "word_repetition_max_cutoff": 0.20,
479
  "cond_check_special_characters": True,
480
  "special_characters": special_characters_default,
481
+ "special_characters_max_cutoff": 0.34,
482
  "cond_words_augmentation": False,
483
  "words_augmentation_group_sizes": [],
484
  "words_augmentation_join_char": "",
485
  "cond_check_stopwords": True,
486
+ "stopwords_min_cutoff": 0.15,
487
+ "cond_check_flagged_words": True,
488
+ "flagged_words_max_cutoff": 0.01,
489
  "cond_check_lang_id": True,
490
+ "lang_id_min_cutoff": 0.7,
491
  "cond_check_perplexity": True,
492
+ "perplexity_max_cutoff": 5000,
493
  }
494
 
495
  parameters_filtering_kn = {
496
  "cond_uniform_whitespace": True,
497
  "cond_replace_unicode_punctuation": False,
498
+ "cond_remove_words_with_incorrect_substrings": True,
499
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
500
  "cond_remove_long_words": True,
501
  "length_word_max_cutoff": 50,
518
  "words_augmentation_join_char": "",
519
  "cond_check_stopwords": True,
520
  "stopwords_min_cutoff": 0,
521
+ "cond_check_flagged_words": True,
522
  "flagged_words_max_cutoff": 0.2,
523
  "cond_check_lang_id": True,
524
  "lang_id_min_cutoff": 0.75,
529
  parameters_filtering_ml = {
530
  "cond_uniform_whitespace": True,
531
  "cond_replace_unicode_punctuation": False,
532
+ "cond_remove_words_with_incorrect_substrings": True,
533
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
534
  "cond_remove_long_words": True,
535
  "length_word_max_cutoff": 50,
552
  "words_augmentation_join_char": "",
553
  "cond_check_stopwords": True,
554
  "stopwords_min_cutoff": 0,
555
+ "cond_check_flagged_words": True,
556
  "flagged_words_max_cutoff": 0.2,
557
  "cond_check_lang_id": True,
558
  "lang_id_min_cutoff": 0.75,
563
  parameters_filtering_mr = {
564
  "cond_uniform_whitespace": True,
565
  "cond_replace_unicode_punctuation": False,
566
+ "cond_remove_words_with_incorrect_substrings": True,
567
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
568
  "cond_remove_long_words": True,
569
  "length_word_max_cutoff": 30,
586
  "words_augmentation_join_char": "",
587
  "cond_check_stopwords": True,
588
  "stopwords_min_cutoff": 0,
589
+ "cond_check_flagged_words": True,
590
  "flagged_words_max_cutoff": 0.2,
591
  "cond_check_lang_id": True,
592
  "lang_id_min_cutoff": 0.75,
620
  "words_augmentation_join_char": "",
621
  "cond_check_stopwords": True,
622
  "stopwords_min_cutoff": 0.2,
623
+ "cond_check_flagged_words": True,
624
  "flagged_words_max_cutoff": 0.007,
625
  "cond_check_lang_id": True,
626
  "lang_id_min_cutoff": 0.6,
631
  parameters_filtering_sw = {
632
  "cond_uniform_whitespace": True,
633
  "cond_replace_unicode_punctuation": False,
634
+ "cond_remove_words_with_incorrect_substrings": True,
635
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
636
  "cond_remove_long_words": True,
637
  "length_word_max_cutoff": 30,
654
  "words_augmentation_join_char": "",
655
  "cond_check_stopwords": True,
656
  "stopwords_min_cutoff": 0,
657
+ "cond_check_flagged_words": True,
658
  "flagged_words_max_cutoff": 0.2,
659
  "cond_check_lang_id": True,
660
  "lang_id_min_cutoff": 0.75,
665
  parameters_filtering_ta = {
666
  "cond_uniform_whitespace": True,
667
  "cond_replace_unicode_punctuation": False,
668
+ "cond_remove_words_with_incorrect_substrings": True,
669
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
670
  "cond_remove_long_words": True,
671
  "length_word_max_cutoff": 50,
688
  "words_augmentation_join_char": "",
689
  "cond_check_stopwords": True,
690
  "stopwords_min_cutoff": 0,
691
+ "cond_check_flagged_words": True,
692
  "flagged_words_max_cutoff": 0.2,
693
  "cond_check_lang_id": True,
694
  "lang_id_min_cutoff": 0.75,
699
  parameters_filtering_te = {
700
  "cond_uniform_whitespace": True,
701
  "cond_replace_unicode_punctuation": False,
702
+ "cond_remove_words_with_incorrect_substrings": True,
703
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
704
  "cond_remove_long_words": True,
705
  "length_word_max_cutoff": 35,
722
  "words_augmentation_join_char": "",
723
  "cond_check_stopwords": True,
724
  "stopwords_min_cutoff": 0,
725
+ "cond_check_flagged_words": True,
726
  "flagged_words_max_cutoff": 0.2,
727
  "cond_check_lang_id": True,
728
  "lang_id_min_cutoff": 0.75,
733
  parameters_filtering_ur = {
734
  "cond_uniform_whitespace": True,
735
  "cond_replace_unicode_punctuation": False,
736
+ "cond_remove_words_with_incorrect_substrings": True,
737
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
738
  "cond_remove_long_words": True,
739
  "length_word_max_cutoff": 30,
756
  "words_augmentation_join_char": "",
757
  "cond_check_stopwords": True,
758
  "stopwords_min_cutoff": 0,
759
+ "cond_check_flagged_words": True,
760
  "flagged_words_max_cutoff": 0.2,
761
  "cond_check_lang_id": True,
762
  "lang_id_min_cutoff": 0.75,
767
  parameters_filtering_vi = {
768
  "cond_uniform_whitespace": True,
769
  "cond_replace_unicode_punctuation": False,
770
+ "cond_remove_words_with_incorrect_substrings": True,
771
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
772
  "cond_remove_long_words": True,
773
  "length_word_max_cutoff": 30,
790
  "words_augmentation_join_char": " ",
791
  "cond_check_stopwords": True,
792
  "stopwords_min_cutoff": 0,
793
+ "cond_check_flagged_words": True,
794
  "flagged_words_max_cutoff": 0.2,
795
  "cond_check_lang_id": True,
796
  "lang_id_min_cutoff": 0.75,
801
  parameters_filtering_yo = {
802
  "cond_uniform_whitespace": True,
803
  "cond_replace_unicode_punctuation": False,
804
+ "cond_remove_words_with_incorrect_substrings": True,
805
  "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
806
  "cond_remove_long_words": True,
807
  "length_word_max_cutoff": 30,
824
  "words_augmentation_join_char": "",
825
  "cond_check_stopwords": True,
826
  "stopwords_min_cutoff": 0,
827
+ "cond_check_flagged_words": True,
828
  "flagged_words_max_cutoff": 0.2,
829
  "cond_check_lang_id": True,
830
  "lang_id_min_cutoff": 0.75,
856
  "cond_words_augmentation": True,
857
  "words_augmentation_group_sizes": [2],
858
  "words_augmentation_join_char": "",
859
+ "cond_check_stopwords": True,
860
  "stopwords_min_cutoff": 0,
861
+ "cond_check_flagged_words": True,
862
  "flagged_words_max_cutoff": 0.2,
863
  "cond_check_lang_id": True,
864
  "lang_id_min_cutoff": 0.75,