import string

import emoji

# Filtering parameter presets, one dict per language code plus a "default"
# preset. Every preset has exactly the same keys; the language presets only
# differ from the default in a handful of tuned values, so they are built by
# overlaying overrides on the default (see `_make_parameters`).

# ASCII punctuation, digits and whitespace.
main_special_characters = string.punctuation + string.digits + string.whitespace

# Extra non-ASCII symbols. Several of these characters are invisible
# (e.g. soft hyphen, zero-width space) -- do not retype this literal by hand.
other_special_characters = (
    "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
    "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
    "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
    "」﴾》"
)

# `emoji.UNICODE_EMOJI` was removed in emoji 2.0; newer releases expose the
# emoji strings as the keys of `emoji.EMOJI_DATA`. Prefer the legacy table
# when it exists so behaviour on old installations is unchanged.
_emoji_table = getattr(emoji, "UNICODE_EMOJI", None)
_emoji_source = emoji.EMOJI_DATA if _emoji_table is None else _emoji_table["en"]

# NOTE: this rebinds the name `emoji` from the imported module to a list of
# emoji strings. Kept as-is because the list is a module-level name that
# importers of this file may already rely on.
emoji = list(_emoji_source)

# Set of single characters from the two strings above, plus every emoji
# string (emoji entries may be longer than one code point).
special_characters_default = set(main_special_characters + other_special_characters)
special_characters_default.update(emoji)

# Baseline preset. Its key order defines the key order of every other preset.
parameters_filtering_default = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": False,
    "length_word_max_cutoff": 50,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "cond_check_character_repetition_removal": True,
    "character_repetition_length": 10,
    "character_repetition_max_cutoff": 0.106,
    "cond_check_word_repetition_removal": True,
    "word_repetition_length": 5,
    "word_repetition_max_cutoff": 0.19,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.4,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": False,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.70,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}

# Values shared by every language preset except "zh" (which keeps long words
# and skips the stopword check).
_LANGUAGE_BASELINE = {
    "cond_remove_long_words": True,
    "cond_check_stopwords": True,
    "lang_id_min_cutoff": 0.75,
}


def _make_parameters(**overrides):
    """Return a new preset: the default preset overlaid with ``overrides``.

    List values are copied so that each preset owns its own lists, exactly as
    separate dict literals would. The special-character set is deliberately
    shared (not copied) between all presets.

    Raises:
        KeyError: if an override names a key the default preset lacks, which
            would otherwise silently create a misspelled parameter.
    """
    unknown = sorted(set(overrides) - set(parameters_filtering_default))
    if unknown:
        raise KeyError("unknown filtering parameter(s): " + ", ".join(unknown))
    params = {
        key: list(value) if isinstance(value, list) else value
        for key, value in parameters_filtering_default.items()
    }
    # All override keys already exist, so this keeps the default key order.
    params.update(overrides)
    return params


def _language_parameters(**overrides):
    """Return a preset built from the default plus the shared language baseline."""
    merged = dict(_LANGUAGE_BASELINE)
    merged.update(overrides)
    return _make_parameters(**merged)


parameters_filtering_af = _language_parameters(
    length_word_max_cutoff=25,
    special_characters_max_cutoff=0.3,
    lang_id_min_cutoff=0.6,
    cond_check_perplexity=True,
    perplexity_max_cutoff=3000000,
)

parameters_filtering_ar = _language_parameters(
    length_word_max_cutoff=25,
    special_characters_max_cutoff=0.45,
    cond_check_perplexity=True,
    perplexity_max_cutoff=1000000,
)

parameters_filtering_arz = _language_parameters(
    length_word_max_cutoff=25,
    special_characters_max_cutoff=0.5,
)

parameters_filtering_as = _language_parameters(
    length_word_max_cutoff=25,
    special_characters_max_cutoff=0.25,
)

parameters_filtering_bn = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.275,
    stopwords_min_cutoff=0.05,
    # Perplexity check is off for "bn", but the tuned cutoff is kept.
    perplexity_max_cutoff=575000,
)

parameters_filtering_ca = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.35,
    cond_check_perplexity=True,
    perplexity_max_cutoff=1750000,
)

parameters_filtering_en = _language_parameters(
    cond_remove_words_with_incorrect_substrings=True,
    length_word_max_cutoff=25,
    number_words_min_cutoff=20,
    special_characters_max_cutoff=0.4,
    stopwords_min_cutoff=0.3,
    cond_check_flagged_words=True,
    flagged_words_max_cutoff=0.045,
    lang_id_min_cutoff=0.80,
    cond_check_perplexity=True,
    perplexity_max_cutoff=2500,
)

parameters_filtering_es = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.3,
    stopwords_min_cutoff=0.2,
    cond_check_perplexity=True,
    perplexity_max_cutoff=2500000,
)

parameters_filtering_eu = _language_parameters(
    length_word_max_cutoff=35,
    special_characters_max_cutoff=0.3,
)

parameters_filtering_fr = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.35,
    stopwords_min_cutoff=0.15,
    cond_check_perplexity=True,
    perplexity_max_cutoff=3000000,
)

parameters_filtering_gu = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.3,
    cond_check_perplexity=True,
    perplexity_max_cutoff=250000,
)

parameters_filtering_hi = _language_parameters(
    length_word_max_cutoff=25,
    special_characters_max_cutoff=0.35,
    cond_check_perplexity=True,
    perplexity_max_cutoff=600000,
)

parameters_filtering_id = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.25,
    stopwords_min_cutoff=0.25,
    cond_check_perplexity=True,
    perplexity_max_cutoff=2500000,
)

parameters_filtering_kn = _language_parameters(
    length_word_max_cutoff=50,
    special_characters_max_cutoff=0.25,
    cond_check_perplexity=True,
    perplexity_max_cutoff=400000,
)

parameters_filtering_ml = _language_parameters(
    length_word_max_cutoff=50,
    special_characters_max_cutoff=0.2,
    cond_check_perplexity=True,
    perplexity_max_cutoff=1600000,
)

parameters_filtering_mr = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.25,
    cond_check_perplexity=True,
    perplexity_max_cutoff=425000,
)

parameters_filtering_pt = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.3,
    stopwords_min_cutoff=0.15,
    cond_check_perplexity=True,
    perplexity_max_cutoff=3000000,
)

parameters_filtering_sw = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.275,
)

parameters_filtering_ta = _language_parameters(
    length_word_max_cutoff=50,
    special_characters_max_cutoff=0.25,
)

parameters_filtering_te = _language_parameters(
    length_word_max_cutoff=35,
    special_characters_max_cutoff=0.25,
)

parameters_filtering_ur = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.4,
)

parameters_filtering_vi = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.35,
    cond_words_augmentation=True,
    words_augmentation_group_sizes=[2],
    words_augmentation_join_char=" ",
)

parameters_filtering_yo = _language_parameters(
    length_word_max_cutoff=30,
    special_characters_max_cutoff=0.3,
)

# "zh" does not use the shared language baseline: long-word removal and the
# stopword check stay off, and tokenization is switched on.
parameters_filtering_zh = _make_parameters(
    length_word_max_cutoff=1000,
    tokenization=True,
    special_characters_max_cutoff=0.4,
    cond_words_augmentation=True,
    words_augmentation_group_sizes=[2],
    words_augmentation_join_char="",
    lang_id_min_cutoff=0.75,
)

# Lookup table from language code to its preset.
parameters_filtering = {
    "default": parameters_filtering_default,
    "af": parameters_filtering_af,
    "ar": parameters_filtering_ar,
    "arz": parameters_filtering_arz,
    "as": parameters_filtering_as,
    "bn": parameters_filtering_bn,
    "ca": parameters_filtering_ca,
    "en": parameters_filtering_en,
    "es": parameters_filtering_es,
    "eu": parameters_filtering_eu,
    "fr": parameters_filtering_fr,
    "gu": parameters_filtering_gu,
    "hi": parameters_filtering_hi,
    "id": parameters_filtering_id,
    "kn": parameters_filtering_kn,
    "ml": parameters_filtering_ml,
    "mr": parameters_filtering_mr,
    "pt": parameters_filtering_pt,
    "sw": parameters_filtering_sw,
    "ta": parameters_filtering_ta,
    "te": parameters_filtering_te,
    "ur": parameters_filtering_ur,
    "vi": parameters_filtering_vi,
    "yo": parameters_filtering_yo,
    "zh": parameters_filtering_zh,
}