import re
import pathlib

import numpy as np
import fasttext
import sentencepiece
import kenlm

from languages_id import langs_id
from parameters_filtering import parameters_filtering
from normalization import normalization
from stopwords import stopwords
from flagged_words import flagged_words


class LoadParameters:
    @staticmethod
    def load_parameters(lang_dataset_id):
        if lang_dataset_id in parameters_filtering:
            param = parameters_filtering[lang_dataset_id]
        else:
            param = parameters_filtering["default"]
        return param

    @staticmethod
    def load_stopwords(lang_dataset_id):
        stopwords_lang_id = langs_id.loc[
            langs_id["dataset_id"] == lang_dataset_id, "stopwords_id"
        ].iloc[0]
        if stopwords_lang_id:
            stopwords_lang = set(stopwords[stopwords_lang_id])
        else:
            stopwords_lang = None
        return stopwords_lang

    @staticmethod
    def load_flagged_words(lang_dataset_id):
        flagged_words_lang_id = langs_id.loc[
            langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id"
        ].iloc[0]
        if flagged_words_lang_id:
            flagged_words_lang = set(flagged_words[flagged_words_lang_id])
        else:
            flagged_words_lang = None
        return flagged_words_lang

    @staticmethod
    def load_model_lang_id(lang_dataset_id, path_fasttext_model):
        fasttext_lang_id = langs_id.loc[
            langs_id["dataset_id"] == lang_dataset_id, "fasttext_id"
        ].iloc[0]
        if fasttext_lang_id:
            model_lang_id = fasttext.load_model(path_fasttext_model)
        else:
            model_lang_id = None
        return model_lang_id

    @staticmethod
    def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model):
        sentencepiece_lang_id = langs_id.loc[
            langs_id["dataset_id"] == lang_dataset_id, "sentencepiece_id"
        ].iloc[0]
        if sentencepiece_lang_id:
            sentencepiece_model = sentencepiece.SentencePieceProcessor()
            sentencepiece_model.load(path_sentencepiece_model)
        else:
            sentencepiece_model = None
        return sentencepiece_model

    @staticmethod
    def load_kenlm_model(lang_dataset_id, path_kenlm_model):
        kenlm_lang_id = langs_id.loc[
            langs_id["dataset_id"] == lang_dataset_id, "kenlm_id"
        ].iloc[0]
        if kenlm_lang_id:
            kenlm_model = kenlm.Model(path_kenlm_model)
        else:
            kenlm_model = None
        return kenlm_model


class ModifyingDocuments:
    @staticmethod
    def remove_empty_el_from_list(list_):
        return [el for el in list_ if el]

    @staticmethod
    def remove_non_printing_characters(document, non_printing_characters_re):
        return non_printing_characters_re.sub("", document)

    @staticmethod
    def uniform_whitespace(
        document,
        # Assumed default set of Unicode space characters mapped to a plain
        # space; the exact characters may differ and can be adjusted.
        whitespace=[
            " ",
            "\u00a0",  # no-break space
            "\u2002",  # en space
            "\u2003",  # em space
            "\u2004",
            "\u2005",
            "\u2006",
            "\u2009",  # thin space
            "\u200a",  # hair space
            "\u3000",  # ideographic space
            "\ufffc",  # object replacement character
            "\u200b",  # zero-width space
        ],
    ):
        """There are different whitespace characters."""
        whitespace = set(whitespace)
        document = "".join(
            [char if char not in whitespace else " " for char in document]
        )
        return document

    @staticmethod
    def replace_digits_with_zeros(document, digits_re):
        return digits_re.sub("0", document)

    @staticmethod
    def replace_unicode_punctuation(document, unicode_punctuation):
        return "".join(unicode_punctuation.get(c, c) for c in document)

    @staticmethod
    def normalization(
        document,
        remove_non_printing_characters,
        strip,
        lower_case,
        uniform_whitespace,
        replace_digits_with_zeros,
        replace_unicode_punctuation,
        non_printing_characters_re=normalization["non_printing_characters_re"],
        digits_re=normalization["digits_re"],
        unicode_punctuation=normalization["unicode_punctuation"],
    ):
        if remove_non_printing_characters:
            document = ModifyingDocuments.remove_non_printing_characters(
                document, non_printing_characters_re
            )
        if strip:
            document = document.strip()
        if not document:
            return document
        if lower_case:
            document = document.lower()
        if uniform_whitespace:
            document = ModifyingDocuments.uniform_whitespace(document)
        if replace_digits_with_zeros:
            document = ModifyingDocuments.replace_digits_with_zeros(
                document, digits_re
            )
        if replace_unicode_punctuation:
            document = ModifyingDocuments.replace_unicode_punctuation(
                document, unicode_punctuation
            )
        return document

    @staticmethod
    def tokenization(document, sentencepiece_model, join_on_whitespace):
        document_tokenized = sentencepiece_model.encode_as_pieces(document)
        if join_on_whitespace:
            document_tokenized = " ".join(document_tokenized)
        return document_tokenized

    @staticmethod
    def split_on_whitespace(
        document,
        new_line=False,
        tab=False,
    ):
        """This method also removes concatenated spaces."""
        sep = [" "] + new_line * ["\n"] + tab * ["\t"]
        sep = "|".join(sep)
        split_document = re.split(sep, document)
        split_document = ModifyingDocuments.remove_empty_el_from_list(split_document)
        return split_document

    @staticmethod
    def strip(document, strip_characters):
        """Way faster than document.strip(strip_characters)
        since strip_characters is now a set instead of a str,
        and it contains a lot of elements (all the emojis)."""
        if not document:
            return document
        beg_ind = 0
        end_ind = len(document)
        for i in range(len(document)):
            if document[i] in strip_characters:
                beg_ind += 1
            else:
                break
        for i in range(1, len(document) + 1):
            if document[-i] in strip_characters:
                end_ind -= 1
            else:
                break
        document_stripped = document[beg_ind:end_ind]
        return document_stripped

    @staticmethod
    def get_words_from_document(
        document, sentencepiece_model_tok, lower_case, strip_characters
    ):
        """Get words from a document. Non reversible since the document
        is split on multiple characters, words are stripped of
        special characters and characters are converted to lower case.
        Useful to compute ratios, like the stopwords ratio."""
        if sentencepiece_model_tok:
            document_normalized = ModifyingDocuments.normalization(
                document=document,
                remove_non_printing_characters=True,
                strip=True,
                lower_case=True,
                uniform_whitespace=True,
                replace_digits_with_zeros=True,
                replace_unicode_punctuation=True,
            )
            words = ModifyingDocuments.tokenization(
                document_normalized, sentencepiece_model_tok, join_on_whitespace=False
            )
        else:
            words = ModifyingDocuments.split_on_whitespace(
                document, new_line=True, tab=True
            )
        if lower_case:
            words = [word.lower() for word in words]
        if strip_characters:
            words = [
                ModifyingDocuments.strip(word, strip_characters) for word in words
            ]
            words = ModifyingDocuments.remove_empty_el_from_list(words)
        return words

    @staticmethod
    def words_augmentation(words, group_size, join_char):
        """Augment words, especially for Chinese (without a space between words)
        and Vietnamese (with a space between syllables)."""
        augmentation = [
            join_char.join(words[i : i + group_size])
            for i in range(len(words) - group_size + 1)
        ]
        return augmentation

    @staticmethod
    def split_on_newline_tab_whitespace(document):
        """First split on "\n", then on "\t", then on " "."""
        sentences = document.split("\n")
        sentences = [sentence.split("\t") for sentence in sentences]
        sentences = [
            [
                ModifyingDocuments.split_on_whitespace(subsentence)
                for subsentence in sentence
            ]
            for sentence in sentences
        ]
        return sentences

    @staticmethod
    def merge_on_whitespace_tab_newline(sentences):
        """Invert the method split_on_newline_tab_whitespace.
        Removes concatenated separators."""
        sentences = [
            [" ".join(subsentence) for subsentence in sentence if subsentence]
            for sentence in sentences
        ]
        sentences = ["\t".join(sentence) for sentence in sentences if sentence]
        if not sentences:
            return ""
        document = "\n".join(sentences)
        return document

    @staticmethod
    def should_keep_word_with_incorrect_substrings(
        word, strip_characters, incorrect_word_substrings
    ):
        word = ModifyingDocuments.strip(word, strip_characters)
        should_keep = all(
            [(i_substr not in word) for i_substr in incorrect_word_substrings]
        )
        return should_keep

    @staticmethod
    def remove_words_with_incorrect_substrings(
        document,
        strip_characters,
        incorrect_word_substrings,
    ):
        sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document)
        sentences = [
            [
                [
                    word
                    for word in subsentence
                    if ModifyingDocuments.should_keep_word_with_incorrect_substrings(
                        word, strip_characters, incorrect_word_substrings
                    )
                ]
                for subsentence in sentence
            ]
            for sentence in sentences
        ]
        document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences)
        return document

    @staticmethod
    def should_keep_long_word(word, strip_characters, length_word_max_cutoff):
        """If the word is too long but it contains only one
        special character, it might be a concatenation of one word,
        a punctuation, and another word, with no space between them.
        In this case, we give the word a pass."""
        if len(word) <= length_word_max_cutoff:
            return True
        word = ModifyingDocuments.strip(word, strip_characters)
        if not word:  # The word consisted only of strip characters
            return False
        if len(word) <= length_word_max_cutoff:
            return True
        return False

    @staticmethod
    def remove_long_words(
        document,
        strip_characters,
        length_word_max_cutoff,
    ):
        sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document)
        sentences = [
            [
                [
                    word
                    for word in subsentence
                    if ModifyingDocuments.should_keep_long_word(
                        word,
                        strip_characters,
                        length_word_max_cutoff,
                    )
                ]
                for subsentence in sentence
            ]
            for sentence in sentences
        ]
        document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences)
        return document

    @staticmethod
    def modifying_documents(
        document,
        cond_uniform_whitespace,
        cond_replace_unicode_punctuation,
        cond_remove_words_with_incorrect_substrings,
        strip_characters,
        incorrect_word_substrings,
        cond_remove_long_words,
        length_word_max_cutoff,
    ):
        document = ModifyingDocuments.normalization(
            document=document,
            remove_non_printing_characters=False,
            strip=True,
            lower_case=False,
            uniform_whitespace=cond_uniform_whitespace,
            replace_digits_with_zeros=False,
            replace_unicode_punctuation=cond_replace_unicode_punctuation,
        )
        if cond_remove_words_with_incorrect_substrings:
            document = ModifyingDocuments.remove_words_with_incorrect_substrings(
                document,
                strip_characters,
                incorrect_word_substrings,
            )
        if cond_remove_long_words:
            document = ModifyingDocuments.remove_long_words(
                document,
                strip_characters,
                length_word_max_cutoff,
            )
        return document


class FunctionDatasetModifyingDocuments:
    def __init__(self, lang_dataset_id):
        self.lang_dataset_id = lang_dataset_id
        self.param = LoadParameters.load_parameters(lang_dataset_id)

    def __call__(self, example):
        example["text"] = ModifyingDocuments.modifying_documents(
            document=example["text"],
            cond_uniform_whitespace=self.param["cond_uniform_whitespace"],
            cond_replace_unicode_punctuation=self.param[
                "cond_replace_unicode_punctuation"
            ],
            cond_remove_words_with_incorrect_substrings=self.param[
                "cond_remove_words_with_incorrect_substrings"
            ],
            strip_characters=self.param["strip_characters"],
            incorrect_word_substrings=self.param["incorrect_word_substrings"],
            cond_remove_long_words=self.param["cond_remove_long_words"],
            length_word_max_cutoff=self.param["length_word_max_cutoff"],
        )
        return example

    def __reduce__(self):
        return (self.__class__, (self.lang_dataset_id,))


class Filtering:
    @staticmethod
    def check_number_words(
        document,
        sentencepiece_model_tok,
        strip_characters,
        number_words_min_cutoff,
        number_words_max_cutoff,
    ):
        words = ModifyingDocuments.get_words_from_document(
            document,
            sentencepiece_model_tok,
            lower_case=False,
            strip_characters=strip_characters,
        )
        cond = (len(words) >= number_words_min_cutoff) and (
            len(words) <= number_words_max_cutoff
        )
        return cond

    @staticmethod
    def compute_character_repetition_ratio(document, character_repetition_length):
        def get_freq_character_ngrams(document, n):
            character_ngrams = [
                document[i : i + n] for i in range(len(document) - n + 1)
            ]
            freq_character_ngrams = {}
            for character_ngram in character_ngrams:
                freq_character_ngrams[character_ngram] = (
                    freq_character_ngrams.get(character_ngram, 0) + 1
                )
            return freq_character_ngrams

        freq_character_ngrams = get_freq_character_ngrams(
            document, character_repetition_length
        )
        if len(freq_character_ngrams) == 0:
            return 0
        freq_character_ngrams = list(freq_character_ngrams.values())
        freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
        # Number of distinct character n-grams that occur only once
        val_one = len([el for el in freq_character_ngrams if el == 1])
        # Keep at most sqrt(#distinct n-grams) of the most frequent n-grams,
        # and never more than the number of n-grams that actually repeat
        num_rep_character_ngrams = min(
            int(np.sqrt(len(freq_character_ngrams))),
            len(freq_character_ngrams) - val_one,
        )
        character_repetition_ratio = sum(
            freq_character_ngrams[:num_rep_character_ngrams]
        ) / sum(freq_character_ngrams)
        return character_repetition_ratio

    @staticmethod
    def check_character_repetition_removal(
        document,
        character_repetition_length,
        character_repetition_max_cutoff,
    ):
        character_repetition_ratio = Filtering.compute_character_repetition_ratio(
            document, character_repetition_length
        )
        cond = character_repetition_ratio <= character_repetition_max_cutoff
        return cond

    @staticmethod
    def compute_word_repetition_ratio(
        document, sentencepiece_model_tok, strip_characters, word_repetition_length
    ):
        def get_freq_word_ngrams(
            document, sentencepiece_model_tok, strip_characters, n
        ):
            words = ModifyingDocuments.get_words_from_document(
                document,
                sentencepiece_model_tok,
                lower_case=True,
                strip_characters=strip_characters,
            )
            word_ngrams = [
                " ".join(words[i : i + n]) for i in range(len(words) - n + 1)
            ]
            freq_word_ngrams = {}
            for word_ngram in word_ngrams:
                freq_word_ngrams[word_ngram] = (
                    freq_word_ngrams.get(word_ngram, 0) + 1
                )
            return freq_word_ngrams

        freq_word_ngrams = get_freq_word_ngrams(
            document, sentencepiece_model_tok, strip_characters, word_repetition_length
        )
        if len(freq_word_ngrams) == 0:
            return 0
        freq_word_ngrams = list(freq_word_ngrams.values())
        word_repetition_ratio = sum(
            freq for freq in freq_word_ngrams if freq > 1
        ) / sum(freq_word_ngrams)
        return word_repetition_ratio

    @staticmethod
    def check_word_repetition_removal(
        document,
        sentencepiece_model_tok,
        strip_characters,
        word_repetition_length,
        word_repetition_max_cutoff,
    ):
        word_repetition_ratio = Filtering.compute_word_repetition_ratio(
            document, sentencepiece_model_tok, strip_characters, word_repetition_length
        )
        cond = word_repetition_ratio <= word_repetition_max_cutoff
        return cond

    @staticmethod
    def compute_special_characters_ratio(document, special_characters):
        if len(document) == 0:
            return 0
        special_characters_ratio = len(
            [char for char in document if char in special_characters]
        ) / len(document)
        return special_characters_ratio

    @staticmethod
    def check_special_characters(
        document,
        special_characters,
        special_characters_max_cutoff,
    ):
        special_characters_ratio = Filtering.compute_special_characters_ratio(
            document, special_characters
        )
        cond = special_characters_ratio <= special_characters_max_cutoff
        return cond

    @staticmethod
    def compute_stopwords_ratio(
        document,
        sentencepiece_model_tok,
        strip_characters,
        cond_words_augmentation,
        words_augmentation_group_sizes,
        words_augmentation_join_char,
        stopwords,
    ):
        words = ModifyingDocuments.get_words_from_document(
            document,
            sentencepiece_model_tok,
            lower_case=True,
            strip_characters=strip_characters,
        )
        if not words:
            return 0
        augmentation = []
        if cond_words_augmentation:
            augmentation = [
                ModifyingDocuments.words_augmentation(
                    words, group_size, words_augmentation_join_char
                )
                for group_size in words_augmentation_group_sizes
            ]
            augmentation = [word for augm in augmentation for word in augm]
        stopwords_ratio = len(
            [word for word in words + augmentation if word in stopwords]
        ) / len(words)
        if stopwords_ratio > 1.0:
            stopwords_ratio = 1.0
        return stopwords_ratio

    @staticmethod
    def check_stopwords(
        document,
        sentencepiece_model_tok,
        strip_characters,
        cond_words_augmentation,
        words_augmentation_group_sizes,
        words_augmentation_join_char,
        stopwords,
        stopwords_min_cutoff,
    ):
        cond = True
        if stopwords:
            stopwords_ratio = Filtering.compute_stopwords_ratio(
                document,
                sentencepiece_model_tok,
                strip_characters,
                cond_words_augmentation,
                words_augmentation_group_sizes,
                words_augmentation_join_char,
                stopwords,
            )
            cond = stopwords_ratio >= stopwords_min_cutoff
        return cond

    @staticmethod
    def compute_flagged_words_ratio(
        document,
        sentencepiece_model_tok,
        strip_characters,
        cond_words_augmentation,
        words_augmentation_group_sizes,
        words_augmentation_join_char,
        flagged_words,
    ):
        words = ModifyingDocuments.get_words_from_document(
            document,
            sentencepiece_model_tok,
            lower_case=True,
            strip_characters=strip_characters,
        )
        if not words:
            return 0
        augmentation = []
        if cond_words_augmentation:
            augmentation = [
                ModifyingDocuments.words_augmentation(
                    words, group_size, words_augmentation_join_char
                )
                for group_size in words_augmentation_group_sizes
            ]
            augmentation = [word for augm in augmentation for word in augm]
        flagged_words_ratio = len(
            [word for word in words + augmentation if word in flagged_words]
        ) / len(words)
        if flagged_words_ratio > 1.0:
            flagged_words_ratio = 1.0
        return flagged_words_ratio

    @staticmethod
    def check_flagged_words(
        document,
        sentencepiece_model_tok,
        strip_characters,
        cond_words_augmentation,
        words_augmentation_group_sizes,
        words_augmentation_join_char,
        flagged_words,
        flagged_words_max_cutoff,
    ):
        cond = True
        if flagged_words:
            flagged_words_ratio = Filtering.compute_flagged_words_ratio(
                document,
                sentencepiece_model_tok,
                strip_characters,
                cond_words_augmentation,
                words_augmentation_group_sizes,
                words_augmentation_join_char,
                flagged_words,
            )
            cond = flagged_words_ratio <= flagged_words_max_cutoff
        return cond

    @staticmethod
    def compute_lang_id_pred_score(document, model_lang_id):
        document = document.lower().replace("\n", " ")
        pred = model_lang_id.predict(document)
        lang_pred_fasttext_id = pred[0][0].replace("__label__", "")
        score_pred = pred[1][0]
        lang_pred_dataset_id = langs_id.loc[
            langs_id["fasttext_id"] == lang_pred_fasttext_id, "dataset_id"
        ]
        if len(lang_pred_dataset_id) > 0:
            lang_pred_dataset_id = lang_pred_dataset_id.iloc[0]
        else:
            lang_pred_dataset_id = "unknown"
        return lang_pred_dataset_id, score_pred

    @staticmethod
    def check_lang_id(
        document,
        lang_dataset_id,
        model_lang_id,
        lang_id_min_cutoff,
    ):
        cond = True
        if model_lang_id:
            lang_pred_dataset_id, score_pred = Filtering.compute_lang_id_pred_score(
                document, model_lang_id
            )
            cond = (lang_pred_dataset_id == lang_dataset_id) and (
                score_pred >= lang_id_min_cutoff
            )
        return cond

    @staticmethod
    def compute_perplexity_score(document, sentencepiece_model, kenlm_model):
        document = ModifyingDocuments.normalization(
            document=document,
            remove_non_printing_characters=True,
            strip=True,
            lower_case=False,
            uniform_whitespace=True,
            replace_digits_with_zeros=True,
            replace_unicode_punctuation=True,
        )
        document = ModifyingDocuments.tokenization(
            document, sentencepiece_model, join_on_whitespace=True
        )
        doc_log_score, doc_length = 0, 0
        for line in document.split("\n"):
            log_score = kenlm_model.score(line)
            length = len(line.split()) + 1
            doc_log_score += log_score
            doc_length += length
        pp_score = 10.0 ** (-doc_log_score / doc_length)
        pp_score = round(pp_score, 1)
        return pp_score

    @staticmethod
    def check_perplexity(
        document,
        sentencepiece_model,
        kenlm_model,
        perplexity_max_cutoff,
    ):
        cond = True
        if kenlm_model:
            score = Filtering.compute_perplexity_score(
                document, sentencepiece_model, kenlm_model
            )
            cond = score <= perplexity_max_cutoff
        return cond

    @staticmethod
    def filtering(
        document,
        cond_check_number_words,
        sentencepiece_model_tok,
        strip_characters,
        number_words_min_cutoff,
        number_words_max_cutoff,
        cond_check_character_repetition_removal,
        character_repetition_length,
        character_repetition_max_cutoff,
        cond_check_word_repetition_removal,
        word_repetition_length,
        word_repetition_max_cutoff,
        cond_check_special_characters,
        special_characters,
        special_characters_max_cutoff,
        cond_words_augmentation,
        words_augmentation_group_sizes,
        words_augmentation_join_char,
        cond_check_stopwords,
        stopwords,
        stopwords_min_cutoff,
        cond_check_flagged_words,
        flagged_words,
        flagged_words_max_cutoff,
        cond_check_lang_id,
        lang_dataset_id,
        model_lang_id,
        lang_id_min_cutoff,
        cond_check_perplexity,
        sentencepiece_model,
        kenlm_model,
        perplexity_max_cutoff,
    ):
        if cond_check_number_words:
            if not Filtering.check_number_words(
                document,
                sentencepiece_model_tok,
                strip_characters,
                number_words_min_cutoff,
                number_words_max_cutoff,
            ):
                return False
        if cond_check_character_repetition_removal:
            if not Filtering.check_character_repetition_removal(
                document,
                character_repetition_length,
                character_repetition_max_cutoff,
            ):
                return False
        if cond_check_word_repetition_removal:
            if not Filtering.check_word_repetition_removal(
                document,
                sentencepiece_model_tok,
                strip_characters,
                word_repetition_length,
                word_repetition_max_cutoff,
            ):
                return False
        if cond_check_special_characters:
            if not Filtering.check_special_characters(
                document,
                special_characters,
                special_characters_max_cutoff,
            ):
                return False
        if cond_check_stopwords:
            if not Filtering.check_stopwords(
                document,
                sentencepiece_model_tok,
                strip_characters,
                cond_words_augmentation,
                words_augmentation_group_sizes,
                words_augmentation_join_char,
                stopwords,
                stopwords_min_cutoff,
            ):
                return False
        if cond_check_flagged_words:
            if not Filtering.check_flagged_words(
                document,
                sentencepiece_model_tok,
                strip_characters,
                cond_words_augmentation,
                words_augmentation_group_sizes,
                words_augmentation_join_char,
                flagged_words,
                flagged_words_max_cutoff,
            ):
                return False
        if cond_check_lang_id:
            if not Filtering.check_lang_id(
                document,
                lang_dataset_id,
                model_lang_id,
                lang_id_min_cutoff,
            ):
                return False
        if cond_check_perplexity:
            if not Filtering.check_perplexity(
                document,
                sentencepiece_model,
                kenlm_model,
                perplexity_max_cutoff,
            ):
                return False
        return True


class FunctionDatasetFiltering:
    def __init__(
        self,
        lang_dataset_id,
        path_fasttext_model,
        path_sentencepiece_model,
        path_kenlm_model,
    ):
        self.lang_dataset_id = lang_dataset_id
        self.path_fasttext_model = path_fasttext_model
        self.path_sentencepiece_model = path_sentencepiece_model
        self.path_kenlm_model = path_kenlm_model

        self.param = LoadParameters.load_parameters(lang_dataset_id)
        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
        self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
        self.model_lang_id = LoadParameters.load_model_lang_id(
            lang_dataset_id, path_fasttext_model
        )
        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
            lang_dataset_id, path_sentencepiece_model
        )
        self.sentencepiece_model_tok = (
            self.sentencepiece_model if self.param["tokenization"] else None
        )
        self.kenlm_model = LoadParameters.load_kenlm_model(
            lang_dataset_id, path_kenlm_model
        )

    def __call__(self, example):
        keep_example = Filtering.filtering(
            document=example["text"],
            cond_check_number_words=self.param["cond_check_number_words"],
            sentencepiece_model_tok=self.sentencepiece_model_tok,
            strip_characters=self.param["strip_characters"],
            number_words_min_cutoff=self.param["number_words_min_cutoff"],
            number_words_max_cutoff=self.param["number_words_max_cutoff"],
            cond_check_character_repetition_removal=self.param[
                "cond_check_character_repetition_removal"
            ],
            character_repetition_length=self.param["character_repetition_length"],
            character_repetition_max_cutoff=self.param[
                "character_repetition_max_cutoff"
            ],
            cond_check_word_repetition_removal=self.param[
                "cond_check_word_repetition_removal"
            ],
            word_repetition_length=self.param["word_repetition_length"],
            word_repetition_max_cutoff=self.param["word_repetition_max_cutoff"],
            cond_check_special_characters=self.param["cond_check_special_characters"],
            special_characters=self.param["special_characters"],
            special_characters_max_cutoff=self.param["special_characters_max_cutoff"],
            cond_words_augmentation=self.param["cond_words_augmentation"],
            words_augmentation_group_sizes=self.param[
                "words_augmentation_group_sizes"
            ],
            words_augmentation_join_char=self.param["words_augmentation_join_char"],
            cond_check_stopwords=self.param["cond_check_stopwords"],
            stopwords=self.stopwords,
            stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
            cond_check_flagged_words=self.param["cond_check_flagged_words"],
            flagged_words=self.flagged_words,
            flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"],
            cond_check_lang_id=self.param["cond_check_lang_id"],
            lang_dataset_id=self.lang_dataset_id,
            model_lang_id=self.model_lang_id,
            lang_id_min_cutoff=self.param["lang_id_min_cutoff"],
            cond_check_perplexity=self.param["cond_check_perplexity"],
            sentencepiece_model=self.sentencepiece_model,
            kenlm_model=self.kenlm_model,
            perplexity_max_cutoff=self.param["perplexity_max_cutoff"],
        )
        return keep_example

    def __reduce__(self):
        return (
            self.__class__,
            (
                self.lang_dataset_id,
                self.path_fasttext_model,
                self.path_sentencepiece_model,
                self.path_kenlm_model,
            ),
        )


class DatasetFiltering:
    def __init__(
        self,
        dataset,
        lang_dataset_id,
        path_fasttext_model,
        path_sentencepiece_model,
        path_kenlm_model,
        num_proc,
        path_dir_save_dataset,
    ):
        self.ds = dataset
        self.lang_dataset_id = lang_dataset_id
        self.path_fasttext_model = path_fasttext_model
        self.path_sentencepiece_model = path_sentencepiece_model
        self.path_kenlm_model = path_kenlm_model
        self.num_proc = num_proc
        self.path_dir_save_dataset = path_dir_save_dataset

    def modifying_documents(self):
        func_dataset_modifying_documents = FunctionDatasetModifyingDocuments(
            self.lang_dataset_id
        )
        self.ds = self.ds.map(func_dataset_modifying_documents, num_proc=self.num_proc)

    def filtering(self):
        func_dataset_filtering = FunctionDatasetFiltering(
            self.lang_dataset_id,
            self.path_fasttext_model,
            self.path_sentencepiece_model,
            self.path_kenlm_model,
        )
        self.ds = self.ds.filter(func_dataset_filtering, num_proc=self.num_proc)

    def save_dataset(self):
        pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
        path_dir_save_dataset = pathlib.PurePath(
            self.path_dir_save_dataset, self.lang_dataset_id
        )
        pathlib.Path(path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
        self.ds.save_to_disk(path_dir_save_dataset)
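

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original pipeline: it shows how the
    # cleaning, filtering, and saving steps of DatasetFiltering chain together
    # on a Hugging Face `datasets` dataset that has a "text" column.
    # The JSONL file, language id "en", model paths, and output directory below
    # are placeholders to adapt to your own setup.
    from datasets import load_dataset

    dataset = load_dataset("json", data_files="documents.jsonl", split="train")

    dataset_filtering = DatasetFiltering(
        dataset=dataset,
        lang_dataset_id="en",
        path_fasttext_model="lid.176.bin",
        path_sentencepiece_model="en.sp.model",
        path_kenlm_model="en.arpa.bin",
        num_proc=4,
        path_dir_save_dataset="filtered_datasets",
    )
    # Clean the documents in place, then drop the ones that fail the checks,
    # and write the result to filtered_datasets/en.
    dataset_filtering.modifying_documents()
    dataset_filtering.filtering()
    dataset_filtering.save_dataset()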