Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Files Community

HugoLaurencon committed on Jan 18, 2022

Commit

f217a73

1 Parent(s): bfbcd60

rename badwords to flagged words + new flagged words list of 68 words

Browse files

Files changed (7) hide show

app.py +17 -16
en_examples_with_stats.json +2 -2
explanation_filtering_pipeline.pdf +0 -0
filtering.py +33 -36
badwords.py → flagged_words.py +29 -444
languages_id.py +25 -25
parameters_filtering.py +52 -52

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import os
 import base64
 import json
 import pandas as pd
 pd.options.mode.chained_assignment = None
 import numpy as np
@@ -40,7 +41,7 @@ class Visualization:
         self.lang_dataset_id = lang_dataset_id
         self.param = LoadParameters.load_parameters(lang_dataset_id)
         self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
-        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
         self.model_lang_id = LoadParameters.load_model_lang_id(
             lang_dataset_id, path_fasttext_model
         )
@@ -222,16 +223,16 @@ class Visualization:
                 print_discared_by_cond(cond)
                 conds["stopwords_ratio"] = [cond]
-            if "badwords_ratio" in columns:
-                cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
-                cutoff_badwords_ratio = st.sidebar.slider(
                     cutoff_def, 0.0, 1.0, 1.0, step=0.01
                 )
-                new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
                 print_discared_by_cond(cond)
-                conds["badwords_ratio"] = [cond]
             if "lang_id_score" in columns:
                 cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
@@ -316,11 +317,11 @@ class Visualization:
                     "Discarded documents for the filter on the stop words ratio",
                 )
-            if "badwords_ratio" in columns:
-                cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
                 display_dataset(
                     cond_filter,
-                    "Discarded documents for the filter on the bad words ratio",
                 )
             if "lang_id_score" in columns:
@@ -504,19 +505,19 @@ class Visualization:
                     if is_doc_discarded(key, stopwords_ratio):
                         is_discarded = True
-                elif key[0] == "badwords_ratio":
-                    badwords_ratio = Filtering.compute_badwords_ratio(
                         personal_doc,
                         self.sentencepiece_model_tok,
                         self.param["strip_characters"],
                         self.param["cond_words_augmentation"],
                         self.param["words_augmentation_group_sizes"],
                         self.param["words_augmentation_join_char"],
-                        self.badwords,
                     )
-                    badwords_ratio = round(badwords_ratio, 3)
-                    st.markdown(f"Flagged words ratio: {badwords_ratio}")
-                    if is_doc_discarded(key, badwords_ratio):
                         is_discarded = True
                 elif key[0] == "lang_id_score":
@@ -530,7 +531,7 @@ class Visualization:
                     st.markdown(
                         f"Language identification confidence score: {lang_id_score}"
                     )
-                    if is_doc_discarded(key, badwords_ratio) or (
                         self.lang_dataset_id != lang_pred_dataset_id
                     ):
                         is_discarded = True

 import base64
 import json
 import pandas as pd
 pd.options.mode.chained_assignment = None
 import numpy as np
         self.lang_dataset_id = lang_dataset_id
         self.param = LoadParameters.load_parameters(lang_dataset_id)
         self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
         self.model_lang_id = LoadParameters.load_model_lang_id(
             lang_dataset_id, path_fasttext_model
         )
                 print_discared_by_cond(cond)
                 conds["stopwords_ratio"] = [cond]
+            if "flagged_words_ratio" in columns:
+                cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
+                cutoff_flagged_words_ratio = st.sidebar.slider(
                     cutoff_def, 0.0, 1.0, 1.0, step=0.01
                 )
+                new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
                 print_discared_by_cond(cond)
+                conds["flagged_words_ratio"] = [cond]
             if "lang_id_score" in columns:
                 cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
                     "Discarded documents for the filter on the stop words ratio",
                 )
+            if "flagged_words_ratio" in columns:
+                cond_filter = np.invert(np.all(conds["flagged_words_ratio"], axis=0))
                 display_dataset(
                     cond_filter,
+                    "Discarded documents for the filter on the flagged words ratio",
                 )
             if "lang_id_score" in columns:
                     if is_doc_discarded(key, stopwords_ratio):
                         is_discarded = True
+                elif key[0] == "flagged_words_ratio":
+                    flagged_words_ratio = Filtering.compute_flagged_words_ratio(
                         personal_doc,
                         self.sentencepiece_model_tok,
                         self.param["strip_characters"],
                         self.param["cond_words_augmentation"],
                         self.param["words_augmentation_group_sizes"],
                         self.param["words_augmentation_join_char"],
+                        self.flagged_words,
                     )
+                    flagged_words_ratio = round(flagged_words_ratio, 3)
+                    st.markdown(f"Flagged words ratio: {flagged_words_ratio}")
+                    if is_doc_discarded(key, flagged_words_ratio):
                         is_discarded = True
                 elif key[0] == "lang_id_score":
                     st.markdown(
                         f"Language identification confidence score: {lang_id_score}"
                     )
+                    if is_doc_discarded(key, flagged_words_ratio) or (
                         self.lang_dataset_id != lang_pred_dataset_id
                     ):
                         is_discarded = True

en_examples_with_stats.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:00106fc2a9d51bbc78ce1ca2d05f2f402bf927a1f741f6c092b3f17cb9f16801
-size 237353442

 version https://git-lfs.github.com/spec/v1
+oid sha256:ffbb8afeba42822e4b10341112999321e0e14a19a5eeebc342dc68a9f65d3c7f
+size 237426014

explanation_filtering_pipeline.pdf CHANGED Viewed

Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ

filtering.py CHANGED Viewed

@@ -13,7 +13,7 @@ from languages_id import langs_id
 from parameters_filtering import parameters_filtering
 from normalization import normalization
 from stopwords import stopwords
-from badwords import badwords
 class LoadParameters:
@@ -37,15 +37,15 @@ class LoadParameters:
         return stopwords_lang
     @staticmethod
-    def load_badwords(lang_dataset_id):
-        badwords_lang_id = langs_id.loc[
-            langs_id["dataset_id"] == lang_dataset_id, "badwords_id"
         ].iloc[0]
-        if badwords_lang_id:
-            badwords_lang = set(badwords[badwords_lang_id])
         else:
-            badwords_lang = None
-        return badwords_lang
     @staticmethod
     def load_model_lang_id(lang_dataset_id, path_fasttext_model):
@@ -533,14 +533,14 @@ class Filtering:
         return cond
     @staticmethod
-    def compute_badwords_ratio(
         document,
         sentencepiece_model_tok,
         strip_characters,
         cond_words_augmentation,
         words_augmentation_group_sizes,
         words_augmentation_join_char,
-        badwords,
     ):
         words = ModifyingDocuments.get_words_from_document(
             document,
@@ -559,39 +559,36 @@ class Filtering:
                 for group_size in words_augmentation_group_sizes
             ]
             augmentation = [word for augm in augmentation for word in augm]
-        badwords_ratio = len(
-            [word for word in words + augmentation if word in badwords]
         ) / len(words)
-        if badwords_ratio > 1.0:
-            badwords_ratio = 1.0
-        for word in augmentation:
-            if word in badwords:
-                print(word)
-        return badwords_ratio
     @staticmethod
-    def check_badwords(
         document,
         sentencepiece_model_tok,
         strip_characters,
         cond_words_augmentation,
         words_augmentation_group_sizes,
         words_augmentation_join_char,
-        badwords,
-        badwords_max_cutoff,
     ):
         cond = True
-        if badwords:
-            badwords_ratio = Filtering.compute_badwords_ratio(
                 document,
                 sentencepiece_model_tok,
                 strip_characters,
                 cond_words_augmentation,
                 words_augmentation_group_sizes,
                 words_augmentation_join_char,
-                badwords,
             )
-            cond = badwords_ratio <= badwords_max_cutoff
         return cond
     @staticmethod
@@ -685,9 +682,9 @@ class Filtering:
         cond_check_stopwords,
         stopwords,
         stopwords_min_cutoff,
-        cond_check_badwords,
-        badwords,
-        badwords_max_cutoff,
         cond_check_lang_id,
         lang_dataset_id,
         model_lang_id,
@@ -732,16 +729,16 @@ class Filtering:
                 stopwords_min_cutoff,
             ):
                 return False
-        if cond_check_badwords:
-            if not Filtering.check_badwords(
                 document,
                 sentencepiece_model_tok,
                 strip_characters,
                 cond_words_augmentation,
                 words_augmentation_group_sizes,
                 words_augmentation_join_char,
-                badwords,
-                badwords_max_cutoff,
             ):
                 return False
         if cond_check_lang_id:
@@ -778,7 +775,7 @@ class FunctionDatasetFiltering:
         self.param = LoadParameters.load_parameters(lang_dataset_id)
         self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
-        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
         self.model_lang_id = LoadParameters.load_model_lang_id(
             lang_dataset_id, path_fasttext_model
         )
@@ -812,9 +809,9 @@ class FunctionDatasetFiltering:
             cond_check_stopwords=self.param["cond_check_stopwords"],
             stopwords=self.stopwords,
             stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
-            cond_check_badwords=self.param["cond_check_badwords"],
-            badwords=self.badwords,
-            badwords_max_cutoff=self.param["badwords_max_cutoff"],
             cond_check_lang_id=self.param["cond_check_lang_id"],
             lang_dataset_id=self.lang_dataset_id,
             model_lang_id=self.model_lang_id,

 from parameters_filtering import parameters_filtering
 from normalization import normalization
 from stopwords import stopwords
+from flagged_words import flagged_words
 class LoadParameters:
         return stopwords_lang
     @staticmethod
+    def load_flagged_words(lang_dataset_id):
+        flagged_words_lang_id = langs_id.loc[
+            langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id"
         ].iloc[0]
+        if flagged_words_lang_id:
+            flagged_words_lang = set(flagged_words[flagged_words_lang_id])
         else:
+            flagged_words_lang = None
+        return flagged_words_lang
     @staticmethod
     def load_model_lang_id(lang_dataset_id, path_fasttext_model):
         return cond
     @staticmethod
+    def compute_flagged_words_ratio(
         document,
         sentencepiece_model_tok,
         strip_characters,
         cond_words_augmentation,
         words_augmentation_group_sizes,
         words_augmentation_join_char,
+        flagged_words,
     ):
         words = ModifyingDocuments.get_words_from_document(
             document,
                 for group_size in words_augmentation_group_sizes
             ]
             augmentation = [word for augm in augmentation for word in augm]
+        flagged_words_ratio = len(
+            [word for word in words + augmentation if word in flagged_words]
         ) / len(words)
+        if flagged_words_ratio > 1.0:
+            flagged_words_ratio = 1.0
+        return flagged_words_ratio
     @staticmethod
+    def check_flagged_words(
         document,
         sentencepiece_model_tok,
         strip_characters,
         cond_words_augmentation,
         words_augmentation_group_sizes,
         words_augmentation_join_char,
+        flagged_words,
+        flagged_words_max_cutoff,
     ):
         cond = True
+        if flagged_words:
+            flagged_words_ratio = Filtering.compute_flagged_words_ratio(
                 document,
                 sentencepiece_model_tok,
                 strip_characters,
                 cond_words_augmentation,
                 words_augmentation_group_sizes,
                 words_augmentation_join_char,
+                flagged_words,
             )
+            cond = flagged_words_ratio <= flagged_words_max_cutoff
         return cond
     @staticmethod
         cond_check_stopwords,
         stopwords,
         stopwords_min_cutoff,
+        cond_check_flagged_words,
+        flagged_words,
+        flagged_words_max_cutoff,
         cond_check_lang_id,
         lang_dataset_id,
         model_lang_id,
                 stopwords_min_cutoff,
             ):
                 return False
+        if cond_check_flagged_words:
+            if not Filtering.check_flagged_words(
                 document,
                 sentencepiece_model_tok,
                 strip_characters,
                 cond_words_augmentation,
                 words_augmentation_group_sizes,
                 words_augmentation_join_char,
+                flagged_words,
+                flagged_words_max_cutoff,
             ):
                 return False
         if cond_check_lang_id:
         self.param = LoadParameters.load_parameters(lang_dataset_id)
         self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
         self.model_lang_id = LoadParameters.load_model_lang_id(
             lang_dataset_id, path_fasttext_model
         )
             cond_check_stopwords=self.param["cond_check_stopwords"],
             stopwords=self.stopwords,
             stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
+            cond_check_flagged_words=self.param["cond_check_flagged_words"],
+            flagged_words=self.flagged_words,
+            flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"],
             cond_check_lang_id=self.param["cond_check_lang_id"],
             lang_dataset_id=self.lang_dataset_id,
             model_lang_id=self.model_lang_id,

badwords.py → flagged_words.py RENAMED Viewed

@@ -6,89 +6,21 @@
 # https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
-english_badwords = [
-    "abuse",
     "anal",
-    "anilingus",
-    "anus",
-    "aroused",
-    "arse",
-    "arsehole",
-    "ass",
-    "asses",
-    "assfuck",
-    "asshat",
-    "asshole",
-    "assholes",
-    "autoerotic",
-    "bangbros",
-    "banging",
     "bareback",
-    "bastard",
-    "bastards",
-    "bazongas",
     "bbw",
     "bdsm",
-    "biatch",
-    "bicurious",
-    "bigass",
-    "bigtits",
-    "bimbo",
-    "bimbos",
-    "bitch",
-    "bitches",
-    "bitching",
     "blowjob",
     "blowjobs",
-    "boche",
-    "boner",
-    "boners",
-    "boob",
-    "boobies",
-    "boobs",
-    "booty",
-    "brothel",
-    "buceta",
-    "bugger",
-    "buggered",
-    "buggery",
     "bukkake",
-    "bule",
-    "buttcheeks",
-    "buttfuck",
-    "butthead",
-    "butthole",
-    "buttplug",
-    "cameltoe",
     "camgirl",
     "camwhore",
-    "chink",
-    "chinks",
-    "cialis",
-    "clit",
-    "clitoris",
-    "clits",
-    "clitty",
-    "clusterfuck",
-    "cock",
-    "cock-head",
-    "cockblock",
-    "cockfight",
-    "cockhead",
-    "cocks",
-    "cocksman",
-    "cocksucker",
     "cocksucking",
-    "coital",
-    "coitus",
-    "coochie",
-    "cooly",
-    "coon",
-    "coons",
-    "copulate",
-    "cowgirl",
-    "crabs",
     "creampie",
     "cum",
     "cumming",
     "cums",
@@ -96,399 +28,58 @@ english_badwords = [
     "cumshots",
     "cumslut",
     "cunnilingus",
-    "cunny",
-    "cunt",
-    "cunts",
-    "cybersex",
-    "darkey",
-    "darkie",
-    "darkies",
-    "darky",
     "deepthroat",
     "deepthroating",
-    "dick",
-    "dickhole",
-    "dicks",
     "dildo",
     "dildos",
     "dogging",
-    "doggy-style",
     "doggystyle",
     "dominatrix",
-    "dommes",
-    "dong",
-    "dp",
-    "dupa",
-    "dyke",
-    "dykes",
-    "ecchi",
-    "ejaculate",
-    "ejaculated",
-    "ejaculates",
-    "ejaculating",
-    "ejaculation",
-    "ejaculations",
-    "enema",
-    "erect",
-    "erection",
-    "ero",
     "erotic",
-    "erotism",
-    "escort",
-    "fag",
-    "fagging",
-    "faggot",
-    "fagot",
-    "fagots",
-    "fags",
-    "felch",
-    "fellate",
     "fellatio",
     "femdom",
-    "fetish",
-    "figging",
-    "fingerbang",
     "fingering",
-    "fisted",
-    "fister",
     "fisting",
-    "floozy",
-    "fondle",
-    "footfetish",
     "footjob",
-    "foreskin",
-    "fornicate",
-    "foursome",
-    "fuck",
-    "fuckable",
-    "fuckbook",
-    "fuckboy",
-    "fuckbuddy",
-    "fucked",
-    "fucker",
-    "fuckers",
-    "fuckfest",
-    "fuckhole",
-    "fuckin",
-    "fucking",
-    "fucks",
-    "fuk",
-    "fukin",
-    "fuking",
-    "g-spot",
     "gangbang",
-    "gangbanged",
-    "gangbanger",
-    "gangbangs",
-    "genital",
-    "genitals",
-    "gigolo",
-    "glans",
-    "gonad",
-    "gonads",
-    "gook",
-    "gringo",
-    "gringos",
-    "grope",
-    "gspot",
-    "guido",
     "handjob",
-    "haole",
-    "hapa",
-    "hardcore",
-    "hardon",
-    "harem",
     "hentai",
-    "hindoo",
-    "hoe",
-    "hoes",
-    "honky",
-    "hooker",
-    "hookers",
-    "hooter",
-    "hooters",
-    "hori",
-    "horndog",
     "horney",
     "horniest",
     "horny",
-    "humped",
-    "humper",
-    "humping",
-    "hussy",
-    "hymen",
-    "ikey",
-    "incest",
-    "injun",
-    "intercourse",
-    "interracial",
-    "jack-off",
-    "jackoff",
-    "jailbait",
-    "jerk-off",
-    "jerkoff",
-    "jiggy",
     "jism",
     "jizz",
-    "jizzed",
-    "kaffir",
-    "kafir",
-    "kike",
-    "kikes",
-    "kinkster",
-    "kinky",
-    "kkk",
-    "klan",
-    "kraut",
-    "labia",
-    "lapdance",
-    "libido",
-    "licker",
-    "licking",
-    "limey",
-    "lingerie",
-    "livesex",
-    "lolita",
-    "lovemaking",
-    "lust",
-    "lusting",
-    "masochist",
-    "masterbate",
     "masterbating",
-    "masterbation",
     "masturbate",
     "masturbating",
     "masturbation",
     "milf",
-    "minge",
-    "missionary",
-    "molest",
-    "molestation",
-    "molester",
-    "munging",
-    "muschi",
-    "nads",
-    "naked",
-    "necked",
-    "necro",
-    "negress",
-    "negro",
-    "negroes",
-    "negroid",
-    "negros",
-    "nig",
-    "nigar",
-    "nigga",
-    "niggas",
-    "niggaz",
-    "nigger",
-    "niggers",
-    "nigra",
-    "nipple",
-    "nipples",
-    "nookie",
-    "nooky",
-    "nooner",
-    "nude",
-    "nudie",
-    "nudity",
-    "nymph",
-    "nympho",
-    "nymphomania",
-    "orgasim",
-    "orgasm",
-    "orgasms",
     "orgies",
     "orgy",
-    "orifice",
-    "p0rn",
-    "paedophile",
-    "pantie",
-    "panties",
-    "panty",
-    "pastie",
-    "pecker",
-    "pedo",
-    "pedophile",
-    "pedophilia",
-    "pedophiliac",
-    "peeper",
-    "peepshow",
     "pegging",
-    "penetrate",
-    "penetration",
-    "penile",
-    "penis",
-    "penises",
-    "penus",
-    "perv",
-    "phallic",
-    "phonesex",
-    "pickaninnies",
-    "pimp",
-    "playboy",
-    "playgirl",
-    "poontang",
     "porn",
     "porno",
-    "pornography",
     "pornos",
-    "pr0n",
-    "premature",
-    "preteen",
-    "pron",
-    "prostitute",
-    "pube",
-    "pubes",
-    "pubic",
-    "pubis",
-    "punani",
-    "pussies",
-    "pussy",
-    "pussys",
-    "pusy",
-    "puta",
-    "puto",
-    "queef",
-    "quickie",
-    "quicky",
-    "quim",
-    "randy",
-    "rape",
-    "raped",
-    "raper",
-    "raping",
-    "rapist",
-    "rectum",
-    "redneck",
-    "rednecks",
-    "redskin",
-    "redskins",
-    "rimjob",
     "rimming",
-    "russki",
-    "s&m",
-    "sadism",
-    "sadist",
-    "sambo",
-    "santorum",
-    "schlong",
-    "scissoring",
-    "semen",
-    "sex",
-    "sexed",
-    "sexi",
-    "sexing",
-    "sexo",
-    "sexpot",
-    "sextoy",
-    "sexual",
-    "sexually",
-    "sexx",
-    "sexxx",
-    "sexxxy",
-    "sexxy",
-    "sexy",
-    "sh!t",
-    "sh1t",
-    "shagging",
-    "shemale",
-    "sissy",
-    "skank",
-    "skanks",
-    "slapper",
-    "slut",
-    "sluts",
-    "slutting",
     "slutty",
-    "smut",
-    "smutty",
-    "sodomise",
-    "sodomite",
-    "sodomize",
-    "sodomy",
-    "spank",
-    "sperm",
-    "spic",
-    "spick",
-    "splooge",
-    "spooge",
-    "squaw",
     "squirting",
-    "steamy",
-    "stiffy",
     "strapon",
-    "suck",
-    "sucked",
-    "sucker",
-    "sucking",
-    "sucks",
-    "swallow",
-    "swallower",
-    "swinger",
-    "teabagging",
-    "testical",
-    "testicle",
-    "testicles",
-    "testis",
     "threesome",
-    "threeway",
-    "titfuck",
-    "titjob",
-    "tits",
-    "tittie",
-    "titties",
-    "titty",
-    "tittyfuck",
-    "tity",
-    "toots",
-    "topless",
-    "trannie",
-    "tranny",
-    "tribadism",
-    "twat",
-    "twats",
-    "undies",
-    "undressing",
-    "upskirt",
-    "vag",
-    "vagina",
-    "vaginal",
-    "viagra",
     "vibrator",
-    "virgin",
-    "vixen",
-    "voyeur",
-    "vulva",
-    "wank",
-    "wanker",
-    "wanking",
-    "wazoo",
-    "wedgie",
-    "wench",
-    "wetback",
-    "whore",
-    "whored",
-    "whorehouse",
-    "whores",
-    "whoring",
-    "wigger",
-    "willie",
-    "willies",
-    "willy",
-    "wog",
-    "wop",
-    "x-rated",
     "xxx",
-    "xxxxxx",
-    "yaoi",
-    "yid",
-    "zoophile",
-    "zoophilia",
 ]
-badwords = {
-    "ar": english_badwords
     + [
         "احتلام",
         "اغتصاب",
@@ -529,9 +120,8 @@ badwords = {
         "مفلقسة",
         "نيك",
     ],
-    "ca": english_badwords
     + [
-        "avortament",
         "anal",
         "anus",
         "cul",
@@ -664,8 +254,8 @@ badwords = {
         "x classificat",
         "xxx",
     ],
-    "en": english_badwords,
-    "es": english_badwords
     + [
         "Asesinato",
         "Bollera",
@@ -846,7 +436,6 @@ badwords = {
         "tirón",
         "tizón",
         "tonto",
-        "transexual",
         "vagina",
         "vete a la mierda",
         "viagra",
@@ -858,7 +447,7 @@ badwords = {
         "xxx",
         "zurullo",
     ],
-    "eu": english_badwords
     + [
         "abortu",
         "anal",
@@ -899,13 +488,12 @@ badwords = {
         "bagina",
         "viagra",
     ],
-    "fr": english_badwords
     + [
         "MALPT",
         "anal",
         "anus",
         "arracher",
-        "avortement",
         "baise",
         "baiser",
         "baiseur de chien",
@@ -1078,7 +666,6 @@ badwords = {
         "testicule",
         "teuch",
         "titt",
-        "transexuelle",
         "tremper",
         "tringler",
         "trique",
@@ -1102,7 +689,7 @@ badwords = {
         "éjaculé",
         "étron",
     ],
-    "hi": english_badwords
     + [
         "aand",
         "aandu",
@@ -1163,7 +750,6 @@ badwords = {
         "fagging",
         "fagots",
         "felching",
-        "fuckers",
         "fuckings",
         "fucks",
         "gaand",
@@ -1361,7 +947,7 @@ badwords = {
         "होमोसेक्सुअल",
         "होर",
     ],
-    "id": english_badwords
     + [
         "abortus",
         "anal",
@@ -1479,7 +1065,7 @@ badwords = {
         "x diberi peringkat",
         "xxx",
     ],
-    "kn": english_badwords
     + [
         "ಗರ್ಭಪಾತ",
         "ಗುದ",
@@ -1622,7 +1208,7 @@ badwords = {
         "x ರೇಟೆಡ್",
         "xxx",
     ],
-    "ml": english_badwords
     + [
         "ഗർഭഛിദ്രം",
         "വിശപ്പ്",
@@ -1759,7 +1345,7 @@ badwords = {
         "x റേറ്റുചെയ്തു",
         "xxx",
     ],
-    "mr": english_badwords
     + [
         "गर्भपात",
         "गुदा",
@@ -1898,7 +1484,7 @@ badwords = {
         "एक्स रेट केले",
         "xxx",
     ],
-    "pt": english_badwords
     + [
         "aborto",
         "amador",
@@ -2000,7 +1586,6 @@ badwords = {
         "gozar",
         "grelho",
         "heroína",
-        "homem gay",
         "homoerótico",
         "homosexual",
         "hore",
@@ -2097,7 +1682,7 @@ badwords = {
         "xxx",
         "ânus",
     ],
-    "te": english_badwords
     + [
         "గర్భస్రావం",
         "అంగ",
@@ -2244,7 +1829,7 @@ badwords = {
         "x రేట్",
         "xxx",
     ],
-    "vi": english_badwords
     + [
         "sự phá thai",
         "hậu môn",
@@ -2362,7 +1947,7 @@ badwords = {
         "x đánh giá",
         "xxx",
     ],
-    "zh": english_badwords
     + [
         "13.",
         "13点",

 # https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
+english_flagged_words = [
     "anal",
     "bareback",
     "bbw",
     "bdsm",
     "blowjob",
     "blowjobs",
+    "brazzers",
     "bukkake",
     "camgirl",
     "camwhore",
     "cocksucking",
+    "cougar",
     "creampie",
+    "cuckold",
     "cum",
     "cumming",
     "cums",
     "cumshots",
     "cumslut",
     "cunnilingus",
     "deepthroat",
     "deepthroating",
     "dildo",
     "dildos",
     "dogging",
     "doggystyle",
     "dominatrix",
     "erotic",
     "fellatio",
     "femdom",
     "fingering",
     "fisting",
     "footjob",
     "gangbang",
     "handjob",
     "hentai",
     "horney",
     "horniest",
     "horny",
     "jism",
     "jizz",
     "masterbating",
     "masturbate",
     "masturbating",
     "masturbation",
     "milf",
     "orgies",
     "orgy",
     "pegging",
     "porn",
+    "pornhub",
     "porno",
     "pornos",
+    "pornstar",
+    "pornstars",
+    "redtube",
     "rimming",
     "slutty",
     "squirting",
     "strapon",
     "threesome",
     "vibrator",
+    "xhamster",
+    "xnxx",
+    "xvideos",
     "xxx",
+    "youporn",
 ]
+flagged_words = {
+    "ar": english_flagged_words
     + [
         "احتلام",
         "اغتصاب",
         "مفلقسة",
         "نيك",
     ],
+    "ca": english_flagged_words
     + [
         "anal",
         "anus",
         "cul",
         "x classificat",
         "xxx",
     ],
+    "en": english_flagged_words,
+    "es": english_flagged_words
     + [
         "Asesinato",
         "Bollera",
         "tirón",
         "tizón",
         "tonto",
         "vagina",
         "vete a la mierda",
         "viagra",
         "xxx",
         "zurullo",
     ],
+    "eu": english_flagged_words
     + [
         "abortu",
         "anal",
         "bagina",
         "viagra",
     ],
+    "fr": english_flagged_words
     + [
         "MALPT",
         "anal",
         "anus",
         "arracher",
         "baise",
         "baiser",
         "baiseur de chien",
         "testicule",
         "teuch",
         "titt",
         "tremper",
         "tringler",
         "trique",
         "éjaculé",
         "étron",
     ],
+    "hi": english_flagged_words
     + [
         "aand",
         "aandu",
         "fagging",
         "fagots",
         "felching",
         "fuckings",
         "fucks",
         "gaand",
         "होमोसेक्सुअल",
         "होर",
     ],
+    "id": english_flagged_words
     + [
         "abortus",
         "anal",
         "x diberi peringkat",
         "xxx",
     ],
+    "kn": english_flagged_words
     + [
         "ಗರ್ಭಪಾತ",
         "ಗುದ",
         "x ರೇಟೆಡ್",
         "xxx",
     ],
+    "ml": english_flagged_words
     + [
         "ഗർഭഛിദ്രം",
         "വിശപ്പ്",
         "x റേറ്റുചെയ്തു",
         "xxx",
     ],
+    "mr": english_flagged_words
     + [
         "गर्भपात",
         "गुदा",
         "एक्स रेट केले",
         "xxx",
     ],
+    "pt": english_flagged_words
     + [
         "aborto",
         "amador",
         "gozar",
         "grelho",
         "heroína",
         "homoerótico",
         "homosexual",
         "hore",
         "xxx",
         "ânus",
     ],
+    "te": english_flagged_words
     + [
         "గర్భస్రావం",
         "అంగ",
         "x రేట్",
         "xxx",
     ],
+    "vi": english_flagged_words
     + [
         "sự phá thai",
         "hậu môn",
         "x đánh giá",
         "xxx",
     ],
+    "zh": english_flagged_words
     + [
         "13.",
         "13点",

languages_id.py CHANGED Viewed

@@ -6,7 +6,7 @@ langs_id = [
         "lang": "Afrikaans",
         "dataset_id": "af",
         "stopwords_id": "af",
-        "badwords_id": None,
         "fasttext_id": "af",
         "sentencepiece_id": "af",
         "kenlm_id": "af",
@@ -15,7 +15,7 @@ langs_id = [
         "lang": "Arabic",
         "dataset_id": "ar",
         "stopwords_id": "ar",
-        "badwords_id": "ar",
         "fasttext_id": "ar",
         "sentencepiece_id": "ar",
         "kenlm_id": "ar",
@@ -24,7 +24,7 @@ langs_id = [
         "lang": "Egyptian Arabic",
         "dataset_id": "arz",
         "stopwords_id": None,
-        "badwords_id": None,
         "fasttext_id": "arz",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -33,7 +33,7 @@ langs_id = [
         "lang": "Assamese",
         "dataset_id": "as",
         "stopwords_id": None,
-        "badwords_id": None,
         "fasttext_id": "as",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -42,7 +42,7 @@ langs_id = [
         "lang": "Bengali",
         "dataset_id": "bn",
         "stopwords_id": "bn",
-        "badwords_id": None,
         "fasttext_id": "bn",
         "sentencepiece_id": "bn",
         "kenlm_id": "bn",
@@ -51,7 +51,7 @@ langs_id = [
         "lang": "Catalan",
         "dataset_id": "ca",
         "stopwords_id": "ca",
-        "badwords_id": "ca",
         "fasttext_id": "ca",
         "sentencepiece_id": "ca",
         "kenlm_id": "ca",
@@ -60,7 +60,7 @@ langs_id = [
         "lang": "English",
         "dataset_id": "en",
         "stopwords_id": "en",
-        "badwords_id": "en",
         "fasttext_id": "en",
         "sentencepiece_id": "en",
         "kenlm_id": "en",
@@ -69,7 +69,7 @@ langs_id = [
         "lang": "Spanish",
         "dataset_id": "es",
         "stopwords_id": "es",
-        "badwords_id": "es",
         "fasttext_id": "es",
         "sentencepiece_id": "es",
         "kenlm_id": "es",
@@ -78,7 +78,7 @@ langs_id = [
         "lang": "Basque",
         "dataset_id": "eu",
         "stopwords_id": "eu",
-        "badwords_id": "eu",
         "fasttext_id": "eu",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -87,7 +87,7 @@ langs_id = [
         "lang": "French",
         "dataset_id": "fr",
         "stopwords_id": "fr",
-        "badwords_id": "fr",
         "fasttext_id": "fr",
         "sentencepiece_id": "fr",
         "kenlm_id": "fr",
@@ -96,7 +96,7 @@ langs_id = [
         "lang": "Gujarati",
         "dataset_id": "gu",
         "stopwords_id": None,
-        "badwords_id": None,
         "fasttext_id": "gu",
         "sentencepiece_id": "gu",
         "kenlm_id": "gu",
@@ -105,7 +105,7 @@ langs_id = [
         "lang": "Hindi",
         "dataset_id": "hi",
         "stopwords_id": "hi",
-        "badwords_id": "hi",
         "fasttext_id": "hi",
         "sentencepiece_id": "hi",
         "kenlm_id": "hi",
@@ -114,7 +114,7 @@ langs_id = [
         "lang": "Indonesian",
         "dataset_id": "id",
         "stopwords_id": "id",
-        "badwords_id": "id",
         "fasttext_id": "id",
         "sentencepiece_id": "id",
         "kenlm_id": "id",
@@ -123,7 +123,7 @@ langs_id = [
         "lang": "Kannada",
         "dataset_id": "kn",
         "stopwords_id": None,
-        "badwords_id": "kn",
         "fasttext_id": "kn",
         "sentencepiece_id": "kn",
         "kenlm_id": "kn",
@@ -132,7 +132,7 @@ langs_id = [
         "lang": "Malayalam",
         "dataset_id": "ml",
         "stopwords_id": None,
-        "badwords_id": "ml",
         "fasttext_id": "ml",
         "sentencepiece_id": "ml",
         "kenlm_id": "ml",
@@ -141,7 +141,7 @@ langs_id = [
         "lang": "Marathi",
         "dataset_id": "mr",
         "stopwords_id": "mr",
-        "badwords_id": "mr",
         "fasttext_id": "mr",
         "sentencepiece_id": "mr",
         "kenlm_id": "mr",
@@ -150,7 +150,7 @@ langs_id = [
         "lang": "Portuguese",
         "dataset_id": "pt",
         "stopwords_id": "pt",
-        "badwords_id": "pt",
         "fasttext_id": "pt",
         "sentencepiece_id": "pt",
         "kenlm_id": "pt",
@@ -159,7 +159,7 @@ langs_id = [
         "lang": "Somali",
         "dataset_id": "so",
         "stopwords_id": "so",
-        "badwords_id": None,
         "fasttext_id": "so",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -168,7 +168,7 @@ langs_id = [
         "lang": "Swahili",
         "dataset_id": "sw",
         "stopwords_id": "sw",
-        "badwords_id": None,
         "fasttext_id": "sw",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -177,7 +177,7 @@ langs_id = [
         "lang": "Tamil",
         "dataset_id": "ta",
         "stopwords_id": None,
-        "badwords_id": None,
         "fasttext_id": "ta",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -186,7 +186,7 @@ langs_id = [
         "lang": "Telugu",
         "dataset_id": "te",
         "stopwords_id": None,
-        "badwords_id": "te",
         "fasttext_id": "te",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -195,7 +195,7 @@ langs_id = [
         "lang": "Urdu",
         "dataset_id": "ur",
         "stopwords_id": "ur",
-        "badwords_id": None,
         "fasttext_id": "ur",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -204,7 +204,7 @@ langs_id = [
         "lang": "Vietnamese",
         "dataset_id": "vi",
         "stopwords_id": "vi",
-        "badwords_id": "vi",
         "fasttext_id": "vi",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -213,7 +213,7 @@ langs_id = [
         "lang": "Yoruba",
         "dataset_id": "yo",
         "stopwords_id": "yo",
-        "badwords_id": None,
         "fasttext_id": "yo",
         "sentencepiece_id": None,
         "kenlm_id": None,
@@ -222,7 +222,7 @@ langs_id = [
         "lang": "Chinese",
         "dataset_id": "zh",
         "stopwords_id": "zh",
-        "badwords_id": "zh",
         "fasttext_id": "zh",
         "sentencepiece_id": "zh",
         "kenlm_id": "zh",

         "lang": "Afrikaans",
         "dataset_id": "af",
         "stopwords_id": "af",
+        "flagged_words_id": None,
         "fasttext_id": "af",
         "sentencepiece_id": "af",
         "kenlm_id": "af",
         "lang": "Arabic",
         "dataset_id": "ar",
         "stopwords_id": "ar",
+        "flagged_words_id": "ar",
         "fasttext_id": "ar",
         "sentencepiece_id": "ar",
         "kenlm_id": "ar",
         "lang": "Egyptian Arabic",
         "dataset_id": "arz",
         "stopwords_id": None,
+        "flagged_words_id": None,
         "fasttext_id": "arz",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Assamese",
         "dataset_id": "as",
         "stopwords_id": None,
+        "flagged_words_id": None,
         "fasttext_id": "as",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Bengali",
         "dataset_id": "bn",
         "stopwords_id": "bn",
+        "flagged_words_id": None,
         "fasttext_id": "bn",
         "sentencepiece_id": "bn",
         "kenlm_id": "bn",
         "lang": "Catalan",
         "dataset_id": "ca",
         "stopwords_id": "ca",
+        "flagged_words_id": "ca",
         "fasttext_id": "ca",
         "sentencepiece_id": "ca",
         "kenlm_id": "ca",
         "lang": "English",
         "dataset_id": "en",
         "stopwords_id": "en",
+        "flagged_words_id": "en",
         "fasttext_id": "en",
         "sentencepiece_id": "en",
         "kenlm_id": "en",
         "lang": "Spanish",
         "dataset_id": "es",
         "stopwords_id": "es",
+        "flagged_words_id": "es",
         "fasttext_id": "es",
         "sentencepiece_id": "es",
         "kenlm_id": "es",
         "lang": "Basque",
         "dataset_id": "eu",
         "stopwords_id": "eu",
+        "flagged_words_id": "eu",
         "fasttext_id": "eu",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "French",
         "dataset_id": "fr",
         "stopwords_id": "fr",
+        "flagged_words_id": "fr",
         "fasttext_id": "fr",
         "sentencepiece_id": "fr",
         "kenlm_id": "fr",
         "lang": "Gujarati",
         "dataset_id": "gu",
         "stopwords_id": None,
+        "flagged_words_id": None,
         "fasttext_id": "gu",
         "sentencepiece_id": "gu",
         "kenlm_id": "gu",
         "lang": "Hindi",
         "dataset_id": "hi",
         "stopwords_id": "hi",
+        "flagged_words_id": "hi",
         "fasttext_id": "hi",
         "sentencepiece_id": "hi",
         "kenlm_id": "hi",
         "lang": "Indonesian",
         "dataset_id": "id",
         "stopwords_id": "id",
+        "flagged_words_id": "id",
         "fasttext_id": "id",
         "sentencepiece_id": "id",
         "kenlm_id": "id",
         "lang": "Kannada",
         "dataset_id": "kn",
         "stopwords_id": None,
+        "flagged_words_id": "kn",
         "fasttext_id": "kn",
         "sentencepiece_id": "kn",
         "kenlm_id": "kn",
         "lang": "Malayalam",
         "dataset_id": "ml",
         "stopwords_id": None,
+        "flagged_words_id": "ml",
         "fasttext_id": "ml",
         "sentencepiece_id": "ml",
         "kenlm_id": "ml",
         "lang": "Marathi",
         "dataset_id": "mr",
         "stopwords_id": "mr",
+        "flagged_words_id": "mr",
         "fasttext_id": "mr",
         "sentencepiece_id": "mr",
         "kenlm_id": "mr",
         "lang": "Portuguese",
         "dataset_id": "pt",
         "stopwords_id": "pt",
+        "flagged_words_id": "pt",
         "fasttext_id": "pt",
         "sentencepiece_id": "pt",
         "kenlm_id": "pt",
         "lang": "Somali",
         "dataset_id": "so",
         "stopwords_id": "so",
+        "flagged_words_id": None,
         "fasttext_id": "so",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Swahili",
         "dataset_id": "sw",
         "stopwords_id": "sw",
+        "flagged_words_id": None,
         "fasttext_id": "sw",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Tamil",
         "dataset_id": "ta",
         "stopwords_id": None,
+        "flagged_words_id": None,
         "fasttext_id": "ta",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Telugu",
         "dataset_id": "te",
         "stopwords_id": None,
+        "flagged_words_id": "te",
         "fasttext_id": "te",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Urdu",
         "dataset_id": "ur",
         "stopwords_id": "ur",
+        "flagged_words_id": None,
         "fasttext_id": "ur",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Vietnamese",
         "dataset_id": "vi",
         "stopwords_id": "vi",
+        "flagged_words_id": "vi",
         "fasttext_id": "vi",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Yoruba",
         "dataset_id": "yo",
         "stopwords_id": "yo",
+        "flagged_words_id": None,
         "fasttext_id": "yo",
         "sentencepiece_id": None,
         "kenlm_id": None,
         "lang": "Chinese",
         "dataset_id": "zh",
         "stopwords_id": "zh",
+        "flagged_words_id": "zh",
         "fasttext_id": "zh",
         "sentencepiece_id": "zh",
         "kenlm_id": "zh",

parameters_filtering.py CHANGED Viewed

@@ -39,8 +39,8 @@ parameters_filtering_default = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": False,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.70,
     "cond_check_perplexity": False,
@@ -70,8 +70,8 @@ parameters_filtering_af = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.6,
     "cond_check_perplexity": True,
@@ -101,8 +101,8 @@ parameters_filtering_ar = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -132,8 +132,8 @@ parameters_filtering_arz = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -163,8 +163,8 @@ parameters_filtering_as = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -194,8 +194,8 @@ parameters_filtering_bn = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.05,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -225,8 +225,8 @@ parameters_filtering_ca = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -256,8 +256,8 @@ parameters_filtering_en = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.3,
-    "cond_check_badwords": True,
-    "badwords_max_cutoff": 0.045,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.80,
     "cond_check_perplexity": True,
@@ -287,8 +287,8 @@ parameters_filtering_es = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.2,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -318,8 +318,8 @@ parameters_filtering_eu = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -349,8 +349,8 @@ parameters_filtering_fr = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.15,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -380,8 +380,8 @@ parameters_filtering_gu = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -411,8 +411,8 @@ parameters_filtering_hi = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -442,8 +442,8 @@ parameters_filtering_id = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.25,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -473,8 +473,8 @@ parameters_filtering_kn = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -504,8 +504,8 @@ parameters_filtering_ml = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -535,8 +535,8 @@ parameters_filtering_mr = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -566,8 +566,8 @@ parameters_filtering_pt = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.15,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
@@ -597,8 +597,8 @@ parameters_filtering_so = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": False,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -628,8 +628,8 @@ parameters_filtering_sw = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -659,8 +659,8 @@ parameters_filtering_ta = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -690,8 +690,8 @@ parameters_filtering_te = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -721,8 +721,8 @@ parameters_filtering_ur = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -752,8 +752,8 @@ parameters_filtering_vi = {
     "words_augmentation_join_char": " ",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -783,8 +783,8 @@ parameters_filtering_yo = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
@@ -814,8 +814,8 @@ parameters_filtering_zh = {
     "words_augmentation_join_char": "",
     "cond_check_stopwords": False,
     "stopwords_min_cutoff": 0,
-    "cond_check_badwords": False,
-    "badwords_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,

     "words_augmentation_join_char": "",
     "cond_check_stopwords": False,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.70,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.6,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.05,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.3,
+    "cond_check_flagged_words": True,
+    "flagged_words_max_cutoff": 0.045,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.80,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.2,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.15,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.25,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0.15,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": True,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": False,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": " ",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": True,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,
     "words_augmentation_join_char": "",
     "cond_check_stopwords": False,
     "stopwords_min_cutoff": 0,
+    "cond_check_flagged_words": False,
+    "flagged_words_max_cutoff": 0.2,
     "cond_check_lang_id": True,
     "lang_id_min_cutoff": 0.75,
     "cond_check_perplexity": False,