Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Files Community

HugoLaurencon commited on Jan 11, 2022

Commit

d1e3e7b

1 Parent(s): 22701ae

test

Browse files

Files changed (11) hide show

app.py +6 -0
badwords.py +2682 -0
en.arpa.bin +3 -0
en.sp.model +3 -0
filtering.py +879 -0
languages_id.py +231 -0
lid.176.bin +3 -0
normalization.py +52 -0
requirements.txt → packages.txt +0 -0
parameters_filtering.py +852 -0
stopwords.py +0 -0

app.py CHANGED Viewed

@@ -13,6 +13,8 @@ import numpy as np
 import matplotlib.pyplot as plt
 class Visualization:
     def __init__(
@@ -390,6 +392,9 @@ class Visualization:
                 ax.set_ylabel("frequency in the documents")
                 st.pyplot(fig)
     def download_data(self):
         st.header("Download data")
@@ -408,6 +413,7 @@ class Visualization:
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
         self.download_data()

 import matplotlib.pyplot as plt
+from filtering import Filtering
 class Visualization:
     def __init__(
                 ax.set_ylabel("frequency in the documents")
                 st.pyplot(fig)
+    def check_personal_doc(self):
+        pass
     def download_data(self):
         st.header("Download data")
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
+        self.check_personal_doc()
         self.download_data()

badwords.py ADDED Viewed

	@@ -0,0 +1,2682 @@

+# Merge
+# https://github.com/zacanger/profane-words
+# and
+# https://github.com/thisandagain/washyourmouthoutwithsoap/blob/develop/data/build.json
+# and
+# https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
+english_badwords = [
+    "abuse",
+    "anal",
+    "anilingus",
+    "anus",
+    "aroused",
+    "arse",
+    "arsehole",
+    "ass",
+    "asses",
+    "assfuck",
+    "asshat",
+    "asshole",
+    "assholes",
+    "autoerotic",
+    "bangbros",
+    "banging",
+    "bareback",
+    "bastard",
+    "bastards",
+    "bazongas",
+    "bbw",
+    "bdsm",
+    "biatch",
+    "bicurious",
+    "bigass",
+    "bigtits",
+    "bimbo",
+    "bimbos",
+    "bitch",
+    "bitches",
+    "bitching",
+    "blowjob",
+    "blowjobs",
+    "boche",
+    "boner",
+    "boners",
+    "boob",
+    "boobies",
+    "boobs",
+    "booty",
+    "brothel",
+    "buceta",
+    "bugger",
+    "buggered",
+    "buggery",
+    "bukkake",
+    "bule",
+    "buttcheeks",
+    "buttfuck",
+    "butthead",
+    "butthole",
+    "buttplug",
+    "cameltoe",
+    "camgirl",
+    "camwhore",
+    "chink",
+    "chinks",
+    "cialis",
+    "clit",
+    "clitoris",
+    "clits",
+    "clitty",
+    "clusterfuck",
+    "cock",
+    "cock-head",
+    "cockblock",
+    "cockfight",
+    "cockhead",
+    "cocks",
+    "cocksman",
+    "cocksucker",
+    "cocksucking",
+    "coital",
+    "coitus",
+    "coochie",
+    "cooly",
+    "coon",
+    "coons",
+    "copulate",
+    "cowgirl",
+    "crabs",
+    "creampie",
+    "cum",
+    "cumming",
+    "cums",
+    "cumshot",
+    "cumshots",
+    "cumslut",
+    "cunnilingus",
+    "cunny",
+    "cunt",
+    "cunts",
+    "cybersex",
+    "darkey",
+    "darkie",
+    "darkies",
+    "darky",
+    "deepthroat",
+    "deepthroating",
+    "dick",
+    "dickhole",
+    "dicks",
+    "dildo",
+    "dildos",
+    "dogging",
+    "doggy-style",
+    "doggystyle",
+    "dominatrix",
+    "dommes",
+    "dong",
+    "dp",
+    "dupa",
+    "dyke",
+    "dykes",
+    "ecchi",
+    "ejaculate",
+    "ejaculated",
+    "ejaculates",
+    "ejaculating",
+    "ejaculation",
+    "ejaculations",
+    "enema",
+    "erect",
+    "erection",
+    "ero",
+    "erotic",
+    "erotism",
+    "escort",
+    "fag",
+    "fagging",
+    "faggot",
+    "fagot",
+    "fagots",
+    "fags",
+    "felch",
+    "fellate",
+    "fellatio",
+    "femdom",
+    "fetish",
+    "figging",
+    "fingerbang",
+    "fingering",
+    "fisted",
+    "fister",
+    "fisting",
+    "floozy",
+    "fondle",
+    "footfetish",
+    "footjob",
+    "foreskin",
+    "fornicate",
+    "foursome",
+    "fuck",
+    "fuckable",
+    "fuckbook",
+    "fuckboy",
+    "fuckbuddy",
+    "fucked",
+    "fucker",
+    "fuckers",
+    "fuckfest",
+    "fuckhole",
+    "fuckin",
+    "fucking",
+    "fucks",
+    "fuk",
+    "fukin",
+    "fuking",
+    "g-spot",
+    "gangbang",
+    "gangbanged",
+    "gangbanger",
+    "gangbangs",
+    "genital",
+    "genitals",
+    "gigolo",
+    "glans",
+    "gonad",
+    "gonads",
+    "gook",
+    "gringo",
+    "gringos",
+    "grope",
+    "gspot",
+    "guido",
+    "handjob",
+    "haole",
+    "hapa",
+    "hardcore",
+    "hardon",
+    "harem",
+    "hentai",
+    "hindoo",
+    "hoe",
+    "hoes",
+    "honky",
+    "hooker",
+    "hookers",
+    "hooter",
+    "hooters",
+    "hori",
+    "horndog",
+    "horney",
+    "horniest",
+    "horny",
+    "humped",
+    "humper",
+    "humping",
+    "hussy",
+    "hymen",
+    "ikey",
+    "incest",
+    "injun",
+    "intercourse",
+    "interracial",
+    "jack-off",
+    "jackoff",
+    "jailbait",
+    "jerk-off",
+    "jerkoff",
+    "jiggy",
+    "jism",
+    "jizz",
+    "jizzed",
+    "kaffir",
+    "kafir",
+    "kike",
+    "kikes",
+    "kinkster",
+    "kinky",
+    "kkk",
+    "klan",
+    "kraut",
+    "labia",
+    "lapdance",
+    "libido",
+    "licker",
+    "licking",
+    "limey",
+    "lingerie",
+    "livesex",
+    "lolita",
+    "lovemaking",
+    "lust",
+    "lusting",
+    "masochist",
+    "masterbate",
+    "masterbating",
+    "masterbation",
+    "masturbate",
+    "masturbating",
+    "masturbation",
+    "milf",
+    "minge",
+    "missionary",
+    "molest",
+    "molestation",
+    "molester",
+    "munging",
+    "muschi",
+    "nads",
+    "naked",
+    "necked",
+    "necro",
+    "negress",
+    "negro",
+    "negroes",
+    "negroid",
+    "negros",
+    "nig",
+    "nigar",
+    "nigga",
+    "niggas",
+    "niggaz",
+    "nigger",
+    "niggers",
+    "nigra",
+    "nipple",
+    "nipples",
+    "nookie",
+    "nooky",
+    "nooner",
+    "nude",
+    "nudie",
+    "nudity",
+    "nymph",
+    "nympho",
+    "nymphomania",
+    "orgasim",
+    "orgasm",
+    "orgasms",
+    "orgies",
+    "orgy",
+    "orifice",
+    "p0rn",
+    "paedophile",
+    "pantie",
+    "panties",
+    "panty",
+    "pastie",
+    "pecker",
+    "pedo",
+    "pedophile",
+    "pedophilia",
+    "pedophiliac",
+    "peeper",
+    "peepshow",
+    "pegging",
+    "penetrate",
+    "penetration",
+    "penile",
+    "penis",
+    "penises",
+    "penus",
+    "perv",
+    "phallic",
+    "phonesex",
+    "pickaninnies",
+    "pimp",
+    "playboy",
+    "playgirl",
+    "poontang",
+    "porn",
+    "porno",
+    "pornography",
+    "pornos",
+    "pr0n",
+    "premature",
+    "preteen",
+    "pron",
+    "prostitute",
+    "pube",
+    "pubes",
+    "pubic",
+    "pubis",
+    "punani",
+    "pussies",
+    "pussy",
+    "pussys",
+    "pusy",
+    "puta",
+    "puto",
+    "queef",
+    "quickie",
+    "quicky",
+    "quim",
+    "randy",
+    "rape",
+    "raped",
+    "raper",
+    "raping",
+    "rapist",
+    "rectum",
+    "redneck",
+    "rednecks",
+    "redskin",
+    "redskins",
+    "rimjob",
+    "rimming",
+    "russki",
+    "s&m",
+    "sadism",
+    "sadist",
+    "sambo",
+    "santorum",
+    "schlong",
+    "scissoring",
+    "semen",
+    "sex",
+    "sexed",
+    "sexi",
+    "sexing",
+    "sexo",
+    "sexpot",
+    "sextoy",
+    "sexual",
+    "sexually",
+    "sexx",
+    "sexxx",
+    "sexxxy",
+    "sexxy",
+    "sexy",
+    "sh!t",
+    "sh1t",
+    "shagging",
+    "shemale",
+    "sissy",
+    "skank",
+    "skanks",
+    "slapper",
+    "slut",
+    "sluts",
+    "slutting",
+    "slutty",
+    "smut",
+    "smutty",
+    "sodomise",
+    "sodomite",
+    "sodomize",
+    "sodomy",
+    "spank",
+    "sperm",
+    "spic",
+    "spick",
+    "splooge",
+    "spooge",
+    "squaw",
+    "squirting",
+    "steamy",
+    "stiffy",
+    "strapon",
+    "suck",
+    "sucked",
+    "sucker",
+    "sucking",
+    "sucks",
+    "swallow",
+    "swallower",
+    "swinger",
+    "teabagging",
+    "testical",
+    "testicle",
+    "testicles",
+    "testis",
+    "threesome",
+    "threeway",
+    "titfuck",
+    "titjob",
+    "tits",
+    "tittie",
+    "titties",
+    "titty",
+    "tittyfuck",
+    "tity",
+    "toots",
+    "topless",
+    "trannie",
+    "tranny",
+    "tribadism",
+    "twat",
+    "twats",
+    "undies",
+    "undressing",
+    "upskirt",
+    "vag",
+    "vagina",
+    "vaginal",
+    "viagra",
+    "vibrator",
+    "virgin",
+    "vixen",
+    "voyeur",
+    "vulva",
+    "wank",
+    "wanker",
+    "wanking",
+    "wazoo",
+    "wedgie",
+    "wench",
+    "wetback",
+    "whore",
+    "whored",
+    "whorehouse",
+    "whores",
+    "whoring",
+    "wigger",
+    "willie",
+    "willies",
+    "willy",
+    "wog",
+    "wop",
+    "x-rated",
+    "xxx",
+    "xxxxxx",
+    "yaoi",
+    "yid",
+    "zoophile",
+    "zoophilia",
+]
+badwords = {
+    "ar": english_badwords
+    + [
+        "احتلام",
+        "اغتصاب",
+        "بز",
+        "بزاز",
+        "بظر",
+        "بيضان",
+        "تمص",
+        "ثدي",
+        "جماع",
+        "حلمة",
+        "خنثي",
+        "خول",
+        "زب",
+        "سحاق",
+        "سحاقية",
+        "سكس",
+        "شاذ",
+        "شرج",
+        "شرموطة",
+        "شهوة",
+        "طيز",
+        "عاهرة",
+        "عرص",
+        "فرج",
+        "قحبة",
+        "قضيب",
+        "كس",
+        "لبوة",
+        "لحس",
+        "لعق",
+        "لواط",
+        "لوطي",
+        "مبادل",
+        "متناك",
+        "متناكة",
+        "مص",
+        "مفلقسة",
+        "نيك",
+    ],
+    "ca": english_badwords
+    + [
+        "avortament",
+        "anal",
+        "anus",
+        "cul",
+        "ass-fucker",
+        "asss",
+        "asshole",
+        "assholes",
+        "bolera",
+        "boles",
+        "bastardo",
+        "bellend",
+        "bestial",
+        "bestialitat",
+        "puta",
+        "femelles",
+        "picant",
+        "sagnant",
+        "mamada",
+        "bollok",
+        "boob",
+        "pits",
+        "buceta",
+        "bum",
+        "culata",
+        "catifa muncher",
+        "picar",
+        "cipa",
+        "clitoris",
+        "polla",
+        "galletejador",
+        "gallines",
+        "coon",
+        "merda",
+        "cum",
+        "correguda",
+        "cunillingus",
+        "boig",
+        "maleït",
+        "consolador",
+        "consoladors",
+        "dink",
+        "canalla",
+        "duche",
+        "dique",
+        "ejaculació",
+        "ejaculat",
+        "ejacula",
+        "ejaculant",
+        "fag",
+        "fagging",
+        "fagot",
+        "fagots",
+        "fanny",
+        "felching",
+        "fel.lació",
+        "brida",
+        "follar",
+        "follat",
+        "escuradents",
+        "follant",
+        "folles",
+        "fucks",
+        "empacadora de llaminadures",
+        "déu maldit",
+        "deu meu",
+        "infern",
+        "hore",
+        "córrer",
+        "retrocés",
+        "kock",
+        "llavis",
+        "lujuria",
+        "lució",
+        "masoquista",
+        "masturbarse",
+        "puta mare",
+        "nazi",
+        "nigger",
+        "negres",
+        "orgasim",
+        "orgasme",
+        "orgasmes",
+        "pecker",
+        "penis",
+        "piss",
+        "mossegat",
+        "pisser",
+        "pisses",
+        "pissing",
+        "treure de polleguera",
+        "caca",
+        "porno",
+        "pornografia",
+        "picades",
+        "pube",
+        "coques",
+        "gatet",
+        "violació",
+        "violador",
+        "recte",
+        "retard",
+        "rimming",
+        "sàdic",
+        "cargolar",
+        "escrot",
+        "semen",
+        "sexe",
+        "shag",
+        "borratxos",
+        "transsexual",
+        "mossegar",
+        "shitted",
+        "skank",
+        "smegma",
+        "smut",
+        "arrebat",
+        "fill de puta",
+        "spac",
+        "spunk",
+        "testicle",
+        "tit",
+        "tetas",
+        "titt",
+        "turd",
+        "vagina",
+        "viagra",
+        "vulva",
+        "wang",
+        "wank",
+        "x classificat",
+        "xxx",
+    ],
+    "en": english_badwords,
+    "es": english_badwords
+    + [
+        "Asesinato",
+        "Bollera",
+        "Cabrón",
+        "Caca",
+        "Chupada",
+        "Chupapollas",
+        "Chupetón",
+        "Concha de tu madre",
+        "Coprofagía",
+        "Coño",
+        "Culo",
+        "Drogas",
+        "Esperma",
+        "Fiesta de salchichas",
+        "Follador",
+        "Follar",
+        "Gilipichis",
+        "Gilipollas",
+        "Hacer una paja",
+        "Haciendo el amor",
+        "Heroína",
+        "Hija de puta",
+        "Hijaputa",
+        "Hijo de puta",
+        "Hijoputa",
+        "Idiota",
+        "Imbécil",
+        "Jilipollas",
+        "Kapullo",
+        "Lameculos",
+        "Maciza",
+        "Macizorra",
+        "Mamada",
+        "Marica",
+        "Mariconazo",
+        "Maricón",
+        "Mierda",
+        "Nazi",
+        "Orina",
+        "Pedo",
+        "Pendejo",
+        "Pervertido",
+        "Pezón",
+        "Pinche",
+        "Pis",
+        "Prostituta",
+        "Puta",
+        "Racista",
+        "Ramera",
+        "Semen",
+        "Sexo",
+        "Sexo oral",
+        "Soplagaitas",
+        "Soplapollas",
+        "Sádico",
+        "Tetas grandes",
+        "Travesti",
+        "Trio",
+        "Tía buena",
+        "Verga",
+        "Vulva",
+        "aborto",
+        "agallas",
+        "anal",
+        "ano",
+        "arrebatar",
+        "asno",
+        "atornillar",
+        "bastardo",
+        "bestial",
+        "bestialidad",
+        "bolas",
+        "bollok",
+        "bolsa de pelota",
+        "brida",
+        "buceta",
+        "cabron",
+        "cagadas",
+        "cagado",
+        "cagando",
+        "campana",
+        "carajo",
+        "chupar la polla",
+        "cipa",
+        "clítoris",
+        "concha",
+        "consolador",
+        "consoladores",
+        "corrida",
+        "coño",
+        "coños",
+        "culo",
+        "culos",
+        "cunillingus",
+        "córneo",
+        "de mierda",
+        "dique",
+        "duche",
+        "enojado",
+        "escroto",
+        "espacio",
+        "estúpido",
+        "extremo",
+        "eyacula",
+        "eyaculación",
+        "eyaculado",
+        "eyacular",
+        "fagging",
+        "felación",
+        "felching",
+        "folla",
+        "follada",
+        "follador de culo",
+        "folladores",
+        "follar",
+        "fudge packer",
+        "gallos",
+        "grieta",
+        "hacerse una paja",
+        "hijo de puta",
+        "hore",
+        "infierno",
+        "kock",
+        "labios vaginales",
+        "los pechos",
+        "lujuria",
+        "madre folladora",
+        "maldita sea",
+        "maldito",
+        "maldito sea",
+        "mamada",
+        "mapache",
+        "maricones",
+        "maricón",
+        "martillo",
+        "masoquista",
+        "masturbarse",
+        "mear",
+        "mierda",
+        "molesto",
+        "muncher alfombra",
+        "nazi",
+        "negro",
+        "niggers",
+        "orgasimo",
+        "orgasmo",
+        "orgasmos",
+        "orinando",
+        "pelusa",
+        "pene",
+        "perra",
+        "perras",
+        "perro follador",
+        "pinchazo",
+        "pinchazos",
+        "pisser",
+        "polla",
+        "porno",
+        "pornografía",
+        "pube",
+        "puta",
+        "putas",
+        "pájaro carpintero",
+        "quejas",
+        "recto",
+        "retardar",
+        "rimming",
+        "sangriento",
+        "semen",
+        "sexo",
+        "skank",
+        "smegma",
+        "sádico",
+        "testículo",
+        "teta",
+        "tetas",
+        "tirón",
+        "tizón",
+        "tonto",
+        "transexual",
+        "vagina",
+        "vete a la mierda",
+        "viagra",
+        "violación",
+        "violador",
+        "vulva",
+        "wang",
+        "x clasificado",
+        "xxx",
+        "zurullo",
+    ],
+    "eu": english_badwords
+    + [
+        "abortu",
+        "anal",
+        "ipurdi",
+        "kabroi",
+        "puta",
+        "clitoris",
+        "cunillingus",
+        "madarikatu",
+        "zakil",
+        "hazia isuri",
+        "arraio",
+        "izorratu",
+        "infernu",
+        "emagaldu",
+        "lizunkeri",
+        "lizun",
+        "masokista",
+        "masturbatu",
+        "nazi",
+        "beltz",
+        "orgasmo",
+        "pixa",
+        "porno",
+        "pornografia",
+        "alu",
+        "bortxaketa",
+        "bortxatzaile",
+        "sadista",
+        "ipurzulo",
+        "hazi",
+        "semen",
+        "sexu",
+        "kaka",
+        "putaseme",
+        "barrabil",
+        "titi",
+        "bagina",
+        "viagra",
+    ],
+    "fr": english_badwords
+    + [
+        "MALPT",
+        "anal",
+        "anus",
+        "arracher",
+        "avortement",
+        "baise",
+        "baiser",
+        "baiseur de chien",
+        "baiseurs",
+        "baisée",
+        "bander",
+        "bellend",
+        "bestial",
+        "bestialité",
+        "bigornette",
+        "bite",
+        "bitte",
+        "bloblos",
+        "bollok",
+        "boob",
+        "bordel",
+        "bourré",
+        "bourrée",
+        "bout",
+        "brackmard",
+        "branlage",
+        "branler",
+        "branlette",
+        "branleur",
+        "branleuse",
+        "bride",
+        "brouter le cresson",
+        "buceta",
+        "caca",
+        "chatte",
+        "chattes",
+        "chiasse",
+        "chienne",
+        "chiennes",
+        "chier",
+        "chiottes",
+        "chié",
+        "cipa",
+        "clito",
+        "clitoris",
+        "clochard",
+        "cochonneries",
+        "con",
+        "connard",
+        "connards",
+        "connasse",
+        "conne",
+        "convoitise",
+        "coq",
+        "coqs",
+        "corné",
+        "couilles",
+        "cramouille",
+        "cran",
+        "cul",
+        "culs",
+        "cunillingus",
+        "damné",
+        "des balles",
+        "digue",
+        "duché",
+        "déconne",
+        "déconner",
+        "emballeur de fudge",
+        "emmerdant",
+        "emmerder",
+        "emmerdeur",
+        "emmerdeuse",
+        "enculer",
+        "enculeur",
+        "enculeurs",
+        "enculé",
+        "enculée",
+        "enfer",
+        "enfoiré",
+        "enfoirée",
+        "espacer",
+        "fagging",
+        "fagot",
+        "fagots",
+        "faire chier",
+        "fellation",
+        "fente",
+        "fille de pute",
+        "fils de pute",
+        "folle",
+        "foutre",
+        "fuckings",
+        "gerbe",
+        "gerber",
+        "godemiché",
+        "godes",
+        "gouine",
+        "grande folle",
+        "grogniasse",
+        "gueule",
+        "hore",
+        "jouir",
+        "kock",
+        "la putain de ta mère",
+        "les lèvres",
+        "les seins",
+        "luxure",
+        "masochiste",
+        "masturber",
+        "merde",
+        "merdeuse",
+        "merdeux",
+        "merdique",
+        "meuf",
+        "mère enculée",
+        "ménage à trois",
+        "mésange",
+        "nazi",
+        "negro",
+        "nique ta mère",
+        "nique ta race",
+        "nègre",
+        "nègres",
+        "orgasim",
+        "orgasme",
+        "orgasmes",
+        "palucher",
+        "penchant",
+        "pipe",
+        "pipi",
+        "piquer",
+        "piqûres",
+        "pisse",
+        "pisser",
+        "porno",
+        "pornographie",
+        "pouffiasse",
+        "pousse-crotte",
+        "pube",
+        "putain",
+        "putain de",
+        "pute",
+        "pédale",
+        "pédé",
+        "pénis",
+        "péter",
+        "queue",
+        "quéquette",
+        "ramoner",
+        "rectum",
+        "retard",
+        "rimming",
+        "râpé",
+        "sac de billes",
+        "sac à foutre",
+        "sac à merde",
+        "sadique",
+        "salaud",
+        "salope",
+        "salopes",
+        "sanglant",
+        "scrotum",
+        "se branler",
+        "seins",
+        "sexe",
+        "skank",
+        "smegma",
+        "sperme",
+        "suce",
+        "suceuse",
+        "tanche",
+        "tapette",
+        "tapis muncher",
+        "testicule",
+        "teuch",
+        "titt",
+        "transexuelle",
+        "tremper",
+        "tringler",
+        "trique",
+        "troncher",
+        "trou du cul",
+        "turlute",
+        "vagin",
+        "viagra",
+        "violeur",
+        "vulve",
+        "wang",
+        "x évalué",
+        "xxx",
+        "zigounette",
+        "zizi",
+        "zut",
+        "éjaculant",
+        "éjaculation",
+        "éjacule",
+        "éjaculer",
+        "éjaculé",
+        "étron",
+    ],
+    "hi": english_badwords
+    + [
+        "aand",
+        "aandu",
+        "balatkar",
+        "balatkari",
+        "behen chod",
+        "beti chod",
+        "bhadva",
+        "bhadve",
+        "bhandve",
+        "bhangi",
+        "bhootni ke",
+        "bhosad",
+        "bhosadi ke",
+        "bitching",
+        "blowjob",
+        "bollok",
+        "boobe",
+        "buceta",
+        "chakke",
+        "chinaal",
+        "chinki",
+        "chod",
+        "chodu",
+        "chodu bhagat",
+        "chooche",
+        "choochi",
+        "choope",
+        "choot",
+        "choot ke baal",
+        "chootia",
+        "chootiya",
+        "chuche",
+        "chuchi",
+        "chudaap",
+        "chudai khanaa",
+        "chudam chudai",
+        "chude",
+        "chut",
+        "chut ka chuha",
+        "chut ka churan",
+        "chut ka mail",
+        "chut ke baal",
+        "chut ke dhakkan",
+        "chut maarli",
+        "chutad",
+        "chutadd",
+        "chutan",
+        "chutia",
+        "chutiya",
+        "cipa",
+        "cunillingus",
+        "dink",
+        "duche",
+        "ejaculated",
+        "ejaculates",
+        "ejaculating",
+        "fagging",
+        "fagots",
+        "felching",
+        "fuckers",
+        "fuckings",
+        "fucks",
+        "gaand",
+        "gaandfat",
+        "gaandmasti",
+        "gaandufad",
+        "gandfattu",
+        "gandu",
+        "gashti",
+        "gasti",
+        "ghassa",
+        "ghasti",
+        "gucchi",
+        "gucchu",
+        "harami",
+        "haramzade",
+        "hawas",
+        "hawas ke pujari",
+        "hijda",
+        "hijra",
+        "jhant",
+        "jhant chaatu",
+        "jhant ka keeda",
+        "jhant ke baal",
+        "jhant ke pissu",
+        "jhantu",
+        "kamine",
+        "kaminey",
+        "kanjar",
+        "kutta",
+        "kutta kamina",
+        "kutte ki aulad",
+        "kutte ki jat",
+        "kuttiya",
+        "loda",
+        "lodu",
+        "lund",
+        "lund choos",
+        "lund ka bakkal",
+        "lund khajoor",
+        "lundtopi",
+        "lundure",
+        "lusting",
+        "maa ki chut",
+        "maal",
+        "madar chod",
+        "madarchod",
+        "madhavchod",
+        "masochist",
+        "mooh mein le",
+        "mutth",
+        "mutthal",
+        "najayaz",
+        "najayaz aulaad",
+        "najayaz paidaish",
+        "orgasim",
+        "paki",
+        "pataka",
+        "patakha",
+        "pisser",
+        "pisses",
+        "pissing",
+        "pube",
+        "pussies",
+        "raand",
+        "randaap",
+        "randi",
+        "randi rona",
+        "rimming",
+        "saala",
+        "saala kutta",
+        "saali kutti",
+        "saali randi",
+        "shagging",
+        "shite",
+        "shitted",
+        "shitting",
+        "shitty",
+        "skank",
+        "sluts",
+        "spac",
+        "suar",
+        "suar ke lund",
+        "suar ki aulad",
+        "tatte",
+        "tatti",
+        "teri maa ka bhosada",
+        "teri maa ka boba chusu",
+        "teri maa ki behenchod ",
+        "teri maa ki chut",
+        "tharak",
+        "tharki",
+        "titt",
+        "tu chuda",
+        "turd",
+        "wank",
+        "xxx",
+        "अंडकोश की थैली",
+        "अंडा",
+        "अरे नहीं",
+        "अश्लील",
+        "उल्लू",
+        "एक्स रेटेड",
+        "ओगाज़्म",
+        "कमबख्त",
+        "काम करना",
+        "कामोद्दीपक चित्र",
+        "कालीन का चूरा",
+        "किन्नर",
+        "कुतिया",
+        "कुत्ते-कमीने",
+        "कून",
+        "कॉक",
+        "गड़बड़",
+        "गधा कमीने",
+        "गधे",
+        "गर्भपात",
+        "गुदा",
+        "गेंद का थैला",
+        "गेंदों",
+        "गोली चलाने की आवाज़",
+        "घटिया इंसान",
+        "चाकलेट का रंग",
+        "चिंक",
+        "चुभन",
+        "चूची",
+        "चूतड़",
+        "चोंच",
+        "छीनना",
+        "जी में आये करो",
+        "झटका बंद",
+        "ठगना पैकर",
+        "डिल्डो",
+        "दुष्ट",
+        "दूर जाने का अभद्र संकेत देना",
+        "धत् तेरे की",
+        "नरक",
+        "नाजी",
+        "निकला हुआ किनारा",
+        "नितंब",
+        "पंगा लेना",
+        "पिछाड़ी",
+        "पीड़न कामुक",
+        "पेशाब",
+        "पॉर्न",
+        "फटना",
+        "फूहड़",
+        "बकवास",
+        "बट",
+        "बलात्कार",
+        "बहुत मदहोश",
+        "बांध",
+        "बिल्ली",
+        "बेल अंत",
+        "बेवकूफों",
+        "बोल पड़ना",
+        "भगवान-शापित",
+        "भगशेफ",
+        "मल",
+        "मलाशय",
+        "माँ कमीने",
+        "मुखमैथुन",
+        "मुर्गा",
+        "मुर्गा के",
+        "मुर्गा चूसने वाला",
+        "मूर्ख",
+        "मैल",
+        "योनि",
+        "योनी",
+        "यौन-संबंध",
+        "रक्तरंजित",
+        "लानत है",
+        "लिंग",
+        "लुटेरा",
+        "लेबिया",
+        "वहशी",
+        "वहशीता",
+        "वियाग्रा",
+        "वीर्य",
+        "वेश्या",
+        "वैंग",
+        "वो साले",
+        "शिफ़्ट को",
+        "शिश्नमल",
+        "संभोग सुख",
+        "सह",
+        "सह शॉट",
+        "साहस",
+        "सिगरेट",
+        "सींग का बना हुआ",
+        "स्तन",
+        "स्तनों",
+        "हवस",
+        "हस्तमैथुन",
+        "होमोसेक्सुअल",
+        "होर",
+    ],
+    "id": english_badwords
+    + [
+        "abortus",
+        "anal",
+        "dubur",
+        "pantat",
+        "bajingan",
+        "keledai",
+        "keparat",
+        "tas bola",
+        "bola",
+        "bellend",
+        "kejam",
+        "kebinatangan",
+        "menggerutu",
+        "pelacur",
+        "berdarah",
+        "blowjob",
+        "bollok",
+        "dada",
+        "payudara",
+        "buceta",
+        "gelandangan",
+        "pengunyah karpet",
+        "celah",
+        "cipa",
+        "kelentit",
+        "kokang",
+        "pengisap ayam",
+        "ayam",
+        "coon",
+        "sampah",
+        "air mani",
+        "cumshot",
+        "cunillingus",
+        "vagina",
+        "mengutuk",
+        "kontol",
+        "dildo",
+        "dink",
+        "anjing-keparat",
+        "duche",
+        "tanggul",
+        "berejakulasi",
+        "ejakulasi",
+        "homo",
+        "fagging",
+        "kayu bakar",
+        "penggemar",
+        "felching",
+        "fellatio",
+        "flens",
+        "brengsek",
+        "kacau",
+        "sialan",
+        "persetan",
+        "pengepakan fudge",
+        "terkutuk",
+        "ya tuhan",
+        "neraka",
+        "hore",
+        "terangsang",
+        "kock",
+        "labia",
+        "nafsu",
+        "bernafsu",
+        "masokis",
+        "masturbasi",
+        "keparat ibu",
+        "nazi",
+        "orang negro",
+        "negro",
+        "orgasim",
+        "orgasme",
+        "cotok",
+        "penis",
+        "kencing",
+        "kesal",
+        "pisser",
+        "bikin",
+        "buritan",
+        "porno",
+        "pornografi",
+        "tusukan",
+        "menusuk",
+        "pube",
+        "pussies",
+        "memperkosa",
+        "pemerkosa",
+        "memperlambat",
+        "rimming",
+        "sadis",
+        "meniduri",
+        "skrotum",
+        "seks",
+        "bercinta",
+        "waria",
+        "kotoran",
+        "shite",
+        "kengerian",
+        "dikirim",
+        "buang hajat",
+        "menyebalkan",
+        "smegma",
+        "jelaga",
+        "merebut",
+        "dasar bajingan",
+        "ruang",
+        "keberanian",
+        "buah pelir",
+        "titt",
+        "viagra",
+        "vulva",
+        "wang",
+        "terima kasih",
+        "x diberi peringkat",
+        "xxx",
+    ],
+    "kn": english_badwords
+    + [
+        "ಗರ್ಭಪಾತ",
+        "ಗುದ",
+        "ಗುದದ್ವಾರ",
+        "ಕತ್ತೆ",
+        "ಆಶ್-ಫಕರ್",
+        "ಅಸ್ಹೋಲ್",
+        "ಅಸೋಲೆಸ್",
+        "ಬಾಲ್ಬಾಗ್",
+        "ಚೆಂಡುಗಳು",
+        "ಬಾಸ್ಟರ್ಡ್",
+        "ಬೆಲೆಂಡ್",
+        "ಮೃದ್ವಂಗಿ",
+        "ಪ್ರಾಣಿಜನ್ಯತೆ",
+        "ಬಿಚ್",
+        "ಬಿಟ್ಚಿಸ್",
+        "ಬೆಚಿಂಗ್",
+        "ರಕ್ತಸಿಕ್ತ",
+        "ಬ್ಲೋಜಾಬ್",
+        "ಬೊಲ್ಲೊಕ್",
+        "ಕುರುಚಲು ಗಿಡ",
+        "ಬೂಬಿಗಳು",
+        "ಸ್ತನಗಳನ್ನು",
+        "ಬುಕೆಟಾ",
+        "ತಿಕ",
+        "ಬಟ್",
+        "ಕಾರ್ಪೆಟ್ ಮಂಚರ್",
+        "ಚಿಂಕ್",
+        "ಸಿಪಾ",
+        "ಚಂದ್ರನಾಡಿ",
+        "ಕೋಳಿ",
+        "ಕೋಳಿ ಸಕ್ಕರ್",
+        "ಕಾಕ್ಸ್",
+        "ಕೂನ್",
+        "ಅಮೇಧ್ಯ",
+        "ಕಮ್",
+        "ಕಮ್ಶಾಟ್",
+        "ಕುನಿಲ್ಲಸ್",
+        "ಕಂಟ್",
+        "ಡ್ಯಾಮ್",
+        "ಡಿಕ್",
+        "ದ್ವಿಧ್ರುವಿ",
+        "dildos",
+        "ಡಿಂಕ್",
+        "ನಾಯಿ-ಫಕರ್",
+        "ಡಚೆ",
+        "ಡೈಕ್",
+        "ಹೊರಹೊಮ್ಮಿಸು",
+        "ಸ್ಫೂರ್ತಿ",
+        "ಎಜಾಕ್ಯುಲೇಟ್ಸ್",
+        "ಇಜಲಲೇಟಿಂಗ್",
+        "ಉದ್ಗಾರ",
+        "ತಮಾಷೆ",
+        "ಮಂದಗತಿ",
+        "ಮಬ್ಬು",
+        "fagots",
+        "ಫ್ಯಾನಿ",
+        "ಹೊಡೆತ",
+        "ಪತನ",
+        "ಚಾಚುಪಟ್ಟಿ",
+        "ಫಕ್",
+        "ನಾಶವಾಗಿದ್ದನು",
+        "ಫಕರ್",
+        "fuckers",
+        "ಫಕಿಂಗ್",
+        "ಫಕಿಂಗ್ಸ್",
+        "ಇಷ್ಟಪಡುತ್ತಾನೆ",
+        "ಮಿಠಾಯಿ ಪ್ಯಾಕರ್",
+        "ದೇವರನ್ನು ಹಾನಿಗೊಳಗಾಯಿತು",
+        "ಗಾಡ್ಡಮ್",
+        "ನರಕ",
+        "ಹೋರ್",
+        "ಮೊನಚಾದ",
+        "ಜರ್ಕ್-ಆಫ್",
+        "ಕೋಕ್",
+        "ಯೋನಿಯ",
+        "ಕಾಮ",
+        "ಕಾಮುಕ",
+        "ಮಾಸೋಚಿಸ್ಟ್",
+        "ಹಸ್ತಮೈಥುನ ಮಾಡು",
+        "ತಾಯಿ ಫಕರ್",
+        "ನಾಜಿ",
+        "ನಿಗರ್",
+        "ನಿಗ್ಗರ್ಗಳು",
+        "ಒರಾಸಿಮ್",
+        "ಪರಾಕಾಷ್ಠೆ",
+        "ಪರಾಕಾಷ್ಠೆಗಳನ್ನು",
+        "ಪೆಕರ್",
+        "ಶಿಶ್ನ",
+        "ಮೂತ್ರ ವಿಸರ್ಜಿಸು",
+        "ನಿರುತ್ಸಾಹಗೊಂಡಿದೆ",
+        "ಪಿಸರ್",
+        "ಮೂತ್ರಪಿಂಡಗಳು",
+        "pissing",
+        "ಪಿಸ್ಸಾಫ್",
+        "ಪೂಪ್",
+        "ಅಶ್ಲೀಲತೆ",
+        "ಅಶ್ಲೀಲ",
+        "ಚುಚ್ಚು",
+        "ಪ್ರಿಕ್ಸ್",
+        "ಪಬ್",
+        "ಪುಸಿಗಳು",
+        "ಪುಸಿ",
+        "ಅತ್ಯಾಚಾರ",
+        "ಅತ್ಯಾಚಾರಿ",
+        "ಗುದನಾಳದ",
+        "ರಿಟಾರ್ಡ್",
+        "ಹಚ್ಚುವುದು",
+        "ದುಃಖಗಾರ",
+        "ತಿರುಗಿಸುವುದು",
+        "ಸ್ಕ್ರೋಟಮ್",
+        "ವೀರ್ಯ",
+        "ಲೈಂಗಿಕತೆ",
+        "ಶಾಗ್",
+        "ಶಾಗ್ಗಿಂಗ್",
+        "ಶೆಮೇಲ್",
+        "ಶಿಟ್",
+        "ಷೈಟ್",
+        "ಶಿಟ್ಸ್",
+        "shitted",
+        "ಅಲುಗಾಡುವಿಕೆ",
+        "ಅಸಹ್ಯ",
+        "ಸ್ಕಾಂಕ್",
+        "ಸೂಳೆ",
+        "ಸ್ಲಟ್ಗಳು",
+        "ಸ್ಮೆಗ್ಮಾ",
+        "ಕೊಳೆತ",
+        "ಸ್ನ್ಯಾಚ್",
+        "ಮಗ-ಆಫ್-ಬಿಚ್",
+        "spac",
+        "ಉಬ್ಬು",
+        "ವೃಷಣ",
+        "ಟಿಟ್",
+        "ಚೇಕಡಿ ಹಕ್ಕಿಗಳು",
+        "turd",
+        "ಯೋನಿ",
+        "ವಯಾಗ್ರ",
+        "ವಾಂಗ್",
+        "ಮುಷ್ಕರ",
+        "x ರೇಟೆಡ್",
+        "xxx",
+    ],
+    "ml": english_badwords
+    + [
+        "ഗർഭഛിദ്രം",
+        "വിശപ്പ്",
+        "മലദ്വാരം",
+        "കഴുത",
+        "അസി ഫക്കർ",
+        "കഴുതകളെ",
+        "ആസ്ഹോൾ",
+        "അശ്ളീലങ്ങൾ",
+        "ബോൾബാഗ്",
+        "പന്തുകൾ",
+        "തന്തയില്ലാത്തവൻ",
+        "ബെല്ലെൻഡ്",
+        "മൃഗീയമായ",
+        "മൃഗീയത",
+        "ബിച്ച്",
+        "ബിച്ചുകൾ",
+        "ബിപിഡിംഗ്",
+        "രക്തരൂക്ഷിതമായ",
+        "ആശ്വാസം",
+        "ബലോക്ക്",
+        "ബോബ്",
+        "പൂക്കൾ",
+        "സ്തനങ്ങൾ",
+        "ബ്യൂട്ടാ",
+        "ബം",
+        "മയക്കുമരുന്ന്",
+        "പരവതാനി മാൻച്ചർ",
+        "ചുംബ്",
+        "സിപാ",
+        "ക്ലോറിസിസ്",
+        "കോക്ക്",
+        "കോക്ക് സക്കർ",
+        "കോക്സ്",
+        "കോൺ",
+        "ക്രാപ്പ്",
+        "ശുക്ലം",
+        "പുരുഷാരം",
+        "സി",
+        "മുഷിഞ്ഞ",
+        "കഷ്ടം",
+        "ഡിക്ക്",
+        "ഡിൽഡോ",
+        "dildos",
+        "ഡൈൻ",
+        "നായ-ഫക്കർ",
+        "ഡച്ച്",
+        "ഡൈകെ",
+        "ശമിപ്പിക്കുക",
+        "മോഷ്ടിച്ചു",
+        "വികാരങ്ങൾ",
+        "വിരസത",
+        "മടി",
+        "ക്ഷീണിപ്പിക്കുക",
+        "fagot",
+        "വഞ്ചന",
+        "ഫാനി",
+        "വേദന",
+        "flange",
+        "ഊമ്പി",
+        "സംഭോഗം ചെയ്യുക",
+        "ഫക്കർ",
+        "നർമ്മം",
+        "ഫഡ്ജ് പാക്കർ",
+        "ദൈവം-കൊള്ളിത",
+        "ഗോഡ്ഡം",
+        "നരകം",
+        "വയ്ക്കുക",
+        "വൃത്തികെട്ട",
+        "ജെർക് ഓഫ്",
+        "കിക്ക്",
+        "ലാബിയ",
+        "മോഹം",
+        "മോഹഭംഗം",
+        "മാസോച്ചിസ്റ്റ്",
+        "സ്വയംഭോഗം ചെയ്യുക",
+        "അമ്മ ഫക്കർ",
+        "നാസി",
+        "നിഗർ",
+        "മയക്കുമരുന്നുകൾ",
+        "രതിമൂർച്ഛ",
+        "പെക്കർ",
+        "ലിംഗം",
+        "മൂത്രമൊഴിക്കുക",
+        "കുഴഞ്ഞുവീഴുന്നു",
+        "പിസ്സർ",
+        "പിസ്സകൾ",
+        "pissing",
+        "പിസ്സോഫ്",
+        "poop",
+        "അശ്ലീലം",
+        "അശ്ലീലത",
+        "പ്രാവി",
+        "വിസർജ്യങ്ങൾ",
+        "പ്യൂബ്",
+        "pussies",
+        "pussy",
+        "ബലാൽസംഗം",
+        "ബലാത്സംഗം",
+        "മലാശയം",
+        "തുടരുക",
+        "റിമ്മിംഗ്",
+        "സചിസ്റ്റ്",
+        "വഞ്ചി",
+        "പുല്ല്",
+        "ബീജം",
+        "ശവം",
+        "ഷാഗിംഗ്",
+        "അവൾ",
+        "ഷീറ്റ്",
+        "ഷെയ്റ്റ്",
+        "shits",
+        "തിന്നിട്ടില്ല",
+        "ഷോർട്ട്",
+        "ഷൈറ്റി",
+        "സ്കാൻ",
+        "മന്ദഹസരം",
+        "സ്നെഗമാ",
+        "പുഞ്ചിരി",
+        "പിടിക്കുക",
+        "വെറുക്കപ്പെട്ടയാൾ",
+        "സ്പെയ്ക്",
+        "തുളച്ച്",
+        "വൃഷണം",
+        "പേ",
+        "ടിത്ത്",
+        "കുഴപ്പമില്ല",
+        "യോനി",
+        "വരാഗ്ര",
+        "വാൽവ",
+        "വാങ്",
+        "വാൻ",
+        "വേശ്യ",
+        "x റേറ്റുചെയ്തു",
+        "xxx",
+    ],
+    "mr": english_badwords
+    + [
+        "गर्भपात",
+        "गुदा",
+        "गाढव",
+        "गांडुळ",
+        "asses",
+        "asshole",
+        "assholes",
+        "ballbag",
+        "चेंडू",
+        "बॅस्टर्ड",
+        "बेलेंड",
+        "बेस्टियल",
+        "प्राण्यांबरोबर",
+        "कुत्री",
+        "बिट्स",
+        "खूनी",
+        "blowjob",
+        "बोलोक",
+        "बोब",
+        "स्तन",
+        "बसीटा",
+        "बम",
+        "बट",
+        "कार्पेट मुन्चर",
+        "चिंक",
+        "सिपा",
+        "क्लिटोरिस",
+        "मुर्ख",
+        "मांसाहारी",
+        "कॉक्स",
+        "कॉनन",
+        "बकवास",
+        "सह",
+        "cumshot",
+        "कनिलिंगस",
+        "कांट",
+        "धिक्कार",
+        "डिक",
+        "dildo",
+        "डिल्डो",
+        "डंक",
+        "duche",
+        "डाईक",
+        "उद्गार",
+        "उत्साही",
+        "ejaculates",
+        "उत्सुकता",
+        "स्खलन",
+        "फॅग",
+        "फॅगिंग",
+        "फॅगॉट",
+        "फॅगॉट्स",
+        "फॅनी",
+        "फेलिंग",
+        "फॅलेटीओ",
+        "निकला",
+        "fucked",
+        "गुप्तचर",
+        "fuckers",
+        "fucking",
+        "fuckings",
+        "fucks",
+        "फडगे पॅकर",
+        "देव-शापित",
+        "देव",
+        "नरक",
+        "होरे",
+        "शिंग",
+        "झटका बंद",
+        "कॉक",
+        "लॅबिया",
+        "वासना",
+        "मासोचिस्ट",
+        "हस्तमैथुन करा",
+        "आई माकड",
+        "नाझी",
+        "निगर",
+        "निगार",
+        "ऑर्गॅसिम",
+        "संभोग",
+        "orgasms",
+        "चापटी",
+        "पुरुषाचे जननेंद्रिय",
+        "पेशी",
+        "pissed",
+        "पिसर",
+        "pisses",
+        "पिसिंग",
+        "पिसोफ",
+        "घाट",
+        "अश्लील",
+        "पोर्नोग्राफी",
+        "मुरुम",
+        "प्रिक्स",
+        "प्यूब",
+        "pussies",
+        "मांजर",
+        "बलात्कार",
+        "गुदाशय",
+        "मंद",
+        "rimming",
+        "दुःखी",
+        "screwing",
+        "स्क्रोटम",
+        "वीर्य",
+        "लिंग",
+        "शेग",
+        "shagging",
+        "शेमले",
+        "विचित्र",
+        "shite",
+        "shits",
+        "shitted",
+        "shitting",
+        "shitty",
+        "घाणेरडा",
+        "फट",
+        "sluts",
+        "सुगंध",
+        "स्मट",
+        "छेडछाड",
+        "मुलगा-एक-कुत्री",
+        "spac",
+        "तिरस्कार",
+        "परीक्षक",
+        "शीर्षक",
+        "टिट",
+        "टर्ड",
+        "योनी",
+        "वियाग्रा",
+        "वल्वा",
+        "वांग",
+        "विंक",
+        "वेश्या",
+        "एक्स रेट केले",
+        "xxx",
+    ],
+    "pt": english_badwords
+    + [
+        "aborto",
+        "amador",
+        "anal",
+        "aparafusar",
+        "aranha",
+        "ariano",
+        "arrebatar",
+        "ass-filho da puta",
+        "asses",
+        "balalao",
+        "bastardo",
+        "bate uma",
+        "bellend",
+        "bestial",
+        "bestialidade",
+        "bicha",
+        "bichano",
+        "bichanos",
+        "bichas",
+        "biscate",
+        "bissexual",
+        "boceta",
+        "bolas",
+        "bollok",
+        "boob",
+        "boquete",
+        "bosta",
+        "braulio de borracha",
+        "buceta",
+        "bumbum",
+        "bunda",
+        "burro",
+        "cabrao",
+        "cacete",
+        "cadela",
+        "cadelas",
+        "cagando",
+        "cagar",
+        "calçado",
+        "camisinha",
+        "caralho",
+        "cerveja",
+        "chochota",
+        "chupar",
+        "cipa",
+        "clitoris",
+        "clitóris",
+        "cobiçoso",
+        "cocaína",
+        "cocô",
+        "coito",
+        "colhoes",
+        "com tesão",
+        "comedor de tapetes",
+        "comer",
+        "cona",
+        "consolo",
+        "coon",
+        "coragem",
+        "corno",
+        "cu",
+        "cunillingus",
+        "dar o rabo",
+        "desgraçado",
+        "dildo",
+        "dildos",
+        "dink",
+        "dog-filho da puta",
+        "droga",
+        "duche",
+        "dum raio",
+        "ejacula",
+        "ejaculado",
+        "ejacular",
+        "ejaculação",
+        "empacotador de fudge",
+        "escroto",
+        "esporra",
+        "estuprador",
+        "estupro",
+        "fagging",
+        "fanny",
+        "fecal",
+        "felação",
+        "felching",
+        "fenda",
+        "filho da puta",
+        "filhos da puta",
+        "foda",
+        "foda-se",
+        "fode",
+        "foder",
+        "fodido",
+        "frango assado",
+        "galo",
+        "galos",
+        "gozada",
+        "gozar",
+        "grelho",
+        "heroína",
+        "homem gay",
+        "homoerótico",
+        "homosexual",
+        "hore",
+        "idiota",
+        "idiotas",
+        "inferno",
+        "kock",
+        "lolita",
+        "luxúria",
+        "lábios",
+        "lésbica",
+        "maldito",
+        "mama",
+        "masoquista",
+        "masturbar",
+        "merda",
+        "merdas",
+        "mesa",
+        "mijando",
+        "mijar",
+        "nazista",
+        "negro",
+        "niggers",
+        "não me chateies",
+        "orgasim",
+        "orgasmo",
+        "orgasmos",
+        "otário",
+        "paneleiro",
+        "passar um cheque",
+        "pau",
+        "peidar",
+        "peitos",
+        "peituda",
+        "pica",
+        "picadas",
+        "pinto",
+        "pisser",
+        "porcaria",
+        "porno",
+        "pornografia",
+        "pornô",
+        "porra",
+        "prostituta",
+        "pube",
+        "punheta",
+        "puta",
+        "puta que pariu",
+        "puta que te pariu",
+        "putaria",
+        "puto",
+        "pênis",
+        "queca",
+        "retardar",
+        "reto",
+        "rimming",
+        "sacanagem",
+        "saco",
+        "saco de bola",
+        "sangrento",
+        "sapatona",
+        "sexo",
+        "shite",
+        "skank",
+        "smegma",
+        "spac",
+        "sujeira",
+        "sádico",
+        "sêmen",
+        "testículo",
+        "tetas",
+        "titt",
+        "torneira",
+        "transando",
+        "transar",
+        "transsexual",
+        "trepada",
+        "vadia",
+        "vadias",
+        "vagabunda",
+        "vagabundo",
+        "vagina",
+        "vai tomar no cu",
+        "vai-te foder",
+        "veado",
+        "viagra",
+        "vibrador",
+        "vulva",
+        "wang",
+        "x avaliado",
+        "xana",
+        "xixi",
+        "xochota",
+        "xxx",
+        "ânus",
+    ],
+    "te": english_badwords
+    + [
+        "గర్భస్రావం",
+        "అంగ",
+        "పాయువు",
+        "గాడిద",
+        "గాడిద-fucker",
+        "asses",
+        "assholes",
+        "బాల్బ్యాగ్",
+        "బంతుల్లో",
+        "బాస్టర్డ్",
+        "బెల్లెండ్",
+        "మృగ",
+        "బెస్టియాలిటీ",
+        "బిచ్",
+        "bitches",
+        "బిట్చింగ్",
+        "బ్లడీ",
+        "blowjob",
+        "బోల్లక",
+        "బూబ్",
+        "వక్షోజాలను",
+        "ఛాతీ",
+        "buceta",
+        "బం",
+        "బట్",
+        "కార్పెట్ ముంచర్",
+        "చింక్",
+        "cipa",
+        "స్త్రీగుహ్యాంకురము",
+        "ఆత్మవిశ్వాసం",
+        "కాక్-సక్కర్",
+        "కాక్స్",
+        "కూన్",
+        "చెత్త",
+        "కం",
+        "cumshot",
+        "క్యునిల్లింగస్",
+        "కంట్",
+        "తిట్టు",
+        "డిక్",
+        "లైంగిక సంతృప్తి కోసం స్త్రీలు ఉపయోగించే పురుషాంగము వంటి పరికరము",
+        "డిల్డోస్",
+        "dink",
+        "కుక్క-fucker",
+        "డూష్",
+        "డైక్",
+        "స్ఖలించు",
+        "ఎజాక్యులేటెడ్",
+        "ఎజాక్యులేట్స్",
+        "ఎరాక్యులేటింగ్",
+        "స్ఖలనం",
+        "నవుకరు",
+        "ఫాగ్గింగ్",
+        "ఫాగాట్",
+        "ఫగాట్స్",
+        "fanny",
+        "ఫెల్చింగ్",
+        "కుడుచుట",
+        "అచ్చు",
+        "ఫక్",
+        "ఇబ్బంది పెట్టాడు",
+        "fucker",
+        "ఫకర్స్",
+        "ఫకింగ్",
+        "ఫకింగ్స్",
+        "ఫక్స్",
+        "ఫడ్జ్ ప్యాకర్",
+        "దేవతలా మంచిది",
+        "గాడ్డామ్",
+        "నరకం",
+        "హోర్",
+        "horny",
+        "జెర్క్-ఆఫ్",
+        "కాక్",
+        "పెదవి",
+        "కామం",
+        "మనసు పడ్డట్లు చిత్రించారు",
+        "masochist",
+        "హస్తప్రయోగం",
+        "తల్లి ఫెకర్",
+        "నాజీ",
+        "నిగ్గర్",
+        "నిగ్గర్స్",
+        "ఆర్గాసిమ్",
+        "స్కలనం",
+        "orgasms",
+        "pecker",
+        "పురుషాంగం",
+        "విసర్జన",
+        "pissed",
+        "పిస్సర్",
+        "పిస్సీస్",
+        "పిస్సింగ్",
+        "పిస్సాఫ్",
+        "poop",
+        "శృంగార",
+        "పోర్నో",
+        "అశ్లీల",
+        "బుడతడు",
+        "ప్రిక్స్",
+        "ప్యూబ్",
+        "pussies",
+        "పుస్సీ",
+        "రేప్",
+        "ఉన్నప్పటికీ బలాత్కారం",
+        "పురీషనాళం",
+        "రిటార్డ్",
+        "రిమ్మింగ్",
+        "పీడన కాముకత",
+        "screwing",
+        "స్క్రోటమ్",
+        "వీర్యం",
+        "సెక్స్",
+        "బొచ్చు",
+        "షగ్గింగ్",
+        "షీమేల్",
+        "ఒంటి",
+        "షైట్",
+        "షిట్స్",
+        "షిట్టెడ్",
+        "షిట్టింగ్",
+        "shitty",
+        "స్కాన్క్",
+        "నీతి",
+        "స్లట్స్",
+        "శిశ్న",
+        "స్మట్",
+        "స్నాచ్",
+        "ఒక బిచ్ కుమారుడు ఆఫ్",
+        "spac",
+        "స్పంక్",
+        "వృషణాలు",
+        "తునక",
+        "టిట్స్",
+        "టిట్",
+        "turd",
+        "యోని",
+        "వయాగ్రా",
+        "జననాంగం",
+        "వాంగ్",
+        "వ్యాంక్",
+        "వేశ్య",
+        "x రేట్",
+        "xxx",
+    ],
+    "vi": english_badwords
+    + [
+        "sự phá thai",
+        "hậu môn",
+        "mông",
+        "đồ ngu",
+        "lừa",
+        "lỗ đít",
+        "túi bóng",
+        "những quả bóng",
+        "đồ khốn",
+        "tuyệt vời",
+        "mục sư",
+        "lòng tốt",
+        "chó cái",
+        "dính máu",
+        "công việc thổi",
+        "bollok",
+        "boob",
+        "ngực",
+        "buceta",
+        "ăn mày",
+        "thảm muncher",
+        "sứt mẻ",
+        "cipa",
+        "âm vật",
+        "gà",
+        "gà hút",
+        "gà trống",
+        "coon",
+        "tào lao",
+        "kiêm",
+        "cum",
+        "cunillingus",
+        "lồn",
+        "chỉ trích",
+        "tinh ranh",
+        "dương vật giả",
+        "dink",
+        "chó-chó",
+        "duche",
+        "đê",
+        "xuất tinh",
+        "fag",
+        "đóng băng",
+        "fagot",
+        "đồ ăn vặt",
+        "người hâm mộ",
+        "nỉ",
+        "thất bại",
+        "mặt bích",
+        "chết tiệt",
+        "quái",
+        "đụ",
+        "ôm",
+        "đóng gói fudge",
+        "địa ngục",
+        "có",
+        "sừng",
+        "giật",
+        "kock",
+        "môi âm",
+        "ham muốn",
+        "khổ dâm",
+        "thủ dâm",
+        "mẹ kiếp",
+        "nazi",
+        "người da đen",
+        "người mách nước",
+        "cực khoái",
+        "người mổ",
+        "dương vật",
+        "đi tiểu",
+        "bực mình",
+        "đái",
+        "phân",
+        "khiêu dâm",
+        "nội dung khiêu dâm",
+        "châm",
+        "chích",
+        "pube",
+        "pussies",
+        "âm hộ",
+        "hiếp dâm",
+        "trực tràng",
+        "chậm phát triển",
+        "xé",
+        "người tàn bạo",
+        "vặn vít",
+        "bìu",
+        "tinh dịch",
+        "tình dục",
+        "lông",
+        "xáo trộn",
+        "đồng tính",
+        "cứt",
+        "shite",
+        "ván trượt",
+        "đĩ",
+        "quần lót",
+        "smegma",
+        "xì trum",
+        "con trai",
+        "spac",
+        "spunk",
+        "tinh hoàn",
+        "ăn miếng trả miếng",
+        "titt",
+        "cỏ",
+        "âm đạo",
+        "viagra",
+        "âm môn",
+        "wang",
+        "đã ngủ",
+        "con điếm",
+        "x đánh giá",
+        "xxx",
+    ],
+    "zh": english_badwords
+    + [
+        "13.",
+        "13点",
+        "㞗",
+        "三级片",
+        "下三烂",
+        "下贱",
+        "个老子的",
+        "九游",
+        "乳",
+        "乳交",
+        "乳头",
+        "乳房",
+        "乳波臀浪",
+        "交配",
+        "仆街",
+        "仆街",
+        "他奶奶",
+        "他奶奶的",
+        "他奶娘的",
+        "他妈",
+        "他妈ㄉ王八蛋",
+        "他妈地",
+        "他妈的",
+        "他娘",
+        "他马的",
+        "你个傻比",
+        "你他马的",
+        "你全家",
+        "你奶奶的",
+        "你她马的",
+        "你妈",
+        "你妈的",
+        "你娘",
+        "你娘卡好",
+        "你娘咧",
+        "你它妈的",
+        "你它马的",
+        "你是鸡",
+        "你是鸭",
+        "你老味",
+        "你老母",
+        "你老闆",
+        "你马的",
+        "做爱",
+        "傻比",
+        "傻逼",
+        "册那",
+        "冚家拎",
+        "冚家鏟",
+        "军妓",
+        "几八",
+        "几叭",
+        "几巴",
+        "几芭",
+        "刚度",
+        "刚瘪三",
+        "包皮",
+        "十三点",
+        "卖B",
+        "卖比",
+        "卖淫",
+        "卵",
+        "卵子",
+        "双峰微颤",
+        "口交",
+        "口肯",
+        "叫床",
+        "吃屎",
+        "后庭",
+        "吹箫",
+        "咸家伶",
+        "咸家鏟",
+        "塞你公",
+        "塞你娘",
+        "塞你母",
+        "塞你爸",
+        "塞你老师",
+        "塞你老母",
+        "处女",
+        "外阴",
+        "大卵子",
+        "大卵泡",
+        "大鸡巴",
+        "奶",
+        "奶奶的熊",
+        "奶子",
+        "奸",
+        "奸你",
+        "她妈地",
+        "她妈的",
+        "她马的",
+        "妈B",
+        "妈个B",
+        "妈个比",
+        "妈个老比",
+        "妈妈的",
+        "妈比",
+        "妈的",
+        "妈的B",
+        "妈逼",
+        "妓",
+        "妓女",
+        "妓院",
+        "妳她妈的",
+        "妳妈的",
+        "妳娘的",
+        "妳老母的",
+        "妳马的",
+        "姘头",
+        "姣西",
+        "姦",
+        "娘个比",
+        "娘的",
+        "婊子",
+        "婊子养的",
+        "嫖娼",
+        "嫖客",
+        "它妈地",
+        "它妈的",
+        "密洞",
+        "射你",
+        "射精",
+        "小乳头",
+        "小卵子",
+        "小卵泡",
+        "小瘪三",
+        "小肉粒",
+        "小骚比",
+        "小骚货",
+        "小鸡巴",
+        "小鸡鸡",
+        "尻",
+        "屁眼",
+        "屁股",
+        "屄",
+        "屌",
+        "屎忽",
+        "巨乳",
+        "干x娘",
+        "干七八",
+        "干你",
+        "干你妈",
+        "干你娘",
+        "干你老母",
+        "干你良",
+        "干妳妈",
+        "干妳娘",
+        "干妳老母",
+        "干妳马",
+        "干您娘",
+        "干机掰",
+        "干死CS",
+        "干死GM",
+        "干死你",
+        "干死客服",
+        "幹",
+        "强奸",
+        "强奸你",
+        "性",
+        "性交",
+        "性器",
+        "性无能",
+        "性爱",
+        "情色",
+        "想上你",
+        "懆您妈",
+        "懆您娘",
+        "懒8",
+        "懒八",
+        "懒叫",
+        "懒教",
+        "成人",
+        "我操你祖宗十八代",
+        "扒光",
+        "打炮",
+        "打飞机",
+        "抽插",
+        "招妓",
+        "插你",
+        "插死你",
+        "撒尿",
+        "撚",
+        "操你",
+        "操你全家",
+        "操你奶奶",
+        "操你妈",
+        "操你娘",
+        "操你祖宗",
+        "操你老妈",
+        "操你老母",
+        "操妳",
+        "操妳全家",
+        "操妳妈",
+        "操妳娘",
+        "操妳祖宗",
+        "操机掰",
+        "操比",
+        "操逼",
+        "放荡",
+        "日他娘",
+        "日你",
+        "日你妈",
+        "日你老娘",
+        "日你老母",
+        "日批",
+        "月经",
+        "机八",
+        "机巴",
+        "机机歪歪",
+        "杂种",
+        "柒",
+        "浪叫",
+        "淫",
+        "淫乱",
+        "淫妇",
+        "淫棍",
+        "淫水",
+        "淫秽",
+        "淫荡",
+        "淫西",
+        "湿透的内裤",
+        "激情",
+        "灨你娘",
+        "烂货",
+        "烂逼",
+        "爛",
+        "狗屁",
+        "狗日",
+        "狗狼养的",
+        "玉杵",
+        "王八蛋",
+        "瓜娃子",
+        "瓜婆娘",
+        "瓜批",
+        "瘪三",
+        "白烂",
+        "白痴",
+        "白癡",
+        "硬膠",
+        "祖宗",
+        "私服",
+        "笨實",
+        "笨蛋",
+        "粉腸",
+        "精子",
+        "老二",
+        "老味",
+        "老母",
+        "老瘪三",
+        "老骚比",
+        "老骚货",
+        "肉壁",
+        "肉棍子",
+        "肉棒",
+        "肉缝",
+        "肏",
+        "肛交",
+        "肥西",
+        "色情",
+        "花柳",
+        "荡妇",
+        "賤",
+        "贝肉",
+        "贱B",
+        "贱人",
+        "贱货",
+        "贼你妈",
+        "赛你老母",
+        "赛妳阿母",
+        "赣您娘",
+        "躝癱",
+        "轮奸",
+        "迷药",
+        "逼",
+        "逼样",
+        "野鸡",
+        "閪",
+        "阳具",
+        "阳萎",
+        "阴唇",
+        "阴户",
+        "阴核",
+        "阴毛",
+        "阴茎",
+        "阴道",
+        "阴部",
+        "陰莖",
+        "雞巴",
+        "靠北",
+        "靠母",
+        "靠爸",
+        "靠背",
+        "靠腰",
+        "驶你公",
+        "驶你娘",
+        "驶你母",
+        "驶你爸",
+        "驶你老师",
+        "驶你老母",
+        "骚比",
+        "骚货",
+        "骚逼",
+        "鬼公",
+        "鳩",
+        "鸡8",
+        "鸡八",
+        "鸡叭",
+        "鸡吧",
+        "鸡奸",
+        "鸡巴",
+        "鸡芭",
+        "鸡鸡",
+        "龟儿子",
+        "龟头",
+    ],
+}

en.arpa.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e90c9b25af01dcaa2667ed45d012d891269760fc6eccfe8dbbd161eb20e01d7d
+size 4403509656

en.sp.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:262c0b0bd4ebc592e439453bc7e006d0ed12d1914e206a1fb8c7fba091f52c4d
+size 1389058

filtering.py ADDED Viewed

	@@ -0,0 +1,879 @@

+import re
+import numpy as np
+import fasttext
+import sentencepiece
+import kenlm
+import pathlib
+from languages_id import langs_id
+from parameters_filtering import parameters_filtering
+from normalization import normalization
+from stopwords import stopwords
+from badwords import badwords
+class LoadParameters:
+    @staticmethod
+    def load_parameters(lang_dataset_id):
+        if lang_dataset_id in parameters_filtering:
+            param = parameters_filtering[lang_dataset_id]
+        else:
+            param = parameters_filtering["default"]
+        return param
+    @staticmethod
+    def load_stopwords(lang_dataset_id):
+        stopwords_lang_id = langs_id.loc[
+            langs_id["dataset_id"] == lang_dataset_id, "stopwords_id"
+        ].iloc[0]
+        if stopwords_lang_id:
+            stopwords_lang = set(stopwords[stopwords_lang_id])
+        else:
+            stopwords_lang = None
+        return stopwords_lang
+    @staticmethod
+    def load_badwords(lang_dataset_id):
+        badwords_lang_id = langs_id.loc[
+            langs_id["dataset_id"] == lang_dataset_id, "badwords_id"
+        ].iloc[0]
+        if badwords_lang_id:
+            badwords_lang = set(badwords[badwords_lang_id])
+        else:
+            badwords_lang = None
+        return badwords_lang
+    @staticmethod
+    def load_model_lang_id(lang_dataset_id, path_fasttext_model):
+        fasttext_lang_id = langs_id.loc[
+            langs_id["dataset_id"] == lang_dataset_id, "fasttext_id"
+        ].iloc[0]
+        if fasttext_lang_id:
+            model_lang_id = fasttext.load_model(path_fasttext_model)
+        else:
+            model_lang_id = None
+        return model_lang_id
+    @staticmethod
+    def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model):
+        sentencepiece_lang_id = langs_id.loc[
+            langs_id["dataset_id"] == lang_dataset_id, "sentencepiece_id"
+        ].iloc[0]
+        if sentencepiece_lang_id:
+            sentencepiece_model = sentencepiece.SentencePieceProcessor()
+            sentencepiece_model.load(path_sentencepiece_model)
+        else:
+            sentencepiece_model = None
+        return sentencepiece_model
+    @staticmethod
+    def load_kenlm_model(lang_dataset_id, path_kenlm_model):
+        kenlm_lang_id = langs_id.loc[
+            langs_id["dataset_id"] == lang_dataset_id, "kenlm_id"
+        ].iloc[0]
+        if kenlm_lang_id:
+            kenlm_model = kenlm.Model(path_kenlm_model)
+        else:
+            kenlm_model = None
+        return kenlm_model
+class ModifyingDocuments:
+    @staticmethod
+    def remove_empty_el_from_list(list_):
+        return [el for el in list_ if el]
+    @staticmethod
+    def remove_non_printing_characters(document, non_printing_characters_re):
+        return non_printing_characters_re.sub("", document)
+    @staticmethod
+    def uniform_whitespace(
+        document,
+        whitespace=[
+            " ",
+            " ",
+            " ",
+            " ",
+            " ",
+            "　",
+            " ",
+            " ",
+            " ",
+            " ",
+            "",
+            "",
+        ],
+    ):
+        """There are different whitespace characters."""
+        whitespace = set(whitespace)
+        document = "".join(
+            [char if char not in whitespace else " " for char in document]
+        )
+        return document
+    @staticmethod
+    def replace_digits_with_zeros(document, digits_re):
+        return digits_re.sub("0", document)
+    @staticmethod
+    def replace_unicode_punctuation(document, unicode_punctuation):
+        return "".join(unicode_punctuation.get(c, c) for c in document)
+    @staticmethod
+    def normalization(
+        document,
+        remove_non_printing_characters,
+        strip,
+        lower_case,
+        uniform_whitespace,
+        replace_digits_with_zeros,
+        replace_unicode_punctuation,
+        non_printing_characters_re=normalization["non_printing_characters_re"],
+        digits_re=normalization["digits_re"],
+        unicode_punctuation=normalization["unicode_punctuation"],
+    ):
+        if remove_non_printing_characters:
+            document = ModifyingDocuments.remove_non_printing_characters(
+                document, non_printing_characters_re
+            )
+        if strip:
+            document = document.strip()
+        if not document:
+            return document
+        if lower_case:
+            document = document.lower()
+        if uniform_whitespace:
+            document = ModifyingDocuments.uniform_whitespace(document)
+        if replace_digits_with_zeros:
+            document = ModifyingDocuments.replace_digits_with_zeros(document, digits_re)
+        if replace_unicode_punctuation:
+            document = ModifyingDocuments.replace_unicode_punctuation(
+                document, unicode_punctuation
+            )
+        return document
+    @staticmethod
+    def tokenization(document, sentencepiece_model, join_on_whitespace):
+        document_tokenized = sentencepiece_model.encode_as_pieces(document)
+        if join_on_whitespace:
+            document_tokenized = " ".join(document_tokenized)
+        return document_tokenized
+    @staticmethod
+    def split_on_whitespace(
+        document,
+        new_line=False,
+        tab=False,
+    ):
+        """This method also removes concatenated spaces."""
+        sep = [" "] + new_line * ["\n"] + tab * ["\t"]
+        sep = "|".join(sep)
+        split_document = re.split(sep, document)
+        split_document = ModifyingDocuments.remove_empty_el_from_list(split_document)
+        return split_document
+    @staticmethod
+    def strip(document, strip_characters):
+        """Way faster than document.strip(strip_characters)
+        since strip_characters is now a set instead of a str,
+        and it contains a lot of elements (all the emojis)."""
+        if not document:
+            return document
+        beg_ind = 0
+        end_ind = len(document)
+        for i in range(len(document)):
+            if document[i] in strip_characters:
+                beg_ind += 1
+            else:
+                break
+        for i in range(1, len(document) + 1):
+            if document[-i] in strip_characters:
+                end_ind -= 1
+            else:
+                break
+        document_stripped = document[beg_ind:end_ind]
+        return document_stripped
+    @staticmethod
+    def get_words_from_document(
+        document, sentencepiece_model_tok, lower_case, strip_characters
+    ):
+        """Get words from a document. Non reversible since the document
+        is split on multiple characters, words are stripped of
+        special characters and characters are converted to lower case.
+        Useful to compute ratios, like the stopwords ratio."""
+        if sentencepiece_model_tok:
+            document_normalized = ModifyingDocuments.normalization(
+                document=document,
+                remove_non_printing_characters=True,
+                strip=True,
+                lower_case=True,
+                uniform_whitespace=True,
+                replace_digits_with_zeros=True,
+                replace_unicode_punctuation=True,
+            )
+            words = ModifyingDocuments.tokenization(
+                document_normalized, sentencepiece_model_tok, join_on_whitespace=False
+            )
+        else:
+            words = ModifyingDocuments.split_on_whitespace(
+                document, new_line=True, tab=True
+            )
+        if lower_case:
+            words = [word.lower() for word in words]
+        if strip_characters:
+            words = [ModifyingDocuments.strip(word, strip_characters) for word in words]
+            words = ModifyingDocuments.remove_empty_el_from_list(words)
+        return words
+    @staticmethod
+    def words_augmentation(words, group_size, join_char):
+        """Augment words, especially for Chinese (without a space between words)
+        and Vietnamese (with a space between syllables)."""
+        augmentation = [
+            join_char.join(words[i : i + group_size])
+            for i in range(len(words) - group_size + 1)
+        ]
+        return augmentation
+    @staticmethod
+    def split_on_newline_tab_whitespace(document):
+        """First split on "\n", then on "\t", then on " "."""
+        sentences = document.split("\n")
+        sentences = [sentence.split("\t") for sentence in sentences]
+        sentences = [
+            [
+                ModifyingDocuments.split_on_whitespace(subsentence)
+                for subsentence in sentence
+            ]
+            for sentence in sentences
+        ]
+        return sentences
+    @staticmethod
+    def merge_on_whitespace_tab_newline(sentences):
+        """Invert the method split_on_newline_tab_whitespace.
+        Removes concatenated separators."""
+        sentences = [
+            [" ".join(subsentence) for subsentence in sentence if subsentence]
+            for sentence in sentences
+        ]
+        sentences = ["\t".join(sentence) for sentence in sentences if sentence]
+        if not sentences:
+            return ""
+        document = "\n".join(sentences)
+        return document
+    @staticmethod
+    def should_keep_word_with_incorrect_substrings(
+        word, strip_characters, incorrect_word_substrings
+    ):
+        word = ModifyingDocuments.strip(word, strip_characters)
+        should_keep = all(
+            [(i_substr not in word) for i_substr in incorrect_word_substrings]
+        )
+        return should_keep
+    @staticmethod
+    def remove_words_with_incorrect_substrings(
+        document,
+        strip_characters,
+        incorrect_word_substrings,
+    ):
+        sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document)
+        sentences = [
+            [
+                [
+                    word
+                    for word in subsentence
+                    if ModifyingDocuments.should_keep_word_with_incorrect_substrings(
+                        word, strip_characters, incorrect_word_substrings
+                    )
+                ]
+                for subsentence in sentence
+            ]
+            for sentence in sentences
+        ]
+        document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences)
+        return document
+    @staticmethod
+    def should_keep_long_word(word, strip_characters, length_word_max_cutoff):
+        """If the word is too long but it contains only one
+        special character, it might be a concatenation of one word,
+        a punctuation, and another word, with no space between them.
+        In this case, we give the word a pass."""
+        if len(word) <= length_word_max_cutoff:
+            return True
+        word = ModifyingDocuments.strip(word, strip_characters)
+        if not word:  # The word consisted only of strip characters
+            return False
+        if len(word) <= length_word_max_cutoff:
+            return True
+        return False
+    def remove_long_words(
+        document,
+        strip_characters,
+        length_word_max_cutoff,
+    ):
+        sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document)
+        sentences = [
+            [
+                [
+                    word
+                    for word in subsentence
+                    if ModifyingDocuments.should_keep_long_word(
+                        word,
+                        strip_characters,
+                        length_word_max_cutoff,
+                    )
+                ]
+                for subsentence in sentence
+            ]
+            for sentence in sentences
+        ]
+        document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences)
+        return document
+    @staticmethod
+    def modifying_documents(
+        document,
+        cond_uniform_whitespace,
+        cond_replace_unicode_punctuation,
+        cond_remove_words_with_incorrect_substrings,
+        strip_characters,
+        incorrect_word_substrings,
+        cond_remove_long_words,
+        length_word_max_cutoff,
+    ):
+        document = ModifyingDocuments.normalization(
+            document=document,
+            remove_non_printing_characters=False,
+            strip=True,
+            lower_case=False,
+            uniform_whitespace=cond_uniform_whitespace,
+            replace_digits_with_zeros=False,
+            replace_unicode_punctuation=cond_replace_unicode_punctuation,
+        )
+        if cond_remove_words_with_incorrect_substrings:
+            document = ModifyingDocuments.remove_words_with_incorrect_substrings(
+                document,
+                strip_characters,
+                incorrect_word_substrings,
+            )
+        if cond_remove_long_words:
+            document = ModifyingDocuments.remove_long_words(
+                document,
+                strip_characters,
+                length_word_max_cutoff,
+            )
+        return document
+class FunctionDatasetModifyingDocuments:
+    def __init__(self, lang_dataset_id):
+        self.lang_dataset_id = lang_dataset_id
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+    def __call__(self, example):
+        example["text"] = ModifyingDocuments.modifying_documents(
+            document=example["text"],
+            cond_uniform_whitespace=self.param["cond_uniform_whitespace"],
+            cond_replace_unicode_punctuation=self.param[
+                "cond_replace_unicode_punctuation"
+            ],
+            cond_remove_words_with_incorrect_substrings=self.param[
+                "cond_remove_words_with_incorrect_substrings"
+            ],
+            strip_characters=self.param["strip_characters"],
+            incorrect_word_substrings=self.param["incorrect_word_substrings"],
+            cond_remove_long_words=self.param["cond_remove_long_words"],
+            length_word_max_cutoff=self.param["length_word_max_cutoff"],
+        )
+        return example
+    def __reduce__(self):
+        return (self.__class__, (self.lang_dataset_id,))
+class Filtering:
+    @staticmethod
+    def check_number_words(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        number_words_min_cutoff,
+        number_words_max_cutoff,
+    ):
+        words = ModifyingDocuments.get_words_from_document(
+            document,
+            sentencepiece_model_tok,
+            lower_case=False,
+            strip_characters=strip_characters,
+        )
+        cond = (len(words) >= number_words_min_cutoff) and (
+            len(words) <= number_words_max_cutoff
+        )
+        return cond
+    @staticmethod
+    def compute_repetitions_ratio(document, repetitions_length):
+        def get_freq_ngrams(document, n):
+            ngrams = [document[i : i + n] for i in range(len(document) - n + 1)]
+            freq_ngrams = {}
+            for ngram in ngrams:
+                freq_ngrams[ngram] = freq_ngrams.get(ngram, 0) + 1
+            return freq_ngrams
+        freq_ngrams = get_freq_ngrams(document, repetitions_length)
+        if len(freq_ngrams) == 0:
+            return 0
+        freq_ngrams = list(freq_ngrams.values())
+        freq_ngrams = sorted(freq_ngrams, reverse=True)
+        num_rep_ngrams = int(np.sqrt(len(freq_ngrams)))
+        repetitions_ratio = sum(freq_ngrams[:num_rep_ngrams]) / sum(freq_ngrams)
+        return repetitions_ratio
+    @staticmethod
+    def check_repetitions_removal(
+        document,
+        repetitions_length,
+        repetitions_max_cutoff,
+    ):
+        repetitions_ratio = Filtering.compute_repetitions_ratio(
+            document, repetitions_length
+        )
+        cond = repetitions_ratio <= repetitions_max_cutoff
+        return cond
+    @staticmethod
+    def compute_special_characters_ratio(document, special_characters):
+        special_characters_ratio = len(
+            [char for char in document if char in special_characters]
+        ) / len(document)
+        return special_characters_ratio
+    @staticmethod
+    def check_special_characters(
+        document,
+        special_characters,
+        special_characters_max_cutoff,
+    ):
+        special_characters_ratio = Filtering.compute_special_characters_ratio(
+            document, special_characters
+        )
+        cond = special_characters_ratio <= special_characters_max_cutoff
+        return cond
+    @staticmethod
+    def compute_stopwords_ratio(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        stopwords,
+    ):
+        words = ModifyingDocuments.get_words_from_document(
+            document,
+            sentencepiece_model_tok,
+            lower_case=True,
+            strip_characters=strip_characters,
+        )
+        if not words:
+            return 0
+        augmentation = []
+        if cond_words_augmentation:
+            augmentation = [
+                ModifyingDocuments.words_augmentation(
+                    words, group_size, words_augmentation_join_char
+                )
+                for group_size in words_augmentation_group_sizes
+            ]
+            augmentation = [word for augm in augmentation for word in augm]
+        stopwords_ratio = len(
+            [word for word in words + augmentation if word in stopwords]
+        ) / len(words)
+        if stopwords_ratio > 1.0:
+            stopwords_ratio = 1.0
+        return stopwords_ratio
+    @staticmethod
+    def check_stopwords(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        stopwords,
+        stopwords_min_cutoff,
+    ):
+        cond = True
+        if stopwords:
+            stopwords_ratio = Filtering.compute_stopwords_ratio(
+                document,
+                sentencepiece_model_tok,
+                strip_characters,
+                cond_words_augmentation,
+                words_augmentation_group_sizes,
+                words_augmentation_join_char,
+                stopwords,
+            )
+            cond = stopwords_ratio >= stopwords_min_cutoff
+        return cond
+    @staticmethod
+    def compute_badwords_ratio(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        badwords,
+    ):
+        words = ModifyingDocuments.get_words_from_document(
+            document,
+            sentencepiece_model_tok,
+            lower_case=True,
+            strip_characters=strip_characters,
+        )
+        if not words:
+            return 0
+        augmentation = []
+        if cond_words_augmentation:
+            augmentation = [
+                ModifyingDocuments.words_augmentation(
+                    words, group_size, words_augmentation_join_char
+                )
+                for group_size in words_augmentation_group_sizes
+            ]
+            augmentation = [word for augm in augmentation for word in augm]
+        badwords_ratio = len(
+            [word for word in words + augmentation if word in badwords]
+        ) / len(words)
+        if badwords_ratio > 1.0:
+            badwords_ratio = 1.0
+        for word in augmentation:
+            if word in badwords:
+                print(word)
+        return badwords_ratio
+    @staticmethod
+    def check_badwords(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        badwords,
+        badwords_max_cutoff,
+    ):
+        cond = True
+        if badwords:
+            badwords_ratio = Filtering.compute_badwords_ratio(
+                document,
+                sentencepiece_model_tok,
+                strip_characters,
+                cond_words_augmentation,
+                words_augmentation_group_sizes,
+                words_augmentation_join_char,
+                badwords,
+            )
+            cond = badwords_ratio <= badwords_max_cutoff
+        return cond
+    @staticmethod
+    def compute_lang_id_pred_score(document, model_lang_id):
+        document = document.lower().replace("\n", " ")
+        pred = model_lang_id.predict(document)
+        lang_pred_fasttext_id = pred[0][0].replace("__label__", "")
+        score_pred = pred[1][0]
+        lang_pred_dataset_id = langs_id.loc[
+            langs_id["fasttext_id"] == lang_pred_fasttext_id, "dataset_id"
+        ]
+        if len(lang_pred_dataset_id) > 0:
+            lang_pred_dataset_id = lang_pred_dataset_id.iloc[0]
+        else:
+            lang_pred_dataset_id = "unknown"
+        return lang_pred_dataset_id, score_pred
+    @staticmethod
+    def check_lang_id(
+        document,
+        lang_dataset_id,
+        model_lang_id,
+        lang_id_min_cutoff,
+    ):
+        cond = True
+        if model_lang_id:
+            lang_pred_dataset_id, score_pred = Filtering.compute_lang_id_pred_score(
+                document, model_lang_id
+            )
+            cond = (lang_pred_dataset_id == lang_dataset_id) and (
+                score_pred >= lang_id_min_cutoff
+            )
+        return cond
+    @staticmethod
+    def compute_perplexity_score(document, sentencepiece_model, kenlm_model):
+        document = ModifyingDocuments.normalization(
+            document=document,
+            remove_non_printing_characters=True,
+            strip=True,
+            lower_case=True,
+            uniform_whitespace=True,
+            replace_digits_with_zeros=True,
+            replace_unicode_punctuation=True,
+        )
+        document = ModifyingDocuments.tokenization(
+            document, sentencepiece_model, join_on_whitespace=True
+        )
+        doc_log_score, doc_length = 0, 0
+        for line in document.split("\n"):
+            log_score = kenlm_model.score(line)
+            length = len(line.split()) + 1
+            doc_log_score += log_score
+            doc_length += length
+        pp_score = 10.0 ** (-doc_log_score / doc_length)
+        pp_score = round(pp_score, 1)
+        return pp_score
+    @staticmethod
+    def check_perplexity(
+        document,
+        sentencepiece_model,
+        kenlm_model,
+        perplexity_max_cutoff,
+    ):
+        cond = True
+        if kenlm_model:
+            score = Filtering.compute_perplexity_score(
+                document, sentencepiece_model, kenlm_model
+            )
+            cond = score <= perplexity_max_cutoff
+        return cond
+    @staticmethod
+    def filtering(
+        document,
+        cond_check_number_words,
+        sentencepiece_model_tok,
+        strip_characters,
+        number_words_min_cutoff,
+        number_words_max_cutoff,
+        cond_check_repetitions_removal,
+        repetitions_length,
+        repetitions_max_cutoff,
+        cond_check_special_characters,
+        special_characters,
+        special_characters_max_cutoff,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        cond_check_stopwords,
+        stopwords,
+        stopwords_min_cutoff,
+        cond_check_badwords,
+        badwords,
+        badwords_max_cutoff,
+        cond_check_lang_id,
+        lang_dataset_id,
+        model_lang_id,
+        lang_id_min_cutoff,
+        cond_check_perplexity,
+        sentencepiece_model,
+        kenlm_model,
+        perplexity_max_cutoff,
+    ):
+        if cond_check_number_words:
+            if not Filtering.check_number_words(
+                document,
+                sentencepiece_model_tok,
+                strip_characters,
+                number_words_min_cutoff,
+                number_words_max_cutoff,
+            ):
+                return False
+        if cond_check_repetitions_removal:
+            if not Filtering.check_repetitions_removal(
+                document,
+                repetitions_length,
+                repetitions_max_cutoff,
+            ):
+                return False
+        if cond_check_special_characters:
+            if not Filtering.check_special_characters(
+                document,
+                special_characters,
+                special_characters_max_cutoff,
+            ):
+                return False
+        if cond_check_stopwords:
+            if not Filtering.check_stopwords(
+                document,
+                sentencepiece_model_tok,
+                strip_characters,
+                cond_words_augmentation,
+                words_augmentation_group_sizes,
+                words_augmentation_join_char,
+                stopwords,
+                stopwords_min_cutoff,
+            ):
+                return False
+        if cond_check_badwords:
+            if not Filtering.check_badwords(
+                document,
+                sentencepiece_model_tok,
+                strip_characters,
+                cond_words_augmentation,
+                words_augmentation_group_sizes,
+                words_augmentation_join_char,
+                badwords,
+                badwords_max_cutoff,
+            ):
+                return False
+        if cond_check_lang_id:
+            if not Filtering.check_lang_id(
+                document,
+                lang_dataset_id,
+                model_lang_id,
+                lang_id_min_cutoff,
+            ):
+                return False
+        if cond_check_perplexity:
+            if not Filtering.check_perplexity(
+                document,
+                sentencepiece_model,
+                kenlm_model,
+                perplexity_max_cutoff,
+            ):
+                return False
+        return True
+class FunctionDatasetFiltering:
+    def __init__(
+        self,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
+    ):
+        self.lang_dataset_id = lang_dataset_id
+        self.path_fasttext_model = path_fasttext_model
+        self.path_sentencepiece_model = path_sentencepiece_model
+        self.path_kenlm_model = path_kenlm_model
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.model_lang_id = LoadParameters.load_model_lang_id(
+            lang_dataset_id, path_fasttext_model
+        )
+        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
+            lang_dataset_id, path_sentencepiece_model
+        )
+        self.sentencepiece_model_tok = (
+            self.sentencepiece_model if self.param["tokenization"] else None
+        )
+        self.kenlm_model = LoadParameters.load_kenlm_model(
+            lang_dataset_id, path_kenlm_model
+        )
+    def __call__(self, example):
+        keep_example = Filtering.filtering(
+            document=example["text"],
+            cond_check_number_words=self.param["cond_check_number_words"],
+            sentencepiece_model_tok=self.sentencepiece_model_tok,
+            strip_characters=self.param["strip_characters"],
+            number_words_min_cutoff=self.param["number_words_min_cutoff"],
+            number_words_max_cutoff=self.param["number_words_max_cutoff"],
+            cond_check_repetitions_removal=self.param["check_repetitions_removal"],
+            repetitions_length=self.param["repetitions_length"],
+            repetitions_max_cutoff=self.param["repetitions_max_cutoff"],
+            cond_check_special_characters=self.param["cond_check_special_characters"],
+            special_characters=self.param["special_characters"],
+            special_characters_max_cutoff=self.param["special_characters_max_cutoff"],
+            cond_words_augmentation=self.param["cond_words_augmentation"],
+            words_augmentation_group_sizes=self.param["words_augmentation_group_sizes"],
+            words_augmentation_join_char=self.param["words_augmentation_join_char"],
+            cond_check_stopwords=self.param["cond_check_stopwords"],
+            stopwords=self.stopwords,
+            stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
+            cond_check_badwords=self.param["cond_check_badwords"],
+            badwords=self.badwords,
+            badwords_max_cutoff=self.param["badwords_max_cutoff"],
+            cond_check_lang_id=self.param["cond_check_lang_id"],
+            lang_dataset_id=self.lang_dataset_id,
+            model_lang_id=self.model_lang_id,
+            lang_id_min_cutoff=self.param["lang_id_min_cutoff"],
+            cond_check_perplexity=self.param["cond_check_perplexity"],
+            sentencepiece_model=self.sentencepiece_model,
+            kenlm_model=self.kenlm_model,
+            perplexity_max_cutoff=self.param["perplexity_max_cutoff"],
+        )
+        return keep_example
+    def __reduce__(self):
+        return (
+            self.__class__,
+            (
+                self.lang_dataset_id,
+                self.path_fasttext_model,
+                self.path_sentencepiece_model,
+                self.path_kenlm_model,
+            ),
+        )
+class DatasetFiltering:
+    def __init__(
+        self,
+        dataset,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
+        num_proc,
+        path_dir_save_dataset,
+    ):
+        self.ds = dataset
+        self.lang_dataset_id = lang_dataset_id
+        self.path_fasttext_model = path_fasttext_model
+        self.path_sentencepiece_model = path_sentencepiece_model
+        self.path_kenlm_model = path_kenlm_model
+        self.num_proc = num_proc
+        self.path_dir_save_dataset = path_dir_save_dataset
+    def modifying_documents(self):
+        dataset_modifying_documents = FunctionDatasetModifyingDocuments(
+            self.lang_dataset_id
+        )
+        self.ds = self.ds.map(dataset_modifying_documents, num_proc=self.num_proc)
+    def filtering(self):
+        func_dataset_filtering = FunctionDatasetFiltering(
+            self.lang_dataset_id,
+            self.path_fasttext_model,
+            self.path_sentencepiece_model,
+            self.path_kenlm_model,
+        )
+        self.ds = self.ds.filter(func_dataset_filtering, num_proc=self.num_proc)
+    def save_dataset(self):
+        pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
+        path_dir_save_dataset = pathlib.PurePath(
+            self.path_dir_save_dataset, self.lang_dataset_id
+        )
+        pathlib.Path(path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
+        self.ds.save_to_disk(path_dir_save_dataset)

languages_id.py ADDED Viewed

	@@ -0,0 +1,231 @@

+import pandas as pd
+langs_id = [
+    {
+        "lang": "Afrikaans",
+        "dataset_id": "af",
+        "stopwords_id": "af",
+        "badwords_id": None,
+        "fasttext_id": "af",
+        "sentencepiece_id": "af",
+        "kenlm_id": "af",
+    },
+    {
+        "lang": "Arabic",
+        "dataset_id": "ar",
+        "stopwords_id": "ar",
+        "badwords_id": "ar",
+        "fasttext_id": "ar",
+        "sentencepiece_id": "ar",
+        "kenlm_id": "ar",
+    },
+    {
+        "lang": "Egyptian Arabic",
+        "dataset_id": "arz",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "arz",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Assamese",
+        "dataset_id": "as",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "as",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Bengali",
+        "dataset_id": "bn",
+        "stopwords_id": "bn",
+        "badwords_id": None,
+        "fasttext_id": "bn",
+        "sentencepiece_id": "bn",
+        "kenlm_id": "bn",
+    },
+    {
+        "lang": "Catalan",
+        "dataset_id": "ca",
+        "stopwords_id": "ca",
+        "badwords_id": "ca",
+        "fasttext_id": "ca",
+        "sentencepiece_id": "ca",
+        "kenlm_id": "ca",
+    },
+    {
+        "lang": "English",
+        "dataset_id": "en",
+        "stopwords_id": "en",
+        "badwords_id": "en",
+        "fasttext_id": "en",
+        "sentencepiece_id": "en",
+        "kenlm_id": "en",
+    },
+    {
+        "lang": "Spanish",
+        "dataset_id": "es",
+        "stopwords_id": "es",
+        "badwords_id": "es",
+        "fasttext_id": "es",
+        "sentencepiece_id": "es",
+        "kenlm_id": "es",
+    },
+    {
+        "lang": "Basque",
+        "dataset_id": "eu",
+        "stopwords_id": "eu",
+        "badwords_id": "eu",
+        "fasttext_id": "eu",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "French",
+        "dataset_id": "fr",
+        "stopwords_id": "fr",
+        "badwords_id": "fr",
+        "fasttext_id": "fr",
+        "sentencepiece_id": "fr",
+        "kenlm_id": "fr",
+    },
+    {
+        "lang": "Gujarati",
+        "dataset_id": "gu",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "gu",
+        "sentencepiece_id": "gu",
+        "kenlm_id": "gu",
+    },
+    {
+        "lang": "Hindi",
+        "dataset_id": "hi",
+        "stopwords_id": "hi",
+        "badwords_id": "hi",
+        "fasttext_id": "hi",
+        "sentencepiece_id": "hi",
+        "kenlm_id": "hi",
+    },
+    {
+        "lang": "Indonesian",
+        "dataset_id": "id",
+        "stopwords_id": "id",
+        "badwords_id": "id",
+        "fasttext_id": "id",
+        "sentencepiece_id": "id",
+        "kenlm_id": "id",
+    },
+    {
+        "lang": "Kannada",
+        "dataset_id": "kn",
+        "stopwords_id": None,
+        "badwords_id": "kn",
+        "fasttext_id": "kn",
+        "sentencepiece_id": "kn",
+        "kenlm_id": "kn",
+    },
+    {
+        "lang": "Malayalam",
+        "dataset_id": "ml",
+        "stopwords_id": None,
+        "badwords_id": "ml",
+        "fasttext_id": "ml",
+        "sentencepiece_id": "ml",
+        "kenlm_id": "ml",
+    },
+    {
+        "lang": "Marathi",
+        "dataset_id": "mr",
+        "stopwords_id": "mr",
+        "badwords_id": "mr",
+        "fasttext_id": "mr",
+        "sentencepiece_id": "mr",
+        "kenlm_id": "mr",
+    },
+    {
+        "lang": "Portuguese",
+        "dataset_id": "pt",
+        "stopwords_id": "pt",
+        "badwords_id": "pt",
+        "fasttext_id": "pt",
+        "sentencepiece_id": "pt",
+        "kenlm_id": "pt",
+    },
+    {
+        "lang": "Somali",
+        "dataset_id": "so",
+        "stopwords_id": "so",
+        "badwords_id": None,
+        "fasttext_id": "so",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Swahili",
+        "dataset_id": "sw",
+        "stopwords_id": "sw",
+        "badwords_id": None,
+        "fasttext_id": "sw",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Tamil",
+        "dataset_id": "ta",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "ta",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Telugu",
+        "dataset_id": "te",
+        "stopwords_id": None,
+        "badwords_id": "te",
+        "fasttext_id": "te",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Urdu",
+        "dataset_id": "ur",
+        "stopwords_id": "ur",
+        "badwords_id": None,
+        "fasttext_id": "ur",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Vietnamese",
+        "dataset_id": "vi",
+        "stopwords_id": "vi",
+        "badwords_id": "vi",
+        "fasttext_id": "vi",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Yoruba",
+        "dataset_id": "yo",
+        "stopwords_id": "yo",
+        "badwords_id": None,
+        "fasttext_id": "yo",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Chinese",
+        "dataset_id": "zh",
+        "stopwords_id": "zh",
+        "badwords_id": "zh",
+        "fasttext_id": "zh",
+        "sentencepiece_id": "zh",
+        "kenlm_id": "zh",
+    },
+]
+langs_id = pd.DataFrame(langs_id)

lid.176.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
+size 131266198

normalization.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import re
+from typing import Dict
+non_printing_characters_re = re.compile(
+    f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
+)
+digits_re: re.Pattern = re.compile(r"\d")
+# Replacement table: each key is a single Unicode character and each value is
+# the ASCII text substituted for it. Some replacements are longer than one
+# character (for instance the em dash becomes " - ").
+unicode_punctuation: Dict[str, str] = {
+    "，": ",",
+    "。": ".",
+    "、": ",",
+    "„": '"',
+    "”": '"',
+    "“": '"',
+    "«": '"',
+    "»": '"',
+    # NOTE(review): fullwidth digit one mapped to a double quote - looks
+    # unintended; confirm before changing, since existing models may rely on it.
+    "１": '"',
+    "」": '"',
+    "「": '"',
+    "《": '"',
+    "》": '"',
+    "´": "'",
+    "∶": ":",
+    "：": ":",
+    "？": "?",
+    "！": "!",
+    "（": "(",
+    "）": ")",
+    "；": ";",
+    "–": "-",
+    "—": " - ",
+    "．": ". ",
+    "～": "~",
+    "’": "'",
+    "…": "...",
+    "━": "-",
+    "〈": "<",
+    "〉": ">",
+    "【": "[",
+    "】": "]",
+    "％": "%",
+    "►": "-",
+}
+normalization = {
+    "non_printing_characters_re": non_printing_characters_re,
+    "digits_re": digits_re,
+    "unicode_punctuation": unicode_punctuation,
+}

requirements.txt → packages.txt RENAMED Viewed

File without changes

parameters_filtering.py ADDED Viewed

	@@ -0,0 +1,852 @@

+import string
+import emoji
+# ASCII punctuation, digits and whitespace, concatenated into one string.
+main_special_characters = string.punctuation + string.digits + string.whitespace
+other_special_characters = (
+    "    　    ’“”–ー一▬…✦�£•€«»°·═"
+    "×士＾˘⇓↓↑←→（）§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃，ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
+    "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン：∼⁄・♡✓⊕․．⋅÷１‟；،、¨ाাी्े◦˚"
+    "゜ʼ≖ʼ¤ッツシ℃√！【】‿∞➤～πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬x？▷Г♫∟™ª₪®「—"
+    "❖」﴾》"
+)
+emoji = list(emoji.UNICODE_EMOJI["en"].keys())
+special_characters_default = set(main_special_characters + other_special_characters)
+special_characters_default.update(emoji)
+parameters_filtering_default = {
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
+    "cond_remove_long_words": False,
+    "length_word_max_cutoff": 50,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.4,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": False,
+    "stopwords_min_cutoff": 0,
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.70,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,
+}
+parameters_filtering_af = {
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 25,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.3,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.6,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 3000000,
+}
+parameters_filtering_ar = {
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 25,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.45,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 1000000,
+}
+parameters_filtering_arz = {
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 25,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.5,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,
+}
+parameters_filtering_as = {  # Preset stored under the "as" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 25,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.25,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_bn = {  # Preset stored under the "bn" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.275,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0.05,  # presumably a minimum stopword ratio; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 575000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_ca = {  # Preset stored under the "ca" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.35,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 1750000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_en = {  # Preset stored under the "en" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": True,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # the flag above is True for this preset
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 25,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 20,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.4,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0.3,  # presumably a minimum stopword ratio; confirm
+    "cond_check_badwords": True,
+    "badwords_max_cutoff": 0.045,  # cond_check_badwords is True for this preset
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.80,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 2500,  # NOTE(review): far below the 250000-3000000 used by neighbouring presets; presumably a different score scale, confirm
+}
+parameters_filtering_es = {  # Preset stored under the "es" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.3,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0.2,  # presumably a minimum stopword ratio; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 2500000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_eu = {  # Preset stored under the "eu" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 35,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.3,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_fr = {  # Preset stored under the "fr" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.35,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0.15,  # presumably a minimum stopword ratio; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 3000000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_gu = {  # Preset stored under the "gu" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.3,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 250000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_hi = {  # Preset stored under the "hi" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 25,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.35,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 600000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_id = {  # Preset stored under the "id" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.25,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0.25,  # presumably a minimum stopword ratio; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 2500000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_kn = {  # Preset stored under the "kn" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 50,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.25,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 400000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_ml = {  # Preset stored under the "ml" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 50,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.2,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 1600000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_mr = {  # Preset stored under the "mr" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.25,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 425000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_pt = {  # Preset stored under the "pt" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.3,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0.15,  # presumably a minimum stopword ratio; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": True,
+    "perplexity_max_cutoff": 3000000,  # presumably documents scoring above this are dropped; confirm
+}
+parameters_filtering_so = {  # Preset stored under the "so" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": False,
+    "length_word_max_cutoff": 1000,  # presumably unused while cond_remove_long_words is False; confirm
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.3,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": False,
+    "stopwords_min_cutoff": 0,  # presumably unused while cond_check_stopwords is False; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_sw = {  # Preset stored under the "sw" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.275,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_ta = {  # Preset stored under the "ta" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 50,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.25,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_te = {  # Preset stored under the "te" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 35,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.25,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_ur = {  # Preset stored under the "ur" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.4,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_vi = {  # Preset stored under the "vi" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.35,
+    "cond_words_augmentation": True,
+    "words_augmentation_group_sizes": [2, 3],  # presumably sizes of adjacent-word groups to join; confirm
+    "words_augmentation_join_char": " ",  # a single space
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_yo = {  # Preset stored under the "yo" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": True,
+    "length_word_max_cutoff": 30,
+    "cond_check_number_words": True,
+    "tokenization": False,
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.3,
+    "cond_words_augmentation": False,
+    "words_augmentation_group_sizes": [],  # empty; cond_words_augmentation is False
+    "words_augmentation_join_char": "",
+    "cond_check_stopwords": True,
+    "stopwords_min_cutoff": 0,  # presumably a minimum ratio, so 0 rejects nothing; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering_zh = {  # Preset stored under the "zh" key of parameters_filtering.
+    "cond_uniform_whitespace": True,
+    "cond_replace_unicode_punctuation": False,
+    "cond_remove_words_with_incorrect_substrings": False,
+    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],  # presumably unused while the flag above is False; confirm
+    "cond_remove_long_words": False,
+    "length_word_max_cutoff": 1000,  # presumably unused while cond_remove_long_words is False; confirm
+    "cond_check_number_words": True,
+    "tokenization": True,  # True here; False in the other presets visible in this part of the file
+    "strip_characters": special_characters_default,  # same object as "special_characters" below
+    "number_words_min_cutoff": 1,
+    "number_words_max_cutoff": 100000,
+    "check_repetitions_removal": True,  # NOTE(review): named "check_..." while the other check flags use "cond_check_..."
+    "repetitions_length": 10,
+    "repetitions_max_cutoff": 0.106,
+    "cond_check_special_characters": True,
+    "special_characters": special_characters_default,
+    "special_characters_max_cutoff": 0.4,
+    "cond_words_augmentation": True,
+    "words_augmentation_group_sizes": [2, 3],  # presumably sizes of adjacent-word groups to join; confirm
+    "words_augmentation_join_char": "",  # empty string, i.e. no separator character
+    "cond_check_stopwords": False,
+    "stopwords_min_cutoff": 0,  # presumably unused while cond_check_stopwords is False; confirm
+    "cond_check_badwords": False,
+    "badwords_max_cutoff": 0.2,  # presumably unused while cond_check_badwords is False; confirm
+    "cond_check_lang_id": True,
+    "lang_id_min_cutoff": 0.75,
+    "cond_check_perplexity": False,
+    "perplexity_max_cutoff": 3000000,  # presumably unused while cond_check_perplexity is False; confirm
+}
+parameters_filtering = {  # Registry mapping a short key to its preset dict; keys are presumably language codes (confirm against languages_id.py).
+    "default": parameters_filtering_default,  # presumably the fallback for keys not listed here; confirm in the caller
+    "af": parameters_filtering_af,
+    "ar": parameters_filtering_ar,
+    "arz": parameters_filtering_arz,
+    "as": parameters_filtering_as,
+    "bn": parameters_filtering_bn,
+    "ca": parameters_filtering_ca,
+    "en": parameters_filtering_en,
+    "es": parameters_filtering_es,
+    "eu": parameters_filtering_eu,
+    "fr": parameters_filtering_fr,
+    "gu": parameters_filtering_gu,
+    "hi": parameters_filtering_hi,
+    "id": parameters_filtering_id,
+    "kn": parameters_filtering_kn,
+    "ml": parameters_filtering_ml,
+    "mr": parameters_filtering_mr,
+    "pt": parameters_filtering_pt,
+    "so": parameters_filtering_so,
+    "sw": parameters_filtering_sw,
+    "ta": parameters_filtering_ta,
+    "te": parameters_filtering_te,
+    "ur": parameters_filtering_ur,
+    "vi": parameters_filtering_vi,
+    "yo": parameters_filtering_yo,
+    "zh": parameters_filtering_zh,
+}

stopwords.py ADDED Viewed

The diff for this file is too large to render. See raw diff