HugoLaurencon HF staff committed on
Commit
f217a73
1 Parent(s): bfbcd60

rename badwords to flagged words + new flagged words list of 68 words

Browse files
app.py CHANGED
@@ -7,6 +7,7 @@ import os
7
  import base64
8
  import json
9
  import pandas as pd
 
10
  pd.options.mode.chained_assignment = None
11
 
12
  import numpy as np
@@ -40,7 +41,7 @@ class Visualization:
40
  self.lang_dataset_id = lang_dataset_id
41
  self.param = LoadParameters.load_parameters(lang_dataset_id)
42
  self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
43
- self.badwords = LoadParameters.load_badwords(lang_dataset_id)
44
  self.model_lang_id = LoadParameters.load_model_lang_id(
45
  lang_dataset_id, path_fasttext_model
46
  )
@@ -222,16 +223,16 @@ class Visualization:
222
  print_discared_by_cond(cond)
223
  conds["stopwords_ratio"] = [cond]
224
 
225
- if "badwords_ratio" in columns:
226
- cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
227
- cutoff_badwords_ratio = st.sidebar.slider(
228
  cutoff_def, 0.0, 1.0, 1.0, step=0.01
229
  )
230
- new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
231
  keys.append(new_key)
232
  cond = get_cond(new_key[0], new_key[1], new_key[2])
233
  print_discared_by_cond(cond)
234
- conds["badwords_ratio"] = [cond]
235
 
236
  if "lang_id_score" in columns:
237
  cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
@@ -316,11 +317,11 @@ class Visualization:
316
  "Discarded documents for the filter on the stop words ratio",
317
  )
318
 
319
- if "badwords_ratio" in columns:
320
- cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
321
  display_dataset(
322
  cond_filter,
323
- "Discarded documents for the filter on the bad words ratio",
324
  )
325
 
326
  if "lang_id_score" in columns:
@@ -504,19 +505,19 @@ class Visualization:
504
  if is_doc_discarded(key, stopwords_ratio):
505
  is_discarded = True
506
 
507
- elif key[0] == "badwords_ratio":
508
- badwords_ratio = Filtering.compute_badwords_ratio(
509
  personal_doc,
510
  self.sentencepiece_model_tok,
511
  self.param["strip_characters"],
512
  self.param["cond_words_augmentation"],
513
  self.param["words_augmentation_group_sizes"],
514
  self.param["words_augmentation_join_char"],
515
- self.badwords,
516
  )
517
- badwords_ratio = round(badwords_ratio, 3)
518
- st.markdown(f"Flagged words ratio: {badwords_ratio}")
519
- if is_doc_discarded(key, badwords_ratio):
520
  is_discarded = True
521
 
522
  elif key[0] == "lang_id_score":
@@ -530,7 +531,7 @@ class Visualization:
530
  st.markdown(
531
  f"Language identification confidence score: {lang_id_score}"
532
  )
533
- if is_doc_discarded(key, badwords_ratio) or (
534
  self.lang_dataset_id != lang_pred_dataset_id
535
  ):
536
  is_discarded = True
7
  import base64
8
  import json
9
  import pandas as pd
10
+
11
  pd.options.mode.chained_assignment = None
12
 
13
  import numpy as np
41
  self.lang_dataset_id = lang_dataset_id
42
  self.param = LoadParameters.load_parameters(lang_dataset_id)
43
  self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
44
+ self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
45
  self.model_lang_id = LoadParameters.load_model_lang_id(
46
  lang_dataset_id, path_fasttext_model
47
  )
223
  print_discared_by_cond(cond)
224
  conds["stopwords_ratio"] = [cond]
225
 
226
+ if "flagged_words_ratio" in columns:
227
+ cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
228
+ cutoff_flagged_words_ratio = st.sidebar.slider(
229
  cutoff_def, 0.0, 1.0, 1.0, step=0.01
230
  )
231
+ new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
232
  keys.append(new_key)
233
  cond = get_cond(new_key[0], new_key[1], new_key[2])
234
  print_discared_by_cond(cond)
235
+ conds["flagged_words_ratio"] = [cond]
236
 
237
  if "lang_id_score" in columns:
238
  cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
317
  "Discarded documents for the filter on the stop words ratio",
318
  )
319
 
320
+ if "flagged_words_ratio" in columns:
321
+ cond_filter = np.invert(np.all(conds["flagged_words_ratio"], axis=0))
322
  display_dataset(
323
  cond_filter,
324
+ "Discarded documents for the filter on the flagged words ratio",
325
  )
326
 
327
  if "lang_id_score" in columns:
505
  if is_doc_discarded(key, stopwords_ratio):
506
  is_discarded = True
507
 
508
+ elif key[0] == "flagged_words_ratio":
509
+ flagged_words_ratio = Filtering.compute_flagged_words_ratio(
510
  personal_doc,
511
  self.sentencepiece_model_tok,
512
  self.param["strip_characters"],
513
  self.param["cond_words_augmentation"],
514
  self.param["words_augmentation_group_sizes"],
515
  self.param["words_augmentation_join_char"],
516
+ self.flagged_words,
517
  )
518
+ flagged_words_ratio = round(flagged_words_ratio, 3)
519
+ st.markdown(f"Flagged words ratio: {flagged_words_ratio}")
520
+ if is_doc_discarded(key, flagged_words_ratio):
521
  is_discarded = True
522
 
523
  elif key[0] == "lang_id_score":
531
  st.markdown(
532
  f"Language identification confidence score: {lang_id_score}"
533
  )
534
+ if is_doc_discarded(key, flagged_words_ratio) or (
535
  self.lang_dataset_id != lang_pred_dataset_id
536
  ):
537
  is_discarded = True
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00106fc2a9d51bbc78ce1ca2d05f2f402bf927a1f741f6c092b3f17cb9f16801
3
- size 237353442
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffbb8afeba42822e4b10341112999321e0e14a19a5eeebc342dc68a9f65d3c7f
3
+ size 237426014
explanation_filtering_pipeline.pdf CHANGED
Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ
filtering.py CHANGED
@@ -13,7 +13,7 @@ from languages_id import langs_id
13
  from parameters_filtering import parameters_filtering
14
  from normalization import normalization
15
  from stopwords import stopwords
16
- from badwords import badwords
17
 
18
 
19
  class LoadParameters:
@@ -37,15 +37,15 @@ class LoadParameters:
37
  return stopwords_lang
38
 
39
  @staticmethod
40
- def load_badwords(lang_dataset_id):
41
- badwords_lang_id = langs_id.loc[
42
- langs_id["dataset_id"] == lang_dataset_id, "badwords_id"
43
  ].iloc[0]
44
- if badwords_lang_id:
45
- badwords_lang = set(badwords[badwords_lang_id])
46
  else:
47
- badwords_lang = None
48
- return badwords_lang
49
 
50
  @staticmethod
51
  def load_model_lang_id(lang_dataset_id, path_fasttext_model):
@@ -533,14 +533,14 @@ class Filtering:
533
  return cond
534
 
535
  @staticmethod
536
- def compute_badwords_ratio(
537
  document,
538
  sentencepiece_model_tok,
539
  strip_characters,
540
  cond_words_augmentation,
541
  words_augmentation_group_sizes,
542
  words_augmentation_join_char,
543
- badwords,
544
  ):
545
  words = ModifyingDocuments.get_words_from_document(
546
  document,
@@ -559,39 +559,36 @@ class Filtering:
559
  for group_size in words_augmentation_group_sizes
560
  ]
561
  augmentation = [word for augm in augmentation for word in augm]
562
- badwords_ratio = len(
563
- [word for word in words + augmentation if word in badwords]
564
  ) / len(words)
565
- if badwords_ratio > 1.0:
566
- badwords_ratio = 1.0
567
- for word in augmentation:
568
- if word in badwords:
569
- print(word)
570
- return badwords_ratio
571
 
572
  @staticmethod
573
- def check_badwords(
574
  document,
575
  sentencepiece_model_tok,
576
  strip_characters,
577
  cond_words_augmentation,
578
  words_augmentation_group_sizes,
579
  words_augmentation_join_char,
580
- badwords,
581
- badwords_max_cutoff,
582
  ):
583
  cond = True
584
- if badwords:
585
- badwords_ratio = Filtering.compute_badwords_ratio(
586
  document,
587
  sentencepiece_model_tok,
588
  strip_characters,
589
  cond_words_augmentation,
590
  words_augmentation_group_sizes,
591
  words_augmentation_join_char,
592
- badwords,
593
  )
594
- cond = badwords_ratio <= badwords_max_cutoff
595
  return cond
596
 
597
  @staticmethod
@@ -685,9 +682,9 @@ class Filtering:
685
  cond_check_stopwords,
686
  stopwords,
687
  stopwords_min_cutoff,
688
- cond_check_badwords,
689
- badwords,
690
- badwords_max_cutoff,
691
  cond_check_lang_id,
692
  lang_dataset_id,
693
  model_lang_id,
@@ -732,16 +729,16 @@ class Filtering:
732
  stopwords_min_cutoff,
733
  ):
734
  return False
735
- if cond_check_badwords:
736
- if not Filtering.check_badwords(
737
  document,
738
  sentencepiece_model_tok,
739
  strip_characters,
740
  cond_words_augmentation,
741
  words_augmentation_group_sizes,
742
  words_augmentation_join_char,
743
- badwords,
744
- badwords_max_cutoff,
745
  ):
746
  return False
747
  if cond_check_lang_id:
@@ -778,7 +775,7 @@ class FunctionDatasetFiltering:
778
 
779
  self.param = LoadParameters.load_parameters(lang_dataset_id)
780
  self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
781
- self.badwords = LoadParameters.load_badwords(lang_dataset_id)
782
  self.model_lang_id = LoadParameters.load_model_lang_id(
783
  lang_dataset_id, path_fasttext_model
784
  )
@@ -812,9 +809,9 @@ class FunctionDatasetFiltering:
812
  cond_check_stopwords=self.param["cond_check_stopwords"],
813
  stopwords=self.stopwords,
814
  stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
815
- cond_check_badwords=self.param["cond_check_badwords"],
816
- badwords=self.badwords,
817
- badwords_max_cutoff=self.param["badwords_max_cutoff"],
818
  cond_check_lang_id=self.param["cond_check_lang_id"],
819
  lang_dataset_id=self.lang_dataset_id,
820
  model_lang_id=self.model_lang_id,
13
  from parameters_filtering import parameters_filtering
14
  from normalization import normalization
15
  from stopwords import stopwords
16
+ from flagged_words import flagged_words
17
 
18
 
19
  class LoadParameters:
37
  return stopwords_lang
38
 
39
  @staticmethod
40
+ def load_flagged_words(lang_dataset_id):
41
+ flagged_words_lang_id = langs_id.loc[
42
+ langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id"
43
  ].iloc[0]
44
+ if flagged_words_lang_id:
45
+ flagged_words_lang = set(flagged_words[flagged_words_lang_id])
46
  else:
47
+ flagged_words_lang = None
48
+ return flagged_words_lang
49
 
50
  @staticmethod
51
  def load_model_lang_id(lang_dataset_id, path_fasttext_model):
533
  return cond
534
 
535
  @staticmethod
536
+ def compute_flagged_words_ratio(
537
  document,
538
  sentencepiece_model_tok,
539
  strip_characters,
540
  cond_words_augmentation,
541
  words_augmentation_group_sizes,
542
  words_augmentation_join_char,
543
+ flagged_words,
544
  ):
545
  words = ModifyingDocuments.get_words_from_document(
546
  document,
559
  for group_size in words_augmentation_group_sizes
560
  ]
561
  augmentation = [word for augm in augmentation for word in augm]
562
+ flagged_words_ratio = len(
563
+ [word for word in words + augmentation if word in flagged_words]
564
  ) / len(words)
565
+ if flagged_words_ratio > 1.0:
566
+ flagged_words_ratio = 1.0
567
+ return flagged_words_ratio
 
 
 
568
 
569
  @staticmethod
570
+ def check_flagged_words(
571
  document,
572
  sentencepiece_model_tok,
573
  strip_characters,
574
  cond_words_augmentation,
575
  words_augmentation_group_sizes,
576
  words_augmentation_join_char,
577
+ flagged_words,
578
+ flagged_words_max_cutoff,
579
  ):
580
  cond = True
581
+ if flagged_words:
582
+ flagged_words_ratio = Filtering.compute_flagged_words_ratio(
583
  document,
584
  sentencepiece_model_tok,
585
  strip_characters,
586
  cond_words_augmentation,
587
  words_augmentation_group_sizes,
588
  words_augmentation_join_char,
589
+ flagged_words,
590
  )
591
+ cond = flagged_words_ratio <= flagged_words_max_cutoff
592
  return cond
593
 
594
  @staticmethod
682
  cond_check_stopwords,
683
  stopwords,
684
  stopwords_min_cutoff,
685
+ cond_check_flagged_words,
686
+ flagged_words,
687
+ flagged_words_max_cutoff,
688
  cond_check_lang_id,
689
  lang_dataset_id,
690
  model_lang_id,
729
  stopwords_min_cutoff,
730
  ):
731
  return False
732
+ if cond_check_flagged_words:
733
+ if not Filtering.check_flagged_words(
734
  document,
735
  sentencepiece_model_tok,
736
  strip_characters,
737
  cond_words_augmentation,
738
  words_augmentation_group_sizes,
739
  words_augmentation_join_char,
740
+ flagged_words,
741
+ flagged_words_max_cutoff,
742
  ):
743
  return False
744
  if cond_check_lang_id:
775
 
776
  self.param = LoadParameters.load_parameters(lang_dataset_id)
777
  self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
778
+ self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
779
  self.model_lang_id = LoadParameters.load_model_lang_id(
780
  lang_dataset_id, path_fasttext_model
781
  )
809
  cond_check_stopwords=self.param["cond_check_stopwords"],
810
  stopwords=self.stopwords,
811
  stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
812
+ cond_check_flagged_words=self.param["cond_check_flagged_words"],
813
+ flagged_words=self.flagged_words,
814
+ flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"],
815
  cond_check_lang_id=self.param["cond_check_lang_id"],
816
  lang_dataset_id=self.lang_dataset_id,
817
  model_lang_id=self.model_lang_id,
badwords.py → flagged_words.py RENAMED
@@ -6,89 +6,21 @@
6
  # https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
7
 
8
 
9
- english_badwords = [
10
- "abuse",
11
  "anal",
12
- "anilingus",
13
- "anus",
14
- "aroused",
15
- "arse",
16
- "arsehole",
17
- "ass",
18
- "asses",
19
- "assfuck",
20
- "asshat",
21
- "asshole",
22
- "assholes",
23
- "autoerotic",
24
- "bangbros",
25
- "banging",
26
  "bareback",
27
- "bastard",
28
- "bastards",
29
- "bazongas",
30
  "bbw",
31
  "bdsm",
32
- "biatch",
33
- "bicurious",
34
- "bigass",
35
- "bigtits",
36
- "bimbo",
37
- "bimbos",
38
- "bitch",
39
- "bitches",
40
- "bitching",
41
  "blowjob",
42
  "blowjobs",
43
- "boche",
44
- "boner",
45
- "boners",
46
- "boob",
47
- "boobies",
48
- "boobs",
49
- "booty",
50
- "brothel",
51
- "buceta",
52
- "bugger",
53
- "buggered",
54
- "buggery",
55
  "bukkake",
56
- "bule",
57
- "buttcheeks",
58
- "buttfuck",
59
- "butthead",
60
- "butthole",
61
- "buttplug",
62
- "cameltoe",
63
  "camgirl",
64
  "camwhore",
65
- "chink",
66
- "chinks",
67
- "cialis",
68
- "clit",
69
- "clitoris",
70
- "clits",
71
- "clitty",
72
- "clusterfuck",
73
- "cock",
74
- "cock-head",
75
- "cockblock",
76
- "cockfight",
77
- "cockhead",
78
- "cocks",
79
- "cocksman",
80
- "cocksucker",
81
  "cocksucking",
82
- "coital",
83
- "coitus",
84
- "coochie",
85
- "cooly",
86
- "coon",
87
- "coons",
88
- "copulate",
89
- "cowgirl",
90
- "crabs",
91
  "creampie",
 
92
  "cum",
93
  "cumming",
94
  "cums",
@@ -96,399 +28,58 @@ english_badwords = [
96
  "cumshots",
97
  "cumslut",
98
  "cunnilingus",
99
- "cunny",
100
- "cunt",
101
- "cunts",
102
- "cybersex",
103
- "darkey",
104
- "darkie",
105
- "darkies",
106
- "darky",
107
  "deepthroat",
108
  "deepthroating",
109
- "dick",
110
- "dickhole",
111
- "dicks",
112
  "dildo",
113
  "dildos",
114
  "dogging",
115
- "doggy-style",
116
  "doggystyle",
117
  "dominatrix",
118
- "dommes",
119
- "dong",
120
- "dp",
121
- "dupa",
122
- "dyke",
123
- "dykes",
124
- "ecchi",
125
- "ejaculate",
126
- "ejaculated",
127
- "ejaculates",
128
- "ejaculating",
129
- "ejaculation",
130
- "ejaculations",
131
- "enema",
132
- "erect",
133
- "erection",
134
- "ero",
135
  "erotic",
136
- "erotism",
137
- "escort",
138
- "fag",
139
- "fagging",
140
- "faggot",
141
- "fagot",
142
- "fagots",
143
- "fags",
144
- "felch",
145
- "fellate",
146
  "fellatio",
147
  "femdom",
148
- "fetish",
149
- "figging",
150
- "fingerbang",
151
  "fingering",
152
- "fisted",
153
- "fister",
154
  "fisting",
155
- "floozy",
156
- "fondle",
157
- "footfetish",
158
  "footjob",
159
- "foreskin",
160
- "fornicate",
161
- "foursome",
162
- "fuck",
163
- "fuckable",
164
- "fuckbook",
165
- "fuckboy",
166
- "fuckbuddy",
167
- "fucked",
168
- "fucker",
169
- "fuckers",
170
- "fuckfest",
171
- "fuckhole",
172
- "fuckin",
173
- "fucking",
174
- "fucks",
175
- "fuk",
176
- "fukin",
177
- "fuking",
178
- "g-spot",
179
  "gangbang",
180
- "gangbanged",
181
- "gangbanger",
182
- "gangbangs",
183
- "genital",
184
- "genitals",
185
- "gigolo",
186
- "glans",
187
- "gonad",
188
- "gonads",
189
- "gook",
190
- "gringo",
191
- "gringos",
192
- "grope",
193
- "gspot",
194
- "guido",
195
  "handjob",
196
- "haole",
197
- "hapa",
198
- "hardcore",
199
- "hardon",
200
- "harem",
201
  "hentai",
202
- "hindoo",
203
- "hoe",
204
- "hoes",
205
- "honky",
206
- "hooker",
207
- "hookers",
208
- "hooter",
209
- "hooters",
210
- "hori",
211
- "horndog",
212
  "horney",
213
  "horniest",
214
  "horny",
215
- "humped",
216
- "humper",
217
- "humping",
218
- "hussy",
219
- "hymen",
220
- "ikey",
221
- "incest",
222
- "injun",
223
- "intercourse",
224
- "interracial",
225
- "jack-off",
226
- "jackoff",
227
- "jailbait",
228
- "jerk-off",
229
- "jerkoff",
230
- "jiggy",
231
  "jism",
232
  "jizz",
233
- "jizzed",
234
- "kaffir",
235
- "kafir",
236
- "kike",
237
- "kikes",
238
- "kinkster",
239
- "kinky",
240
- "kkk",
241
- "klan",
242
- "kraut",
243
- "labia",
244
- "lapdance",
245
- "libido",
246
- "licker",
247
- "licking",
248
- "limey",
249
- "lingerie",
250
- "livesex",
251
- "lolita",
252
- "lovemaking",
253
- "lust",
254
- "lusting",
255
- "masochist",
256
- "masterbate",
257
  "masterbating",
258
- "masterbation",
259
  "masturbate",
260
  "masturbating",
261
  "masturbation",
262
  "milf",
263
- "minge",
264
- "missionary",
265
- "molest",
266
- "molestation",
267
- "molester",
268
- "munging",
269
- "muschi",
270
- "nads",
271
- "naked",
272
- "necked",
273
- "necro",
274
- "negress",
275
- "negro",
276
- "negroes",
277
- "negroid",
278
- "negros",
279
- "nig",
280
- "nigar",
281
- "nigga",
282
- "niggas",
283
- "niggaz",
284
- "nigger",
285
- "niggers",
286
- "nigra",
287
- "nipple",
288
- "nipples",
289
- "nookie",
290
- "nooky",
291
- "nooner",
292
- "nude",
293
- "nudie",
294
- "nudity",
295
- "nymph",
296
- "nympho",
297
- "nymphomania",
298
- "orgasim",
299
- "orgasm",
300
- "orgasms",
301
  "orgies",
302
  "orgy",
303
- "orifice",
304
- "p0rn",
305
- "paedophile",
306
- "pantie",
307
- "panties",
308
- "panty",
309
- "pastie",
310
- "pecker",
311
- "pedo",
312
- "pedophile",
313
- "pedophilia",
314
- "pedophiliac",
315
- "peeper",
316
- "peepshow",
317
  "pegging",
318
- "penetrate",
319
- "penetration",
320
- "penile",
321
- "penis",
322
- "penises",
323
- "penus",
324
- "perv",
325
- "phallic",
326
- "phonesex",
327
- "pickaninnies",
328
- "pimp",
329
- "playboy",
330
- "playgirl",
331
- "poontang",
332
  "porn",
 
333
  "porno",
334
- "pornography",
335
  "pornos",
336
- "pr0n",
337
- "premature",
338
- "preteen",
339
- "pron",
340
- "prostitute",
341
- "pube",
342
- "pubes",
343
- "pubic",
344
- "pubis",
345
- "punani",
346
- "pussies",
347
- "pussy",
348
- "pussys",
349
- "pusy",
350
- "puta",
351
- "puto",
352
- "queef",
353
- "quickie",
354
- "quicky",
355
- "quim",
356
- "randy",
357
- "rape",
358
- "raped",
359
- "raper",
360
- "raping",
361
- "rapist",
362
- "rectum",
363
- "redneck",
364
- "rednecks",
365
- "redskin",
366
- "redskins",
367
- "rimjob",
368
  "rimming",
369
- "russki",
370
- "s&m",
371
- "sadism",
372
- "sadist",
373
- "sambo",
374
- "santorum",
375
- "schlong",
376
- "scissoring",
377
- "semen",
378
- "sex",
379
- "sexed",
380
- "sexi",
381
- "sexing",
382
- "sexo",
383
- "sexpot",
384
- "sextoy",
385
- "sexual",
386
- "sexually",
387
- "sexx",
388
- "sexxx",
389
- "sexxxy",
390
- "sexxy",
391
- "sexy",
392
- "sh!t",
393
- "sh1t",
394
- "shagging",
395
- "shemale",
396
- "sissy",
397
- "skank",
398
- "skanks",
399
- "slapper",
400
- "slut",
401
- "sluts",
402
- "slutting",
403
  "slutty",
404
- "smut",
405
- "smutty",
406
- "sodomise",
407
- "sodomite",
408
- "sodomize",
409
- "sodomy",
410
- "spank",
411
- "sperm",
412
- "spic",
413
- "spick",
414
- "splooge",
415
- "spooge",
416
- "squaw",
417
  "squirting",
418
- "steamy",
419
- "stiffy",
420
  "strapon",
421
- "suck",
422
- "sucked",
423
- "sucker",
424
- "sucking",
425
- "sucks",
426
- "swallow",
427
- "swallower",
428
- "swinger",
429
- "teabagging",
430
- "testical",
431
- "testicle",
432
- "testicles",
433
- "testis",
434
  "threesome",
435
- "threeway",
436
- "titfuck",
437
- "titjob",
438
- "tits",
439
- "tittie",
440
- "titties",
441
- "titty",
442
- "tittyfuck",
443
- "tity",
444
- "toots",
445
- "topless",
446
- "trannie",
447
- "tranny",
448
- "tribadism",
449
- "twat",
450
- "twats",
451
- "undies",
452
- "undressing",
453
- "upskirt",
454
- "vag",
455
- "vagina",
456
- "vaginal",
457
- "viagra",
458
  "vibrator",
459
- "virgin",
460
- "vixen",
461
- "voyeur",
462
- "vulva",
463
- "wank",
464
- "wanker",
465
- "wanking",
466
- "wazoo",
467
- "wedgie",
468
- "wench",
469
- "wetback",
470
- "whore",
471
- "whored",
472
- "whorehouse",
473
- "whores",
474
- "whoring",
475
- "wigger",
476
- "willie",
477
- "willies",
478
- "willy",
479
- "wog",
480
- "wop",
481
- "x-rated",
482
  "xxx",
483
- "xxxxxx",
484
- "yaoi",
485
- "yid",
486
- "zoophile",
487
- "zoophilia",
488
  ]
489
 
490
- badwords = {
491
- "ar": english_badwords
 
492
  + [
493
  "احتلام",
494
  "اغتصاب",
@@ -529,9 +120,8 @@ badwords = {
529
  "مفلقسة",
530
  "نيك",
531
  ],
532
- "ca": english_badwords
533
  + [
534
- "avortament",
535
  "anal",
536
  "anus",
537
  "cul",
@@ -664,8 +254,8 @@ badwords = {
664
  "x classificat",
665
  "xxx",
666
  ],
667
- "en": english_badwords,
668
- "es": english_badwords
669
  + [
670
  "Asesinato",
671
  "Bollera",
@@ -846,7 +436,6 @@ badwords = {
846
  "tirón",
847
  "tizón",
848
  "tonto",
849
- "transexual",
850
  "vagina",
851
  "vete a la mierda",
852
  "viagra",
@@ -858,7 +447,7 @@ badwords = {
858
  "xxx",
859
  "zurullo",
860
  ],
861
- "eu": english_badwords
862
  + [
863
  "abortu",
864
  "anal",
@@ -899,13 +488,12 @@ badwords = {
899
  "bagina",
900
  "viagra",
901
  ],
902
- "fr": english_badwords
903
  + [
904
  "MALPT",
905
  "anal",
906
  "anus",
907
  "arracher",
908
- "avortement",
909
  "baise",
910
  "baiser",
911
  "baiseur de chien",
@@ -1078,7 +666,6 @@ badwords = {
1078
  "testicule",
1079
  "teuch",
1080
  "titt",
1081
- "transexuelle",
1082
  "tremper",
1083
  "tringler",
1084
  "trique",
@@ -1102,7 +689,7 @@ badwords = {
1102
  "éjaculé",
1103
  "étron",
1104
  ],
1105
- "hi": english_badwords
1106
  + [
1107
  "aand",
1108
  "aandu",
@@ -1163,7 +750,6 @@ badwords = {
1163
  "fagging",
1164
  "fagots",
1165
  "felching",
1166
- "fuckers",
1167
  "fuckings",
1168
  "fucks",
1169
  "gaand",
@@ -1361,7 +947,7 @@ badwords = {
1361
  "होमोसेक्सुअल",
1362
  "होर",
1363
  ],
1364
- "id": english_badwords
1365
  + [
1366
  "abortus",
1367
  "anal",
@@ -1479,7 +1065,7 @@ badwords = {
1479
  "x diberi peringkat",
1480
  "xxx",
1481
  ],
1482
- "kn": english_badwords
1483
  + [
1484
  "ಗರ್ಭಪಾತ",
1485
  "ಗುದ",
@@ -1622,7 +1208,7 @@ badwords = {
1622
  "x ರೇಟೆಡ್",
1623
  "xxx",
1624
  ],
1625
- "ml": english_badwords
1626
  + [
1627
  "ഗർഭഛിദ്രം",
1628
  "വ��ശപ്പ്",
@@ -1759,7 +1345,7 @@ badwords = {
1759
  "x റേറ്റുചെയ്തു",
1760
  "xxx",
1761
  ],
1762
- "mr": english_badwords
1763
  + [
1764
  "गर्भपात",
1765
  "गुदा",
@@ -1898,7 +1484,7 @@ badwords = {
1898
  "एक्स रेट केले",
1899
  "xxx",
1900
  ],
1901
- "pt": english_badwords
1902
  + [
1903
  "aborto",
1904
  "amador",
@@ -2000,7 +1586,6 @@ badwords = {
2000
  "gozar",
2001
  "grelho",
2002
  "heroína",
2003
- "homem gay",
2004
  "homoerótico",
2005
  "homosexual",
2006
  "hore",
@@ -2097,7 +1682,7 @@ badwords = {
2097
  "xxx",
2098
  "ânus",
2099
  ],
2100
- "te": english_badwords
2101
  + [
2102
  "గర్భస్రావం",
2103
  "అంగ",
@@ -2244,7 +1829,7 @@ badwords = {
2244
  "x రేట్",
2245
  "xxx",
2246
  ],
2247
- "vi": english_badwords
2248
  + [
2249
  "sự phá thai",
2250
  "hậu môn",
@@ -2362,7 +1947,7 @@ badwords = {
2362
  "x đánh giá",
2363
  "xxx",
2364
  ],
2365
- "zh": english_badwords
2366
  + [
2367
  "13.",
2368
  "13点",
6
  # https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
7
 
8
 
9
+ english_flagged_words = [
 
10
  "anal",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  "bareback",
 
 
 
12
  "bbw",
13
  "bdsm",
 
 
 
 
 
 
 
 
 
14
  "blowjob",
15
  "blowjobs",
16
+ "brazzers",
 
 
 
 
 
 
 
 
 
 
 
17
  "bukkake",
 
 
 
 
 
 
 
18
  "camgirl",
19
  "camwhore",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  "cocksucking",
21
+ "cougar",
 
 
 
 
 
 
 
 
22
  "creampie",
23
+ "cuckold",
24
  "cum",
25
  "cumming",
26
  "cums",
28
  "cumshots",
29
  "cumslut",
30
  "cunnilingus",
 
 
 
 
 
 
 
 
31
  "deepthroat",
32
  "deepthroating",
 
 
 
33
  "dildo",
34
  "dildos",
35
  "dogging",
 
36
  "doggystyle",
37
  "dominatrix",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  "erotic",
 
 
 
 
 
 
 
 
 
 
39
  "fellatio",
40
  "femdom",
 
 
 
41
  "fingering",
 
 
42
  "fisting",
 
 
 
43
  "footjob",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "gangbang",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  "handjob",
 
 
 
 
 
46
  "hentai",
 
 
 
 
 
 
 
 
 
 
47
  "horney",
48
  "horniest",
49
  "horny",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  "jism",
51
  "jizz",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  "masterbating",
 
53
  "masturbate",
54
  "masturbating",
55
  "masturbation",
56
  "milf",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  "orgies",
58
  "orgy",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  "pegging",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "porn",
61
+ "pornhub",
62
  "porno",
 
63
  "pornos",
64
+ "pornstar",
65
+ "pornstars",
66
+ "redtube",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  "rimming",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  "slutty",
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  "squirting",
 
 
70
  "strapon",
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  "threesome",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  "vibrator",
73
+ "xhamster",
74
+ "xnxx",
75
+ "xvideos",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  "xxx",
77
+ "youporn",
 
 
 
 
78
  ]
79
 
80
+
81
+ flagged_words = {
82
+ "ar": english_flagged_words
83
  + [
84
  "احتلام",
85
  "اغتصاب",
120
  "مفلقسة",
121
  "نيك",
122
  ],
123
+ "ca": english_flagged_words
124
  + [
 
125
  "anal",
126
  "anus",
127
  "cul",
254
  "x classificat",
255
  "xxx",
256
  ],
257
+ "en": english_flagged_words,
258
+ "es": english_flagged_words
259
  + [
260
  "Asesinato",
261
  "Bollera",
436
  "tirón",
437
  "tizón",
438
  "tonto",
 
439
  "vagina",
440
  "vete a la mierda",
441
  "viagra",
447
  "xxx",
448
  "zurullo",
449
  ],
450
+ "eu": english_flagged_words
451
  + [
452
  "abortu",
453
  "anal",
488
  "bagina",
489
  "viagra",
490
  ],
491
+ "fr": english_flagged_words
492
  + [
493
  "MALPT",
494
  "anal",
495
  "anus",
496
  "arracher",
 
497
  "baise",
498
  "baiser",
499
  "baiseur de chien",
666
  "testicule",
667
  "teuch",
668
  "titt",
 
669
  "tremper",
670
  "tringler",
671
  "trique",
689
  "éjaculé",
690
  "étron",
691
  ],
692
+ "hi": english_flagged_words
693
  + [
694
  "aand",
695
  "aandu",
750
  "fagging",
751
  "fagots",
752
  "felching",
 
753
  "fuckings",
754
  "fucks",
755
  "gaand",
947
  "होमोसेक्सुअल",
948
  "होर",
949
  ],
950
+ "id": english_flagged_words
951
  + [
952
  "abortus",
953
  "anal",
1065
  "x diberi peringkat",
1066
  "xxx",
1067
  ],
1068
+ "kn": english_flagged_words
1069
  + [
1070
  "ಗರ್ಭಪಾತ",
1071
  "ಗುದ",
1208
  "x ರೇಟೆಡ್",
1209
  "xxx",
1210
  ],
1211
+ "ml": english_flagged_words
1212
  + [
1213
  "ഗർഭഛിദ്രം",
1214
  "വ��ശപ്പ്",
1345
  "x റേറ്റുചെയ്തു",
1346
  "xxx",
1347
  ],
1348
+ "mr": english_flagged_words
1349
  + [
1350
  "गर्भपात",
1351
  "गुदा",
1484
  "एक्स रेट केले",
1485
  "xxx",
1486
  ],
1487
+ "pt": english_flagged_words
1488
  + [
1489
  "aborto",
1490
  "amador",
1586
  "gozar",
1587
  "grelho",
1588
  "heroína",
 
1589
  "homoerótico",
1590
  "homosexual",
1591
  "hore",
1682
  "xxx",
1683
  "ânus",
1684
  ],
1685
+ "te": english_flagged_words
1686
  + [
1687
  "గర్భస్రావం",
1688
  "అంగ",
1829
  "x రేట్",
1830
  "xxx",
1831
  ],
1832
+ "vi": english_flagged_words
1833
  + [
1834
  "sự phá thai",
1835
  "hậu môn",
1947
  "x đánh giá",
1948
  "xxx",
1949
  ],
1950
+ "zh": english_flagged_words
1951
  + [
1952
  "13.",
1953
  "13点",
languages_id.py CHANGED
@@ -6,7 +6,7 @@ langs_id = [
6
  "lang": "Afrikaans",
7
  "dataset_id": "af",
8
  "stopwords_id": "af",
9
- "badwords_id": None,
10
  "fasttext_id": "af",
11
  "sentencepiece_id": "af",
12
  "kenlm_id": "af",
@@ -15,7 +15,7 @@ langs_id = [
15
  "lang": "Arabic",
16
  "dataset_id": "ar",
17
  "stopwords_id": "ar",
18
- "badwords_id": "ar",
19
  "fasttext_id": "ar",
20
  "sentencepiece_id": "ar",
21
  "kenlm_id": "ar",
@@ -24,7 +24,7 @@ langs_id = [
24
  "lang": "Egyptian Arabic",
25
  "dataset_id": "arz",
26
  "stopwords_id": None,
27
- "badwords_id": None,
28
  "fasttext_id": "arz",
29
  "sentencepiece_id": None,
30
  "kenlm_id": None,
@@ -33,7 +33,7 @@ langs_id = [
33
  "lang": "Assamese",
34
  "dataset_id": "as",
35
  "stopwords_id": None,
36
- "badwords_id": None,
37
  "fasttext_id": "as",
38
  "sentencepiece_id": None,
39
  "kenlm_id": None,
@@ -42,7 +42,7 @@ langs_id = [
42
  "lang": "Bengali",
43
  "dataset_id": "bn",
44
  "stopwords_id": "bn",
45
- "badwords_id": None,
46
  "fasttext_id": "bn",
47
  "sentencepiece_id": "bn",
48
  "kenlm_id": "bn",
@@ -51,7 +51,7 @@ langs_id = [
51
  "lang": "Catalan",
52
  "dataset_id": "ca",
53
  "stopwords_id": "ca",
54
- "badwords_id": "ca",
55
  "fasttext_id": "ca",
56
  "sentencepiece_id": "ca",
57
  "kenlm_id": "ca",
@@ -60,7 +60,7 @@ langs_id = [
60
  "lang": "English",
61
  "dataset_id": "en",
62
  "stopwords_id": "en",
63
- "badwords_id": "en",
64
  "fasttext_id": "en",
65
  "sentencepiece_id": "en",
66
  "kenlm_id": "en",
@@ -69,7 +69,7 @@ langs_id = [
69
  "lang": "Spanish",
70
  "dataset_id": "es",
71
  "stopwords_id": "es",
72
- "badwords_id": "es",
73
  "fasttext_id": "es",
74
  "sentencepiece_id": "es",
75
  "kenlm_id": "es",
@@ -78,7 +78,7 @@ langs_id = [
78
  "lang": "Basque",
79
  "dataset_id": "eu",
80
  "stopwords_id": "eu",
81
- "badwords_id": "eu",
82
  "fasttext_id": "eu",
83
  "sentencepiece_id": None,
84
  "kenlm_id": None,
@@ -87,7 +87,7 @@ langs_id = [
87
  "lang": "French",
88
  "dataset_id": "fr",
89
  "stopwords_id": "fr",
90
- "badwords_id": "fr",
91
  "fasttext_id": "fr",
92
  "sentencepiece_id": "fr",
93
  "kenlm_id": "fr",
@@ -96,7 +96,7 @@ langs_id = [
96
  "lang": "Gujarati",
97
  "dataset_id": "gu",
98
  "stopwords_id": None,
99
- "badwords_id": None,
100
  "fasttext_id": "gu",
101
  "sentencepiece_id": "gu",
102
  "kenlm_id": "gu",
@@ -105,7 +105,7 @@ langs_id = [
105
  "lang": "Hindi",
106
  "dataset_id": "hi",
107
  "stopwords_id": "hi",
108
- "badwords_id": "hi",
109
  "fasttext_id": "hi",
110
  "sentencepiece_id": "hi",
111
  "kenlm_id": "hi",
@@ -114,7 +114,7 @@ langs_id = [
114
  "lang": "Indonesian",
115
  "dataset_id": "id",
116
  "stopwords_id": "id",
117
- "badwords_id": "id",
118
  "fasttext_id": "id",
119
  "sentencepiece_id": "id",
120
  "kenlm_id": "id",
@@ -123,7 +123,7 @@ langs_id = [
123
  "lang": "Kannada",
124
  "dataset_id": "kn",
125
  "stopwords_id": None,
126
- "badwords_id": "kn",
127
  "fasttext_id": "kn",
128
  "sentencepiece_id": "kn",
129
  "kenlm_id": "kn",
@@ -132,7 +132,7 @@ langs_id = [
132
  "lang": "Malayalam",
133
  "dataset_id": "ml",
134
  "stopwords_id": None,
135
- "badwords_id": "ml",
136
  "fasttext_id": "ml",
137
  "sentencepiece_id": "ml",
138
  "kenlm_id": "ml",
@@ -141,7 +141,7 @@ langs_id = [
141
  "lang": "Marathi",
142
  "dataset_id": "mr",
143
  "stopwords_id": "mr",
144
- "badwords_id": "mr",
145
  "fasttext_id": "mr",
146
  "sentencepiece_id": "mr",
147
  "kenlm_id": "mr",
@@ -150,7 +150,7 @@ langs_id = [
150
  "lang": "Portuguese",
151
  "dataset_id": "pt",
152
  "stopwords_id": "pt",
153
- "badwords_id": "pt",
154
  "fasttext_id": "pt",
155
  "sentencepiece_id": "pt",
156
  "kenlm_id": "pt",
@@ -159,7 +159,7 @@ langs_id = [
159
  "lang": "Somali",
160
  "dataset_id": "so",
161
  "stopwords_id": "so",
162
- "badwords_id": None,
163
  "fasttext_id": "so",
164
  "sentencepiece_id": None,
165
  "kenlm_id": None,
@@ -168,7 +168,7 @@ langs_id = [
168
  "lang": "Swahili",
169
  "dataset_id": "sw",
170
  "stopwords_id": "sw",
171
- "badwords_id": None,
172
  "fasttext_id": "sw",
173
  "sentencepiece_id": None,
174
  "kenlm_id": None,
@@ -177,7 +177,7 @@ langs_id = [
177
  "lang": "Tamil",
178
  "dataset_id": "ta",
179
  "stopwords_id": None,
180
- "badwords_id": None,
181
  "fasttext_id": "ta",
182
  "sentencepiece_id": None,
183
  "kenlm_id": None,
@@ -186,7 +186,7 @@ langs_id = [
186
  "lang": "Telugu",
187
  "dataset_id": "te",
188
  "stopwords_id": None,
189
- "badwords_id": "te",
190
  "fasttext_id": "te",
191
  "sentencepiece_id": None,
192
  "kenlm_id": None,
@@ -195,7 +195,7 @@ langs_id = [
195
  "lang": "Urdu",
196
  "dataset_id": "ur",
197
  "stopwords_id": "ur",
198
- "badwords_id": None,
199
  "fasttext_id": "ur",
200
  "sentencepiece_id": None,
201
  "kenlm_id": None,
@@ -204,7 +204,7 @@ langs_id = [
204
  "lang": "Vietnamese",
205
  "dataset_id": "vi",
206
  "stopwords_id": "vi",
207
- "badwords_id": "vi",
208
  "fasttext_id": "vi",
209
  "sentencepiece_id": None,
210
  "kenlm_id": None,
@@ -213,7 +213,7 @@ langs_id = [
213
  "lang": "Yoruba",
214
  "dataset_id": "yo",
215
  "stopwords_id": "yo",
216
- "badwords_id": None,
217
  "fasttext_id": "yo",
218
  "sentencepiece_id": None,
219
  "kenlm_id": None,
@@ -222,7 +222,7 @@ langs_id = [
222
  "lang": "Chinese",
223
  "dataset_id": "zh",
224
  "stopwords_id": "zh",
225
- "badwords_id": "zh",
226
  "fasttext_id": "zh",
227
  "sentencepiece_id": "zh",
228
  "kenlm_id": "zh",
6
  "lang": "Afrikaans",
7
  "dataset_id": "af",
8
  "stopwords_id": "af",
9
+ "flagged_words_id": None,
10
  "fasttext_id": "af",
11
  "sentencepiece_id": "af",
12
  "kenlm_id": "af",
15
  "lang": "Arabic",
16
  "dataset_id": "ar",
17
  "stopwords_id": "ar",
18
+ "flagged_words_id": "ar",
19
  "fasttext_id": "ar",
20
  "sentencepiece_id": "ar",
21
  "kenlm_id": "ar",
24
  "lang": "Egyptian Arabic",
25
  "dataset_id": "arz",
26
  "stopwords_id": None,
27
+ "flagged_words_id": None,
28
  "fasttext_id": "arz",
29
  "sentencepiece_id": None,
30
  "kenlm_id": None,
33
  "lang": "Assamese",
34
  "dataset_id": "as",
35
  "stopwords_id": None,
36
+ "flagged_words_id": None,
37
  "fasttext_id": "as",
38
  "sentencepiece_id": None,
39
  "kenlm_id": None,
42
  "lang": "Bengali",
43
  "dataset_id": "bn",
44
  "stopwords_id": "bn",
45
+ "flagged_words_id": None,
46
  "fasttext_id": "bn",
47
  "sentencepiece_id": "bn",
48
  "kenlm_id": "bn",
51
  "lang": "Catalan",
52
  "dataset_id": "ca",
53
  "stopwords_id": "ca",
54
+ "flagged_words_id": "ca",
55
  "fasttext_id": "ca",
56
  "sentencepiece_id": "ca",
57
  "kenlm_id": "ca",
60
  "lang": "English",
61
  "dataset_id": "en",
62
  "stopwords_id": "en",
63
+ "flagged_words_id": "en",
64
  "fasttext_id": "en",
65
  "sentencepiece_id": "en",
66
  "kenlm_id": "en",
69
  "lang": "Spanish",
70
  "dataset_id": "es",
71
  "stopwords_id": "es",
72
+ "flagged_words_id": "es",
73
  "fasttext_id": "es",
74
  "sentencepiece_id": "es",
75
  "kenlm_id": "es",
78
  "lang": "Basque",
79
  "dataset_id": "eu",
80
  "stopwords_id": "eu",
81
+ "flagged_words_id": "eu",
82
  "fasttext_id": "eu",
83
  "sentencepiece_id": None,
84
  "kenlm_id": None,
87
  "lang": "French",
88
  "dataset_id": "fr",
89
  "stopwords_id": "fr",
90
+ "flagged_words_id": "fr",
91
  "fasttext_id": "fr",
92
  "sentencepiece_id": "fr",
93
  "kenlm_id": "fr",
96
  "lang": "Gujarati",
97
  "dataset_id": "gu",
98
  "stopwords_id": None,
99
+ "flagged_words_id": None,
100
  "fasttext_id": "gu",
101
  "sentencepiece_id": "gu",
102
  "kenlm_id": "gu",
105
  "lang": "Hindi",
106
  "dataset_id": "hi",
107
  "stopwords_id": "hi",
108
+ "flagged_words_id": "hi",
109
  "fasttext_id": "hi",
110
  "sentencepiece_id": "hi",
111
  "kenlm_id": "hi",
114
  "lang": "Indonesian",
115
  "dataset_id": "id",
116
  "stopwords_id": "id",
117
+ "flagged_words_id": "id",
118
  "fasttext_id": "id",
119
  "sentencepiece_id": "id",
120
  "kenlm_id": "id",
123
  "lang": "Kannada",
124
  "dataset_id": "kn",
125
  "stopwords_id": None,
126
+ "flagged_words_id": "kn",
127
  "fasttext_id": "kn",
128
  "sentencepiece_id": "kn",
129
  "kenlm_id": "kn",
132
  "lang": "Malayalam",
133
  "dataset_id": "ml",
134
  "stopwords_id": None,
135
+ "flagged_words_id": "ml",
136
  "fasttext_id": "ml",
137
  "sentencepiece_id": "ml",
138
  "kenlm_id": "ml",
141
  "lang": "Marathi",
142
  "dataset_id": "mr",
143
  "stopwords_id": "mr",
144
+ "flagged_words_id": "mr",
145
  "fasttext_id": "mr",
146
  "sentencepiece_id": "mr",
147
  "kenlm_id": "mr",
150
  "lang": "Portuguese",
151
  "dataset_id": "pt",
152
  "stopwords_id": "pt",
153
+ "flagged_words_id": "pt",
154
  "fasttext_id": "pt",
155
  "sentencepiece_id": "pt",
156
  "kenlm_id": "pt",
159
  "lang": "Somali",
160
  "dataset_id": "so",
161
  "stopwords_id": "so",
162
+ "flagged_words_id": None,
163
  "fasttext_id": "so",
164
  "sentencepiece_id": None,
165
  "kenlm_id": None,
168
  "lang": "Swahili",
169
  "dataset_id": "sw",
170
  "stopwords_id": "sw",
171
+ "flagged_words_id": None,
172
  "fasttext_id": "sw",
173
  "sentencepiece_id": None,
174
  "kenlm_id": None,
177
  "lang": "Tamil",
178
  "dataset_id": "ta",
179
  "stopwords_id": None,
180
+ "flagged_words_id": None,
181
  "fasttext_id": "ta",
182
  "sentencepiece_id": None,
183
  "kenlm_id": None,
186
  "lang": "Telugu",
187
  "dataset_id": "te",
188
  "stopwords_id": None,
189
+ "flagged_words_id": "te",
190
  "fasttext_id": "te",
191
  "sentencepiece_id": None,
192
  "kenlm_id": None,
195
  "lang": "Urdu",
196
  "dataset_id": "ur",
197
  "stopwords_id": "ur",
198
+ "flagged_words_id": None,
199
  "fasttext_id": "ur",
200
  "sentencepiece_id": None,
201
  "kenlm_id": None,
204
  "lang": "Vietnamese",
205
  "dataset_id": "vi",
206
  "stopwords_id": "vi",
207
+ "flagged_words_id": "vi",
208
  "fasttext_id": "vi",
209
  "sentencepiece_id": None,
210
  "kenlm_id": None,
213
  "lang": "Yoruba",
214
  "dataset_id": "yo",
215
  "stopwords_id": "yo",
216
+ "flagged_words_id": None,
217
  "fasttext_id": "yo",
218
  "sentencepiece_id": None,
219
  "kenlm_id": None,
222
  "lang": "Chinese",
223
  "dataset_id": "zh",
224
  "stopwords_id": "zh",
225
+ "flagged_words_id": "zh",
226
  "fasttext_id": "zh",
227
  "sentencepiece_id": "zh",
228
  "kenlm_id": "zh",
parameters_filtering.py CHANGED
@@ -39,8 +39,8 @@ parameters_filtering_default = {
39
  "words_augmentation_join_char": "",
40
  "cond_check_stopwords": False,
41
  "stopwords_min_cutoff": 0,
42
- "cond_check_badwords": False,
43
- "badwords_max_cutoff": 0.2,
44
  "cond_check_lang_id": True,
45
  "lang_id_min_cutoff": 0.70,
46
  "cond_check_perplexity": False,
@@ -70,8 +70,8 @@ parameters_filtering_af = {
70
  "words_augmentation_join_char": "",
71
  "cond_check_stopwords": True,
72
  "stopwords_min_cutoff": 0,
73
- "cond_check_badwords": False,
74
- "badwords_max_cutoff": 0.2,
75
  "cond_check_lang_id": True,
76
  "lang_id_min_cutoff": 0.6,
77
  "cond_check_perplexity": True,
@@ -101,8 +101,8 @@ parameters_filtering_ar = {
101
  "words_augmentation_join_char": "",
102
  "cond_check_stopwords": True,
103
  "stopwords_min_cutoff": 0,
104
- "cond_check_badwords": False,
105
- "badwords_max_cutoff": 0.2,
106
  "cond_check_lang_id": True,
107
  "lang_id_min_cutoff": 0.75,
108
  "cond_check_perplexity": True,
@@ -132,8 +132,8 @@ parameters_filtering_arz = {
132
  "words_augmentation_join_char": "",
133
  "cond_check_stopwords": True,
134
  "stopwords_min_cutoff": 0,
135
- "cond_check_badwords": False,
136
- "badwords_max_cutoff": 0.2,
137
  "cond_check_lang_id": True,
138
  "lang_id_min_cutoff": 0.75,
139
  "cond_check_perplexity": False,
@@ -163,8 +163,8 @@ parameters_filtering_as = {
163
  "words_augmentation_join_char": "",
164
  "cond_check_stopwords": True,
165
  "stopwords_min_cutoff": 0,
166
- "cond_check_badwords": False,
167
- "badwords_max_cutoff": 0.2,
168
  "cond_check_lang_id": True,
169
  "lang_id_min_cutoff": 0.75,
170
  "cond_check_perplexity": False,
@@ -194,8 +194,8 @@ parameters_filtering_bn = {
194
  "words_augmentation_join_char": "",
195
  "cond_check_stopwords": True,
196
  "stopwords_min_cutoff": 0.05,
197
- "cond_check_badwords": False,
198
- "badwords_max_cutoff": 0.2,
199
  "cond_check_lang_id": True,
200
  "lang_id_min_cutoff": 0.75,
201
  "cond_check_perplexity": False,
@@ -225,8 +225,8 @@ parameters_filtering_ca = {
225
  "words_augmentation_join_char": "",
226
  "cond_check_stopwords": True,
227
  "stopwords_min_cutoff": 0,
228
- "cond_check_badwords": False,
229
- "badwords_max_cutoff": 0.2,
230
  "cond_check_lang_id": True,
231
  "lang_id_min_cutoff": 0.75,
232
  "cond_check_perplexity": True,
@@ -256,8 +256,8 @@ parameters_filtering_en = {
256
  "words_augmentation_join_char": "",
257
  "cond_check_stopwords": True,
258
  "stopwords_min_cutoff": 0.3,
259
- "cond_check_badwords": True,
260
- "badwords_max_cutoff": 0.045,
261
  "cond_check_lang_id": True,
262
  "lang_id_min_cutoff": 0.80,
263
  "cond_check_perplexity": True,
@@ -287,8 +287,8 @@ parameters_filtering_es = {
287
  "words_augmentation_join_char": "",
288
  "cond_check_stopwords": True,
289
  "stopwords_min_cutoff": 0.2,
290
- "cond_check_badwords": False,
291
- "badwords_max_cutoff": 0.2,
292
  "cond_check_lang_id": True,
293
  "lang_id_min_cutoff": 0.75,
294
  "cond_check_perplexity": True,
@@ -318,8 +318,8 @@ parameters_filtering_eu = {
318
  "words_augmentation_join_char": "",
319
  "cond_check_stopwords": True,
320
  "stopwords_min_cutoff": 0,
321
- "cond_check_badwords": False,
322
- "badwords_max_cutoff": 0.2,
323
  "cond_check_lang_id": True,
324
  "lang_id_min_cutoff": 0.75,
325
  "cond_check_perplexity": False,
@@ -349,8 +349,8 @@ parameters_filtering_fr = {
349
  "words_augmentation_join_char": "",
350
  "cond_check_stopwords": True,
351
  "stopwords_min_cutoff": 0.15,
352
- "cond_check_badwords": False,
353
- "badwords_max_cutoff": 0.2,
354
  "cond_check_lang_id": True,
355
  "lang_id_min_cutoff": 0.75,
356
  "cond_check_perplexity": True,
@@ -380,8 +380,8 @@ parameters_filtering_gu = {
380
  "words_augmentation_join_char": "",
381
  "cond_check_stopwords": True,
382
  "stopwords_min_cutoff": 0,
383
- "cond_check_badwords": False,
384
- "badwords_max_cutoff": 0.2,
385
  "cond_check_lang_id": True,
386
  "lang_id_min_cutoff": 0.75,
387
  "cond_check_perplexity": True,
@@ -411,8 +411,8 @@ parameters_filtering_hi = {
411
  "words_augmentation_join_char": "",
412
  "cond_check_stopwords": True,
413
  "stopwords_min_cutoff": 0,
414
- "cond_check_badwords": False,
415
- "badwords_max_cutoff": 0.2,
416
  "cond_check_lang_id": True,
417
  "lang_id_min_cutoff": 0.75,
418
  "cond_check_perplexity": True,
@@ -442,8 +442,8 @@ parameters_filtering_id = {
442
  "words_augmentation_join_char": "",
443
  "cond_check_stopwords": True,
444
  "stopwords_min_cutoff": 0.25,
445
- "cond_check_badwords": False,
446
- "badwords_max_cutoff": 0.2,
447
  "cond_check_lang_id": True,
448
  "lang_id_min_cutoff": 0.75,
449
  "cond_check_perplexity": True,
@@ -473,8 +473,8 @@ parameters_filtering_kn = {
473
  "words_augmentation_join_char": "",
474
  "cond_check_stopwords": True,
475
  "stopwords_min_cutoff": 0,
476
- "cond_check_badwords": False,
477
- "badwords_max_cutoff": 0.2,
478
  "cond_check_lang_id": True,
479
  "lang_id_min_cutoff": 0.75,
480
  "cond_check_perplexity": True,
@@ -504,8 +504,8 @@ parameters_filtering_ml = {
504
  "words_augmentation_join_char": "",
505
  "cond_check_stopwords": True,
506
  "stopwords_min_cutoff": 0,
507
- "cond_check_badwords": False,
508
- "badwords_max_cutoff": 0.2,
509
  "cond_check_lang_id": True,
510
  "lang_id_min_cutoff": 0.75,
511
  "cond_check_perplexity": True,
@@ -535,8 +535,8 @@ parameters_filtering_mr = {
535
  "words_augmentation_join_char": "",
536
  "cond_check_stopwords": True,
537
  "stopwords_min_cutoff": 0,
538
- "cond_check_badwords": False,
539
- "badwords_max_cutoff": 0.2,
540
  "cond_check_lang_id": True,
541
  "lang_id_min_cutoff": 0.75,
542
  "cond_check_perplexity": True,
@@ -566,8 +566,8 @@ parameters_filtering_pt = {
566
  "words_augmentation_join_char": "",
567
  "cond_check_stopwords": True,
568
  "stopwords_min_cutoff": 0.15,
569
- "cond_check_badwords": False,
570
- "badwords_max_cutoff": 0.2,
571
  "cond_check_lang_id": True,
572
  "lang_id_min_cutoff": 0.75,
573
  "cond_check_perplexity": True,
@@ -597,8 +597,8 @@ parameters_filtering_so = {
597
  "words_augmentation_join_char": "",
598
  "cond_check_stopwords": False,
599
  "stopwords_min_cutoff": 0,
600
- "cond_check_badwords": False,
601
- "badwords_max_cutoff": 0.2,
602
  "cond_check_lang_id": True,
603
  "lang_id_min_cutoff": 0.75,
604
  "cond_check_perplexity": False,
@@ -628,8 +628,8 @@ parameters_filtering_sw = {
628
  "words_augmentation_join_char": "",
629
  "cond_check_stopwords": True,
630
  "stopwords_min_cutoff": 0,
631
- "cond_check_badwords": False,
632
- "badwords_max_cutoff": 0.2,
633
  "cond_check_lang_id": True,
634
  "lang_id_min_cutoff": 0.75,
635
  "cond_check_perplexity": False,
@@ -659,8 +659,8 @@ parameters_filtering_ta = {
659
  "words_augmentation_join_char": "",
660
  "cond_check_stopwords": True,
661
  "stopwords_min_cutoff": 0,
662
- "cond_check_badwords": False,
663
- "badwords_max_cutoff": 0.2,
664
  "cond_check_lang_id": True,
665
  "lang_id_min_cutoff": 0.75,
666
  "cond_check_perplexity": False,
@@ -690,8 +690,8 @@ parameters_filtering_te = {
690
  "words_augmentation_join_char": "",
691
  "cond_check_stopwords": True,
692
  "stopwords_min_cutoff": 0,
693
- "cond_check_badwords": False,
694
- "badwords_max_cutoff": 0.2,
695
  "cond_check_lang_id": True,
696
  "lang_id_min_cutoff": 0.75,
697
  "cond_check_perplexity": False,
@@ -721,8 +721,8 @@ parameters_filtering_ur = {
721
  "words_augmentation_join_char": "",
722
  "cond_check_stopwords": True,
723
  "stopwords_min_cutoff": 0,
724
- "cond_check_badwords": False,
725
- "badwords_max_cutoff": 0.2,
726
  "cond_check_lang_id": True,
727
  "lang_id_min_cutoff": 0.75,
728
  "cond_check_perplexity": False,
@@ -752,8 +752,8 @@ parameters_filtering_vi = {
752
  "words_augmentation_join_char": " ",
753
  "cond_check_stopwords": True,
754
  "stopwords_min_cutoff": 0,
755
- "cond_check_badwords": False,
756
- "badwords_max_cutoff": 0.2,
757
  "cond_check_lang_id": True,
758
  "lang_id_min_cutoff": 0.75,
759
  "cond_check_perplexity": False,
@@ -783,8 +783,8 @@ parameters_filtering_yo = {
783
  "words_augmentation_join_char": "",
784
  "cond_check_stopwords": True,
785
  "stopwords_min_cutoff": 0,
786
- "cond_check_badwords": False,
787
- "badwords_max_cutoff": 0.2,
788
  "cond_check_lang_id": True,
789
  "lang_id_min_cutoff": 0.75,
790
  "cond_check_perplexity": False,
@@ -814,8 +814,8 @@ parameters_filtering_zh = {
814
  "words_augmentation_join_char": "",
815
  "cond_check_stopwords": False,
816
  "stopwords_min_cutoff": 0,
817
- "cond_check_badwords": False,
818
- "badwords_max_cutoff": 0.2,
819
  "cond_check_lang_id": True,
820
  "lang_id_min_cutoff": 0.75,
821
  "cond_check_perplexity": False,
39
  "words_augmentation_join_char": "",
40
  "cond_check_stopwords": False,
41
  "stopwords_min_cutoff": 0,
42
+ "cond_check_flagged_words": False,
43
+ "flagged_words_max_cutoff": 0.2,
44
  "cond_check_lang_id": True,
45
  "lang_id_min_cutoff": 0.70,
46
  "cond_check_perplexity": False,
70
  "words_augmentation_join_char": "",
71
  "cond_check_stopwords": True,
72
  "stopwords_min_cutoff": 0,
73
+ "cond_check_flagged_words": False,
74
+ "flagged_words_max_cutoff": 0.2,
75
  "cond_check_lang_id": True,
76
  "lang_id_min_cutoff": 0.6,
77
  "cond_check_perplexity": True,
101
  "words_augmentation_join_char": "",
102
  "cond_check_stopwords": True,
103
  "stopwords_min_cutoff": 0,
104
+ "cond_check_flagged_words": False,
105
+ "flagged_words_max_cutoff": 0.2,
106
  "cond_check_lang_id": True,
107
  "lang_id_min_cutoff": 0.75,
108
  "cond_check_perplexity": True,
132
  "words_augmentation_join_char": "",
133
  "cond_check_stopwords": True,
134
  "stopwords_min_cutoff": 0,
135
+ "cond_check_flagged_words": False,
136
+ "flagged_words_max_cutoff": 0.2,
137
  "cond_check_lang_id": True,
138
  "lang_id_min_cutoff": 0.75,
139
  "cond_check_perplexity": False,
163
  "words_augmentation_join_char": "",
164
  "cond_check_stopwords": True,
165
  "stopwords_min_cutoff": 0,
166
+ "cond_check_flagged_words": False,
167
+ "flagged_words_max_cutoff": 0.2,
168
  "cond_check_lang_id": True,
169
  "lang_id_min_cutoff": 0.75,
170
  "cond_check_perplexity": False,
194
  "words_augmentation_join_char": "",
195
  "cond_check_stopwords": True,
196
  "stopwords_min_cutoff": 0.05,
197
+ "cond_check_flagged_words": False,
198
+ "flagged_words_max_cutoff": 0.2,
199
  "cond_check_lang_id": True,
200
  "lang_id_min_cutoff": 0.75,
201
  "cond_check_perplexity": False,
225
  "words_augmentation_join_char": "",
226
  "cond_check_stopwords": True,
227
  "stopwords_min_cutoff": 0,
228
+ "cond_check_flagged_words": False,
229
+ "flagged_words_max_cutoff": 0.2,
230
  "cond_check_lang_id": True,
231
  "lang_id_min_cutoff": 0.75,
232
  "cond_check_perplexity": True,
256
  "words_augmentation_join_char": "",
257
  "cond_check_stopwords": True,
258
  "stopwords_min_cutoff": 0.3,
259
+ "cond_check_flagged_words": True,
260
+ "flagged_words_max_cutoff": 0.045,
261
  "cond_check_lang_id": True,
262
  "lang_id_min_cutoff": 0.80,
263
  "cond_check_perplexity": True,
287
  "words_augmentation_join_char": "",
288
  "cond_check_stopwords": True,
289
  "stopwords_min_cutoff": 0.2,
290
+ "cond_check_flagged_words": False,
291
+ "flagged_words_max_cutoff": 0.2,
292
  "cond_check_lang_id": True,
293
  "lang_id_min_cutoff": 0.75,
294
  "cond_check_perplexity": True,
318
  "words_augmentation_join_char": "",
319
  "cond_check_stopwords": True,
320
  "stopwords_min_cutoff": 0,
321
+ "cond_check_flagged_words": False,
322
+ "flagged_words_max_cutoff": 0.2,
323
  "cond_check_lang_id": True,
324
  "lang_id_min_cutoff": 0.75,
325
  "cond_check_perplexity": False,
349
  "words_augmentation_join_char": "",
350
  "cond_check_stopwords": True,
351
  "stopwords_min_cutoff": 0.15,
352
+ "cond_check_flagged_words": False,
353
+ "flagged_words_max_cutoff": 0.2,
354
  "cond_check_lang_id": True,
355
  "lang_id_min_cutoff": 0.75,
356
  "cond_check_perplexity": True,
380
  "words_augmentation_join_char": "",
381
  "cond_check_stopwords": True,
382
  "stopwords_min_cutoff": 0,
383
+ "cond_check_flagged_words": False,
384
+ "flagged_words_max_cutoff": 0.2,
385
  "cond_check_lang_id": True,
386
  "lang_id_min_cutoff": 0.75,
387
  "cond_check_perplexity": True,
411
  "words_augmentation_join_char": "",
412
  "cond_check_stopwords": True,
413
  "stopwords_min_cutoff": 0,
414
+ "cond_check_flagged_words": False,
415
+ "flagged_words_max_cutoff": 0.2,
416
  "cond_check_lang_id": True,
417
  "lang_id_min_cutoff": 0.75,
418
  "cond_check_perplexity": True,
442
  "words_augmentation_join_char": "",
443
  "cond_check_stopwords": True,
444
  "stopwords_min_cutoff": 0.25,
445
+ "cond_check_flagged_words": False,
446
+ "flagged_words_max_cutoff": 0.2,
447
  "cond_check_lang_id": True,
448
  "lang_id_min_cutoff": 0.75,
449
  "cond_check_perplexity": True,
473
  "words_augmentation_join_char": "",
474
  "cond_check_stopwords": True,
475
  "stopwords_min_cutoff": 0,
476
+ "cond_check_flagged_words": False,
477
+ "flagged_words_max_cutoff": 0.2,
478
  "cond_check_lang_id": True,
479
  "lang_id_min_cutoff": 0.75,
480
  "cond_check_perplexity": True,
504
  "words_augmentation_join_char": "",
505
  "cond_check_stopwords": True,
506
  "stopwords_min_cutoff": 0,
507
+ "cond_check_flagged_words": False,
508
+ "flagged_words_max_cutoff": 0.2,
509
  "cond_check_lang_id": True,
510
  "lang_id_min_cutoff": 0.75,
511
  "cond_check_perplexity": True,
535
  "words_augmentation_join_char": "",
536
  "cond_check_stopwords": True,
537
  "stopwords_min_cutoff": 0,
538
+ "cond_check_flagged_words": False,
539
+ "flagged_words_max_cutoff": 0.2,
540
  "cond_check_lang_id": True,
541
  "lang_id_min_cutoff": 0.75,
542
  "cond_check_perplexity": True,
566
  "words_augmentation_join_char": "",
567
  "cond_check_stopwords": True,
568
  "stopwords_min_cutoff": 0.15,
569
+ "cond_check_flagged_words": False,
570
+ "flagged_words_max_cutoff": 0.2,
571
  "cond_check_lang_id": True,
572
  "lang_id_min_cutoff": 0.75,
573
  "cond_check_perplexity": True,
597
  "words_augmentation_join_char": "",
598
  "cond_check_stopwords": False,
599
  "stopwords_min_cutoff": 0,
600
+ "cond_check_flagged_words": False,
601
+ "flagged_words_max_cutoff": 0.2,
602
  "cond_check_lang_id": True,
603
  "lang_id_min_cutoff": 0.75,
604
  "cond_check_perplexity": False,
628
  "words_augmentation_join_char": "",
629
  "cond_check_stopwords": True,
630
  "stopwords_min_cutoff": 0,
631
+ "cond_check_flagged_words": False,
632
+ "flagged_words_max_cutoff": 0.2,
633
  "cond_check_lang_id": True,
634
  "lang_id_min_cutoff": 0.75,
635
  "cond_check_perplexity": False,
659
  "words_augmentation_join_char": "",
660
  "cond_check_stopwords": True,
661
  "stopwords_min_cutoff": 0,
662
+ "cond_check_flagged_words": False,
663
+ "flagged_words_max_cutoff": 0.2,
664
  "cond_check_lang_id": True,
665
  "lang_id_min_cutoff": 0.75,
666
  "cond_check_perplexity": False,
690
  "words_augmentation_join_char": "",
691
  "cond_check_stopwords": True,
692
  "stopwords_min_cutoff": 0,
693
+ "cond_check_flagged_words": False,
694
+ "flagged_words_max_cutoff": 0.2,
695
  "cond_check_lang_id": True,
696
  "lang_id_min_cutoff": 0.75,
697
  "cond_check_perplexity": False,
721
  "words_augmentation_join_char": "",
722
  "cond_check_stopwords": True,
723
  "stopwords_min_cutoff": 0,
724
+ "cond_check_flagged_words": False,
725
+ "flagged_words_max_cutoff": 0.2,
726
  "cond_check_lang_id": True,
727
  "lang_id_min_cutoff": 0.75,
728
  "cond_check_perplexity": False,
752
  "words_augmentation_join_char": " ",
753
  "cond_check_stopwords": True,
754
  "stopwords_min_cutoff": 0,
755
+ "cond_check_flagged_words": False,
756
+ "flagged_words_max_cutoff": 0.2,
757
  "cond_check_lang_id": True,
758
  "lang_id_min_cutoff": 0.75,
759
  "cond_check_perplexity": False,
783
  "words_augmentation_join_char": "",
784
  "cond_check_stopwords": True,
785
  "stopwords_min_cutoff": 0,
786
+ "cond_check_flagged_words": False,
787
+ "flagged_words_max_cutoff": 0.2,
788
  "cond_check_lang_id": True,
789
  "lang_id_min_cutoff": 0.75,
790
  "cond_check_perplexity": False,
814
  "words_augmentation_join_char": "",
815
  "cond_check_stopwords": False,
816
  "stopwords_min_cutoff": 0,
817
+ "cond_check_flagged_words": False,
818
+ "flagged_words_max_cutoff": 0.2,
819
  "cond_check_lang_id": True,
820
  "lang_id_min_cutoff": 0.75,
821
  "cond_check_perplexity": False,