HugoLaurencon committed
Commit
a3825e5
1 Parent(s): 5b8f851

first commit

.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *cpython-39.pyc
2
+ .DS_Store
LICENSE ADDED
@@ -0,0 +1,204 @@
1
+ ------------- LICENSE FOR Bigscience code --------------
2
+
3
+
4
+ Apache License
5
+ Version 2.0, January 2004
6
+ http://www.apache.org/licenses/
7
+
8
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
9
+
10
+ 1. Definitions.
11
+
12
+ "License" shall mean the terms and conditions for use, reproduction,
13
+ and distribution as defined by Sections 1 through 9 of this document.
14
+
15
+ "Licensor" shall mean the copyright owner or entity authorized by
16
+ the copyright owner that is granting the License.
17
+
18
+ "Legal Entity" shall mean the union of the acting entity and all
19
+ other entities that control, are controlled by, or are under common
20
+ control with that entity. For the purposes of this definition,
21
+ "control" means (i) the power, direct or indirect, to cause the
22
+ direction or management of such entity, whether by contract or
23
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
24
+ outstanding shares, or (iii) beneficial ownership of such entity.
25
+
26
+ "You" (or "Your") shall mean an individual or Legal Entity
27
+ exercising permissions granted by this License.
28
+
29
+ "Source" form shall mean the preferred form for making modifications,
30
+ including but not limited to software source code, documentation
31
+ source, and configuration files.
32
+
33
+ "Object" form shall mean any form resulting from mechanical
34
+ transformation or translation of a Source form, including but
35
+ not limited to compiled object code, generated documentation,
36
+ and conversions to other media types.
37
+
38
+ "Work" shall mean the work of authorship, whether in Source or
39
+ Object form, made available under the License, as indicated by a
40
+ copyright notice that is included in or attached to the work
41
+ (an example is provided in the Appendix below).
42
+
43
+ "Derivative Works" shall mean any work, whether in Source or Object
44
+ form, that is based on (or derived from) the Work and for which the
45
+ editorial revisions, annotations, elaborations, or other modifications
46
+ represent, as a whole, an original work of authorship. For the purposes
47
+ of this License, Derivative Works shall not include works that remain
48
+ separable from, or merely link (or bind by name) to the interfaces of,
49
+ the Work and Derivative Works thereof.
50
+
51
+ "Contribution" shall mean any work of authorship, including
52
+ the original version of the Work and any modifications or additions
53
+ to that Work or Derivative Works thereof, that is intentionally
54
+ submitted to Licensor for inclusion in the Work by the copyright owner
55
+ or by an individual or Legal Entity authorized to submit on behalf of
56
+ the copyright owner. For the purposes of this definition, "submitted"
57
+ means any form of electronic, verbal, or written communication sent
58
+ to the Licensor or its representatives, including but not limited to
59
+ communication on electronic mailing lists, source code control systems,
60
+ and issue tracking systems that are managed by, or on behalf of, the
61
+ Licensor for the purpose of discussing and improving the Work, but
62
+ excluding communication that is conspicuously marked or otherwise
63
+ designated in writing by the copyright owner as "Not a Contribution."
64
+
65
+ "Contributor" shall mean Licensor and any individual or Legal Entity
66
+ on behalf of whom a Contribution has been received by Licensor and
67
+ subsequently incorporated within the Work.
68
+
69
+ 2. Grant of Copyright License. Subject to the terms and conditions of
70
+ this License, each Contributor hereby grants to You a perpetual,
71
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
72
+ copyright license to reproduce, prepare Derivative Works of,
73
+ publicly display, publicly perform, sublicense, and distribute the
74
+ Work and such Derivative Works in Source or Object form.
75
+
76
+ 3. Grant of Patent License. Subject to the terms and conditions of
77
+ this License, each Contributor hereby grants to You a perpetual,
78
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
79
+ (except as stated in this section) patent license to make, have made,
80
+ use, offer to sell, sell, import, and otherwise transfer the Work,
81
+ where such license applies only to those patent claims licensable
82
+ by such Contributor that are necessarily infringed by their
83
+ Contribution(s) alone or by combination of their Contribution(s)
84
+ with the Work to which such Contribution(s) was submitted. If You
85
+ institute patent litigation against any entity (including a
86
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
87
+ or a Contribution incorporated within the Work constitutes direct
88
+ or contributory patent infringement, then any patent licenses
89
+ granted to You under this License for that Work shall terminate
90
+ as of the date such litigation is filed.
91
+
92
+ 4. Redistribution. You may reproduce and distribute copies of the
93
+ Work or Derivative Works thereof in any medium, with or without
94
+ modifications, and in Source or Object form, provided that You
95
+ meet the following conditions:
96
+
97
+ (a) You must give any other recipients of the Work or
98
+ Derivative Works a copy of this License; and
99
+
100
+ (b) You must cause any modified files to carry prominent notices
101
+ stating that You changed the files; and
102
+
103
+ (c) You must retain, in the Source form of any Derivative Works
104
+ that You distribute, all copyright, patent, trademark, and
105
+ attribution notices from the Source form of the Work,
106
+ excluding those notices that do not pertain to any part of
107
+ the Derivative Works; and
108
+
109
+ (d) If the Work includes a "NOTICE" text file as part of its
110
+ distribution, then any Derivative Works that You distribute must
111
+ include a readable copy of the attribution notices contained
112
+ within such NOTICE file, excluding those notices that do not
113
+ pertain to any part of the Derivative Works, in at least one
114
+ of the following places: within a NOTICE text file distributed
115
+ as part of the Derivative Works; within the Source form or
116
+ documentation, if provided along with the Derivative Works; or,
117
+ within a display generated by the Derivative Works, if and
118
+ wherever such third-party notices normally appear. The contents
119
+ of the NOTICE file are for informational purposes only and
120
+ do not modify the License. You may add Your own attribution
121
+ notices within Derivative Works that You distribute, alongside
122
+ or as an addendum to the NOTICE text from the Work, provided
123
+ that such additional attribution notices cannot be construed
124
+ as modifying the License.
125
+
126
+ You may add Your own copyright statement to Your modifications and
127
+ may provide additional or different license terms and conditions
128
+ for use, reproduction, or distribution of Your modifications, or
129
+ for any such Derivative Works as a whole, provided Your use,
130
+ reproduction, and distribution of the Work otherwise complies with
131
+ the conditions stated in this License.
132
+
133
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
134
+ any Contribution intentionally submitted for inclusion in the Work
135
+ by You to the Licensor shall be under the terms and conditions of
136
+ this License, without any additional terms or conditions.
137
+ Notwithstanding the above, nothing herein shall supersede or modify
138
+ the terms of any separate license agreement you may have executed
139
+ with Licensor regarding such Contributions.
140
+
141
+ 6. Trademarks. This License does not grant permission to use the trade
142
+ names, trademarks, service marks, or product names of the Licensor,
143
+ except as required for reasonable and customary use in describing the
144
+ origin of the Work and reproducing the content of the NOTICE file.
145
+
146
+ 7. Disclaimer of Warranty. Unless required by applicable law or
147
+ agreed to in writing, Licensor provides the Work (and each
148
+ Contributor provides its Contributions) on an "AS IS" BASIS,
149
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
150
+ implied, including, without limitation, any warranties or conditions
151
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
152
+ PARTICULAR PURPOSE. You are solely responsible for determining the
153
+ appropriateness of using or redistributing the Work and assume any
154
+ risks associated with Your exercise of permissions under this License.
155
+
156
+ 8. Limitation of Liability. In no event and under no legal theory,
157
+ whether in tort (including negligence), contract, or otherwise,
158
+ unless required by applicable law (such as deliberate and grossly
159
+ negligent acts) or agreed to in writing, shall any Contributor be
160
+ liable to You for damages, including any direct, indirect, special,
161
+ incidental, or consequential damages of any character arising as a
162
+ result of this License or out of the use or inability to use the
163
+ Work (including but not limited to damages for loss of goodwill,
164
+ work stoppage, computer failure or malfunction, or any and all
165
+ other commercial damages or losses), even if such Contributor
166
+ has been advised of the possibility of such damages.
167
+
168
+ 9. Accepting Warranty or Additional Liability. While redistributing
169
+ the Work or Derivative Works thereof, You may choose to offer,
170
+ and charge a fee for, acceptance of support, warranty, indemnity,
171
+ or other liability obligations and/or rights consistent with this
172
+ License. However, in accepting such obligations, You may act only
173
+ on Your own behalf and on Your sole responsibility, not on behalf
174
+ of any other Contributor, and only if You agree to indemnify,
175
+ defend, and hold each Contributor harmless for any liability
176
+ incurred by, or claims asserted against, such Contributor by reason
177
+ of your accepting any such warranty or additional liability.
178
+
179
+ END OF TERMS AND CONDITIONS
180
+
181
+ APPENDIX: How to apply the Apache License to your work.
182
+
183
+ To apply the Apache License to your work, attach the following
184
+ boilerplate notice, with the fields enclosed by brackets "[]"
185
+ replaced with your own identifying information. (Don't include
186
+ the brackets!) The text should be enclosed in the appropriate
187
+ comment syntax for the file format. We also recommend that a
188
+ file or class name and description of purpose be included on the
189
+ same "printed page" as the copyright notice for easier
190
+ identification within third-party archives.
191
+
192
+ Copyright [2021] [Bigscience]
193
+
194
+ Licensed under the Apache License, Version 2.0 (the "License");
195
+ you may not use this file except in compliance with the License.
196
+ You may obtain a copy of the License at
197
+
198
+ http://www.apache.org/licenses/LICENSE-2.0
199
+
200
+ Unless required by applicable law or agreed to in writing, software
201
+ distributed under the License is distributed on an "AS IS" BASIS,
202
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
203
+ See the License for the specific language governing permissions and
204
+ limitations under the License.
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Text Data Filtering
- emoji:
- colorFrom: red
- colorTo: yellow
+ emoji: 👁
+ colorFrom: blue
+ colorTo: pink
  sdk: streamlit
  app_file: app.py
  pinned: false
@@ -10,36 +10,28 @@ pinned: false

  # Configuration

  `title`: _string_
  Display title for the Space

  `emoji`: _string_
  Space emoji (emoji-only character allowed)

  `colorFrom`: _string_
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)

  `colorTo`: _string_
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)

  `sdk`: _string_
- Can be either `gradio`, `streamlit`, or `static`
+ Can be either `gradio` or `streamlit`

  `sdk_version` : _string_
  Only applicable for `streamlit` SDK.
  See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.

  `app_file`: _string_
- Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
  Path is relative to the root of the repository.

- `models`: _List[string]_
- HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
- Will be parsed automatically from your code if not specified here.
-
- `datasets`: _List[string]_
- HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
- Will be parsed automatically from your code if not specified here.
-
  `pinned`: _boolean_
  Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,916 @@
1
+ # Run with: streamlit run app.py
2
+
3
+ import streamlit as st
4
+
5
+ import os
6
+
7
+ from io import StringIO
8
+ import base64
9
+ import json
10
+ import pandas as pd
11
+
12
+ pd.options.mode.chained_assignment = None
13
+
14
+ import numpy as np
15
+
16
+ import matplotlib.pyplot as plt
17
+
18
+ from filtering import LoadParameters, ModifyingDocuments, Filtering
19
+ from languages_id import langs_id
20
+
21
+
22
+ class Visualization_for_lang:
23
+ def __init__(
24
+ self,
25
+ path_data,
26
+ lang,
27
+ num_docs,
28
+ num_docs_for_words,
29
+ max_len_text_display,
30
+ lang_dataset_id,
31
+ path_fasttext_model,
32
+ path_sentencepiece_model,
33
+ path_kenlm_model,
34
+ ):
35
+ self.path_data = path_data
36
+ self.lang = lang
37
+ self.num_docs = num_docs
38
+ self.num_docs_for_words = num_docs_for_words
39
+ self.max_len_text_display = max_len_text_display
40
+
41
+ self.lang_dataset_id = lang_dataset_id
42
+ self.param = LoadParameters.load_parameters(lang_dataset_id)
43
+ self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
44
+ self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
45
+ self.model_lang_id = LoadParameters.load_model_lang_id(
46
+ lang_dataset_id, path_fasttext_model
47
+ )
48
+ self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
49
+ lang_dataset_id, path_sentencepiece_model
50
+ )
51
+ self.sentencepiece_model_tok = (
52
+ self.sentencepiece_model if self.param["tokenization"] else None
53
+ )
54
+ self.kenlm_model = LoadParameters.load_kenlm_model(
55
+ lang_dataset_id, path_kenlm_model
56
+ )
57
+
58
+ def set_title(self):
59
+ st.title(f"Filtering visualization for {self.lang}")
60
+
61
+ def open_data(self):
62
+ with open(self.path_data) as json_file:
63
+ data = json.load(json_file)
64
+
65
+ self.num_docs = min(self.num_docs, len(data))
66
+ self.num_docs_for_words = min(self.num_docs_for_words, len(data))
67
+
68
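+ # Word-level statistics are much heavier than document-level ones, so they are only kept for the first num_docs_for_words documents.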
+ if "words" in data[0]:
69
+ words = [doc["words"] for doc in data[: self.num_docs_for_words]]
70
+ words = [word for doc in words for word in doc]
71
+ self.words = pd.DataFrame(words)
72
+ else:
73
+ self.words = None
74
+
75
+ docs = data[: self.num_docs]
76
+ for doc in docs:
77
+ if not (self.words is None):
78
+ del doc["words"]
79
+ if len(doc["text"]) > self.max_len_text_display:
80
+ doc["text"] = (
81
+ doc["text"][: self.max_len_text_display]
82
+ + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
83
+ )
84
+ self.docs_checkpoint = pd.DataFrame(docs)
85
+ self.docs = self.docs_checkpoint
86
+
87
+ @staticmethod
88
+ def print_discarded_by_cond(cond):
89
+ st.caption(
90
+ f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
91
+ )
92
+
93
+ @staticmethod
94
+ def plot_hist(dataframe, key, num_bins=50):
95
+ checkbox = st.checkbox(
96
+ "Diplay distribution", value=True, key=f"display_distribution_{key[0]}"
97
+ )
98
+ if checkbox:
99
+ fig, ax = plt.subplots()
100
+ val = dataframe[key[0]].values
101
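+ # Drop extreme outliers (more than ~9 median absolute deviations from the median) so that a few huge values do not stretch the histogram.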
+ if np.median(val) != 0:
102
+ val = val[
103
+ abs(val - np.median(val))
104
+ < 9 * np.median(np.absolute(val - np.median(val)))
105
+ ]
106
+ ax.hist(val, bins=num_bins, density=True)
107
+ ax.set_title(" ".join(key[0].split("_")))
108
+ ax.axvline(x=key[1], color="r", linestyle="dashed")
109
+ st.pyplot(fig)
110
+
111
+ @staticmethod
112
+ def display_dataset(dataframe, cond, description, type_of_examples):
113
+ displayed_examples = dataframe.loc[cond]
114
+ st.subheader(
115
+ f"{description}: {len(displayed_examples)} {type_of_examples} ({len(displayed_examples) / len(dataframe.index) * 100:.2f}%)"
116
+ )
117
+ st.markdown(
118
+ "Click on a column to sort by it, place the cursor on the text to display it."
119
+ )
120
+ st.dataframe(displayed_examples)
121
+
122
+ def filtering_of_docs(self):
123
+ def set_sliders():
124
+ columns = list(self.docs)
125
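+ # keys collects the (statistic, cutoff, is_max_cutoff[, repetition_length]) tuples chosen with the sliders; conds maps each statistic to the boolean masks its cutoffs produce.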
+ keys = []
126
+ conds = {}
127
+
128
+ def get_cond(key, cutoff, max_cutoff):
129
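+ # max_cutoff=True means the cutoff is an upper bound (keep documents at or below it); otherwise it is a lower bound.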
+ if max_cutoff:
130
+ return self.docs[key] <= cutoff
131
+ return self.docs[key] >= cutoff
132
+
133
+ if "number_words" in columns:
134
+ with st.sidebar.expander("Number of words"):
135
+ cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
136
+ max_nb_words = int(np.max(self.docs["number_words"])) + 1
137
+ cutoff_min_number_words = st.slider(
138
+ cutoff_def, 0, min(max_nb_words, 500), 0
139
+ )
140
+ new_key = ("number_words", cutoff_min_number_words, False)
141
+ keys.append(new_key)
142
+ Visualization_for_lang.plot_hist(self.docs, new_key)
143
+ cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
144
+ Visualization_for_lang.print_discarded_by_cond(cond_1)
145
+
146
+ cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
147
+ cutoff_max_number_words = st.slider(
148
+ cutoff_def, 0, max_nb_words, max_nb_words
149
+ )
150
+ new_key = ("number_words", cutoff_max_number_words, True)
151
+ keys.append(new_key)
152
+ cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
153
+ Visualization_for_lang.print_discarded_by_cond(cond_2)
154
+
155
+ conds["number_words"] = [cond_1, cond_2]
156
+
157
+ if "character_repetition_ratio" in columns:
158
+ with st.sidebar.expander("Character repetition ratio"):
159
+ val_repetitions_lengths = list(
160
+ self.docs["character_repetition_ratio"].iloc[0].keys()
161
+ )
162
+ default_index = (
163
+ val_repetitions_lengths.index("10")
164
+ if "10" in val_repetitions_lengths
165
+ else 0
166
+ )
167
+ label_selectbox = "Length of repetitions in characters (that will influence the character repetition ratio)."
168
+ repetitions_length = st.selectbox(
169
+ label=label_selectbox,
170
+ options=val_repetitions_lengths,
171
+ index=default_index,
172
+ )
173
+ st.caption(
174
+ "Choosing a higher or lower number does not mean that the filtering "
175
+ "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
176
+ "tends to associate a high character repetition ratio to very long documents (like book chapters), but with "
177
+ "few or no repetitions, simply because their length gives them more diversity, and we do "
178
+ "not want to discard such documents. It is generally better to increase this number, so that false "
179
+ "positives are very short documents (which we want to delete anyway) rather than long ones. However, "
180
+ "a low number can be useful for Chinese, where a character can designate a whole word."
181
+ )
182
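+ # The stored statistic is a dict of ratios keyed by repetition length: restore it from the checkpoint, then keep only the ratio for the selected length.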
+ self.docs["character_repetition_ratio"] = self.docs_checkpoint[
183
+ "character_repetition_ratio"
184
+ ]
185
+ for i in range(len(self.docs["character_repetition_ratio"])):
186
+ self.docs["character_repetition_ratio"].iloc[i] = self.docs[
187
+ "character_repetition_ratio"
188
+ ].iloc[i][repetitions_length]
189
+
190
+ cutoff_def = "If the character repetition ratio of a document is higher than this number, the document is removed."
191
+ cutoff_character_repetition_ratio = st.slider(
192
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
193
+ )
194
+ new_key = (
195
+ "character_repetition_ratio",
196
+ cutoff_character_repetition_ratio,
197
+ True,
198
+ repetitions_length,
199
+ )
200
+ keys.append(new_key)
201
+ Visualization_for_lang.plot_hist(self.docs, new_key)
202
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
203
+ Visualization_for_lang.print_discarded_by_cond(cond)
204
+ conds["character_repetition_ratio"] = [cond]
205
+
206
+ if "word_repetition_ratio" in columns:
207
+ with st.sidebar.expander("Word repetition ratio"):
208
+ val_repetitions_lengths = list(
209
+ self.docs["word_repetition_ratio"].iloc[0].keys()
210
+ )
211
+ default_index = (
212
+ val_repetitions_lengths.index("5")
213
+ if "5" in val_repetitions_lengths
214
+ else 0
215
+ )
216
+ label_selectbox = "Length of repetitions in words (that will influence the word repetition ratio)."
217
+ repetitions_length = st.selectbox(
218
+ label=label_selectbox,
219
+ options=val_repetitions_lengths,
220
+ index=default_index,
221
+ )
222
+ st.caption(
223
+ "Choosing a higher or lower number does not mean that the filtering "
224
+ "is stronger or weaker. Be careful, choosing a low number (like 3) could "
225
+ "tend to associate a high word repetition ratio to very long documents (like book chapters), but with "
226
+ "few or no repetitions, simply because their length gives them more diversity, and we do "
227
+ "not want to discard such documents. It is generally better to increase a bit this number, so that false "
228
+ "positives are very short documents (which we want to delete anyway) rather than long ones."
229
+ )
230
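+ # Same as for characters: restore the dict of ratios from the checkpoint and keep only the ratio for the selected word repetition length.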
+ self.docs["word_repetition_ratio"] = self.docs_checkpoint[
231
+ "word_repetition_ratio"
232
+ ]
233
+ for i in range(len(self.docs["word_repetition_ratio"])):
234
+ self.docs["word_repetition_ratio"].iloc[i] = self.docs[
235
+ "word_repetition_ratio"
236
+ ].iloc[i][repetitions_length]
237
+
238
+ cutoff_def = "If the word repetition ratio of a document is higher than this number, the document is removed."
239
+ cutoff_word_repetition_ratio = st.slider(
240
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
241
+ )
242
+ new_key = (
243
+ "word_repetition_ratio",
244
+ cutoff_word_repetition_ratio,
245
+ True,
246
+ repetitions_length,
247
+ )
248
+ keys.append(new_key)
249
+ Visualization_for_lang.plot_hist(self.docs, new_key)
250
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
251
+ Visualization_for_lang.print_discarded_by_cond(cond)
252
+ conds["word_repetition_ratio"] = [cond]
253
+
254
+ if "special_characters_ratio" in columns:
255
+ with st.sidebar.expander("Special characters ratio"):
256
+ cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
257
+ cutoff_special_characters_ratio = st.slider(
258
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
259
+ )
260
+ new_key = (
261
+ "special_characters_ratio",
262
+ cutoff_special_characters_ratio,
263
+ True,
264
+ )
265
+ keys.append(new_key)
266
+ Visualization_for_lang.plot_hist(self.docs, new_key)
267
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
268
+ Visualization_for_lang.print_discarded_by_cond(cond)
269
+ conds["special_characters_ratio"] = [cond]
270
+
271
+ if "stopwords_ratio" in columns:
272
+ with st.sidebar.expander("Stop words ratio"):
273
+ stopwords_file = st.file_uploader(
274
+ "Upload your own list of stop words (one per line). If there is none, the default one is used."
275
+ )
276
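+ # If a custom stop word list is uploaded, recompute the stop words ratio of every document with it.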
+ if stopwords_file:
277
+ new_stopwords = StringIO(
278
+ stopwords_file.getvalue().decode("utf-8")
279
+ ).read()
280
+ new_stopwords = set(new_stopwords.split("\n"))
281
+ self.docs["stopwords_ratio"] = self.docs_checkpoint[
282
+ "stopwords_ratio"
283
+ ]
284
+ for i in range(len(self.docs["stopwords_ratio"])):
285
+ self.docs["stopwords_ratio"].iloc[
286
+ i
287
+ ] = Filtering.compute_stopwords_ratio(
288
+ self.docs["text"].iloc[i],
289
+ self.sentencepiece_model_tok,
290
+ self.param["strip_characters"],
291
+ self.param["cond_words_augmentation"],
292
+ self.param["words_augmentation_group_sizes"],
293
+ self.param["words_augmentation_join_char"],
294
+ new_stopwords,
295
+ )
296
+ cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
297
+ cutoff_stopwords_ratio = st.slider(
298
+ cutoff_def, 0.0, 1.0, 0.0, step=0.01
299
+ )
300
+ new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
301
+ keys.append(new_key)
302
+ Visualization_for_lang.plot_hist(self.docs, new_key)
303
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
304
+ Visualization_for_lang.print_discarded_by_cond(cond)
305
+ conds["stopwords_ratio"] = [cond]
306
+
307
+ if "flagged_words_ratio" in columns:
308
+ with st.sidebar.expander("Flagged words ratio"):
309
+ flagged_words_file = st.file_uploader(
310
+ "Upload your own list of flagged words (one per line). If there is none, the default one is used."
311
+ )
312
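+ # Same logic as for stop words: recompute the flagged words ratio of every document with the uploaded list.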
+ if flagged_words_file:
313
+ new_flagged_words = StringIO(
314
+ flagged_words_file.getvalue().decode("utf-8")
315
+ ).read()
316
+ new_flagged_words = set(new_flagged_words.split("\n"))
317
+ self.docs["flagged_words_ratio"] = self.docs_checkpoint[
318
+ "flagged_words_ratio"
319
+ ]
320
+ for i in range(len(self.docs["flagged_words_ratio"])):
321
+ self.docs["flagged_words_ratio"].iloc[
322
+ i
323
+ ] = Filtering.compute_flagged_words_ratio(
324
+ self.docs["text"].iloc[i],
325
+ self.sentencepiece_model_tok,
326
+ self.param["strip_characters"],
327
+ self.param["cond_words_augmentation"],
328
+ self.param["words_augmentation_group_sizes"],
329
+ self.param["words_augmentation_join_char"],
330
+ new_flagged_words,
331
+ )
332
+ cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
333
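+ # Round the largest observed ratio up to 3 decimal places so the slider's upper bound still covers it.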
+ max_fwr = np.max(self.docs["flagged_words_ratio"])
334
+ max_fwr = np.ceil(max_fwr * 1000) / 1000
335
+ max_fwr = float(max_fwr)
336
+ cutoff_flagged_words_ratio = st.slider(
337
+ cutoff_def,
338
+ 0.000,
339
+ max_fwr,
340
+ max_fwr,
341
+ step=0.001,
342
+ format="%f",
343
+ )
344
+ new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
345
+ keys.append(new_key)
346
+ Visualization_for_lang.plot_hist(self.docs, new_key)
347
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
348
+ Visualization_for_lang.print_discarded_by_cond(cond)
349
+ conds["flagged_words_ratio"] = [cond]
350
+
351
+ if "lang_id_score" in columns:
352
+ with st.sidebar.expander("Language ID confidence score"):
353
+ cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
354
+ cutoff_lang_id_score = st.slider(
355
+ cutoff_def, 0.0, 1.0, 0.0, step=0.01
356
+ )
357
+ new_key = ("lang_id_score", cutoff_lang_id_score, False)
358
+ keys.append(new_key)
359
+ Visualization_for_lang.plot_hist(self.docs, new_key)
360
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
361
+ Visualization_for_lang.print_discarded_by_cond(cond)
362
+ conds["lang_id_score"] = [cond]
363
+
364
+ if "perplexity_score" in columns:
365
+ with st.sidebar.expander("Perplexity score"):
366
+ cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
367
+ max_pp = int(np.max(self.docs["perplexity_score"])) + 1
368
+ cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp)
369
+ new_key = ("perplexity_score", cutoff_perplexity_score, True)
370
+ keys.append(new_key)
371
+ Visualization_for_lang.plot_hist(self.docs, new_key)
372
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
373
+ Visualization_for_lang.print_discarded_by_cond(cond)
374
+ conds["perplexity_score"] = [cond]
375
+
376
+ return keys, conds
377
+
378
+ with st.expander(
379
+ f"Filtering on documents, for {self.num_docs} {self.lang} documents"
380
+ ):
381
+ st.header(
382
+ f"Filtering on documents, for {self.num_docs} {self.lang} documents"
383
+ )
384
+
385
+ if "labels" in list(self.docs):
386
+ chosen_label = st.selectbox(
387
+ label="Consider only documents that include the following label",
388
+ options=[
389
+ "All",
390
+ "NA: Narrative",
391
+ "IN: Informational Description",
392
+ "OP: Opinion",
393
+ "ID: Interactive Discussion",
394
+ "HI: How-to/Instruction",
395
+ "IP: Informational Persuasion",
396
+ "LY: Lyrical",
397
+ "SP: Spoken",
398
+ ],
399
+ )
400
+ chosen_label = chosen_label.split(":")[0]
401
+ if chosen_label != "All":
402
+ cond_label = list(
403
+ self.docs["labels"].apply(
404
+ lambda x: True if chosen_label in x else False
405
+ )
406
+ )
407
+ self.docs = self.docs[cond_label]
408
+
409
+ if self.docs.empty:
410
+ st.markdown(
411
+ "No document to display, please try to select a different label."
412
+ )
413
+ self.keys = []
414
+ self.parameters = []
415
+
416
+ else:
417
+ st.sidebar.subheader("Parameters of the filtering on documents")
418
+ self.keys, conds = set_sliders()
419
+ self.parameters = self.keys * 1
420
+
421
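+ # A document is retained only if it passes every active filter: flatten the per-filter masks and AND them element-wise.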
+ all_conds = [
422
+ subcond for cond in list(conds.values()) for subcond in cond
423
+ ]
424
+ all_conds = np.all(all_conds, axis=0)
425
+
426
+ Visualization_for_lang.display_dataset(
427
+ self.docs, np.invert(all_conds), "Discarded documents", "docs"
428
+ )
429
+
430
+ # st.subheader("Display discarded documents by filter")
431
+ display_discarded_documents_by_filter = st.checkbox(
432
+ "Display discarded documents by filter"
433
+ )
434
+
435
+ if display_discarded_documents_by_filter:
436
+ columns = list(self.docs)
437
+
438
+ if "number_words" in columns:
439
+ cond_filter = np.invert(np.all(conds["number_words"], axis=0))
440
+ Visualization_for_lang.display_dataset(
441
+ self.docs,
442
+ cond_filter,
443
+ "Discarded documents for the filter on the number of words",
444
+ "docs",
445
+ )
446
+
447
+ if "character_repetition_ratio" in columns:
448
+ cond_filter = np.invert(
449
+ np.all(conds["character_repetition_ratio"], axis=0)
450
+ )
451
+ Visualization_for_lang.display_dataset(
452
+ self.docs,
453
+ cond_filter,
454
+ "Discarded documents for the filter on the character repetition ratio",
455
+ "docs",
456
+ )
457
+
458
+ if "word_repetition_ratio" in columns:
459
+ cond_filter = np.invert(
460
+ np.all(conds["word_repetition_ratio"], axis=0)
461
+ )
462
+ Visualization_for_lang.display_dataset(
463
+ self.docs,
464
+ cond_filter,
465
+ "Discarded documents for the filter on the word repetition ratio",
466
+ "docs",
467
+ )
468
+
469
+ if "special_characters_ratio" in columns:
470
+ cond_filter = np.invert(
471
+ np.all(conds["special_characters_ratio"], axis=0)
472
+ )
473
+ Visualization_for_lang.display_dataset(
474
+ self.docs,
475
+ cond_filter,
476
+ "Discarded documents for the filter on the special characters ratio",
477
+ "docs",
478
+ )
479
+
480
+ if "stopwords_ratio" in columns:
481
+ cond_filter = np.invert(
482
+ np.all(conds["stopwords_ratio"], axis=0)
483
+ )
484
+ Visualization_for_lang.display_dataset(
485
+ self.docs,
486
+ cond_filter,
487
+ "Discarded documents for the filter on the stop words ratio",
488
+ "docs",
489
+ )
490
+
491
+ if "flagged_words_ratio" in columns:
492
+ cond_filter = np.invert(
493
+ np.all(conds["flagged_words_ratio"], axis=0)
494
+ )
495
+ Visualization_for_lang.display_dataset(
496
+ self.docs,
497
+ cond_filter,
498
+ "Discarded documents for the filter on the flagged words ratio",
499
+ "docs",
500
+ )
501
+
502
+ if "lang_id_score" in columns:
503
+ cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
504
+ Visualization_for_lang.display_dataset(
505
+ self.docs,
506
+ cond_filter,
507
+ "Discarded documents for the filter on the language identification confidence score",
508
+ "docs",
509
+ )
510
+
511
+ if "perplexity_score" in columns:
512
+ cond_filter = np.invert(
513
+ np.all(conds["perplexity_score"], axis=0)
514
+ )
515
+ Visualization_for_lang.display_dataset(
516
+ self.docs,
517
+ cond_filter,
518
+ "Discarded documents for the filter on the perplexity score",
519
+ "docs",
520
+ )
521
+
522
+ Visualization_for_lang.display_dataset(
523
+ self.docs, all_conds, "Retained documents", "docs"
524
+ )
525
+
526
+ st.header("Download data")
527
+
528
+ with open(self.path_data) as json_file:
529
+ btn = st.download_button(
530
+ label="Download data as json",
531
+ data=json_file,
532
+ file_name="data.json",
533
+ )
534
+
535
+ def filtering_of_words(self):
536
+ if not (self.words is None):
537
+ columns = list(self.words)
538
+
539
+ st.sidebar.subheader("Parameter of the filtering on words")
540
+
541
+ conds_words = {}
542
+
543
+ if "len_word" in columns:
544
+ with st.sidebar.expander("Length of words"):
545
+ cutoff_def = "If the length of a word is higher than this number, the word is removed."
546
+ max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
547
+ cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
548
+ new_key = ("len_word", cutoff_word, True)
549
+ self.parameters.append(new_key)
550
+ Visualization_for_lang.plot_hist(self.words, new_key)
551
+ cond_len_words = self.words["len_word"] <= cutoff_word
552
+ Visualization_for_lang.print_discarded_by_cond(cond_len_words)
553
+ conds_words["len_word"] = cond_len_words
554
+
555
+ if "incorrect_substrings" in columns:
556
+ with st.sidebar.expander("Words with incorrect substrings"):
557
+ incorrect_substrings = st.checkbox(
558
+ "Remove words with incorrect substrings."
559
+ )
560
+ self.parameters.append(
561
+ ("incorrect_substrings", incorrect_substrings)
562
+ )
563
+
564
+ checkbox = st.checkbox(
565
+ "Diplay distribution",
566
+ value=True,
567
+ key="display_distribution_incorrect_substrings",
568
+ )
569
+ if checkbox:
570
+ incor_sub = np.array(self.words["incorrect_substrings"]) * 1
571
+ with_incor_sub = np.sum(incor_sub)
572
+ without_incor_sub = len(incor_sub) - with_incor_sub
573
+ st.markdown(
574
+ f"Number of words with incorrect substrings: {with_incor_sub}"
575
+ )
576
+ st.markdown(
577
+ f"Number of words without incorrect substrings: {without_incor_sub}"
578
+ )
579
+
580
+ if incorrect_substrings:
581
+ cond_incorrect_substrings = np.invert(
582
+ self.words["incorrect_substrings"]
583
+ )
584
+ else:
585
+ cond_incorrect_substrings = np.array(
586
+ [
587
+ True
588
+ for i in range(len(self.words["incorrect_substrings"]))
589
+ ]
590
+ )
591
+ Visualization_for_lang.print_discarded_by_cond(
592
+ cond_incorrect_substrings
593
+ )
594
+ conds_words["incorrect_substrings"] = cond_incorrect_substrings
595
+
596
+ all_conds_words = np.all(list(conds_words.values()), axis=0)
597
+
598
+ with st.expander(
599
+ f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
600
+ ):
601
+ st.header(
602
+ f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
603
+ )
604
+
605
+ st.markdown(
606
+ f"Since the number of words is way larger than the number of documents, "
607
+ f"we consider in this section words for only {self.num_docs_for_words} documents."
608
+ )
609
+
610
+ Visualization_for_lang.display_dataset(
611
+ self.words, np.invert(all_conds_words), "Discarded words", "words"
612
+ )
613
+
614
+ # st.subheader("Display discarded words by filter")
615
+ display_discarded_words_by_filter = st.checkbox(
616
+ "Display discarded words by filter"
617
+ )
618
+
619
+ if display_discarded_words_by_filter:
620
+
621
+ if "len_word" in columns:
622
+ cond_filter = np.invert(conds_words["len_word"])
623
+ Visualization_for_lang.display_dataset(
624
+ self.words,
625
+ cond_filter,
626
+ "Discarded words for the filter on length",
627
+ "words",
628
+ )
629
+
630
+ if "incorrect_substrings" in columns:
631
+ cond_filter = np.invert(conds_words["incorrect_substrings"])
632
+ Visualization_for_lang.display_dataset(
633
+ self.words,
634
+ cond_filter,
635
+ "Discarded words for the filter on incorrect substrings",
636
+ "words",
637
+ )
638
+
639
+ Visualization_for_lang.display_dataset(
640
+ self.words, all_conds_words, "Retained words", "words"
641
+ )
642
+
643
+ def download_parameters(self):
644
+ st.sidebar.subheader("Download parameters")
645
+ btn = st.sidebar.download_button(
646
+ label="Download current parameters as json",
647
+ data=json.dumps(self.parameters),
648
+ file_name=f"parameters_{self.lang_dataset_id}.json",
649
+ )
650
+
651
+ """
652
+ def plot_zipf_law(self):
653
+ if not (self.words is None):
654
+ st.header("Zipf's Law")
655
+
656
+ display_zipf_law = st.checkbox("Display Zipf's Law")
657
+
658
+ if display_zipf_law:
659
+
660
+ freq_words = {}
661
+ for _, row in self.words.iterrows():
662
+ freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1
663
+ freq_words = np.array(list(freq_words.values()))
664
+ freq_words = -np.sort(-freq_words)
665
+
666
+ fig, ax = plt.subplots()
667
+ ax.loglog(freq_words)
668
+ ax.set_title("Zipf's Law")
669
+ ax.set_xlabel("$i$-th most frequent word")
670
+ ax.set_ylabel("frequency in the documents")
671
+ st.pyplot(fig)
672
+ """
673
+
674
+ def analyse_personal_doc(self):
675
+ with st.expander("Analyse your own document"):
676
+ st.header("Analyse your own document")
677
+
678
+ personal_doc = st.text_area(
679
+ label="Paste here the document you want to analyse",
680
+ value="",
681
+ max_chars=10000,
682
+ )
683
+
684
+ is_discarded = False
685
+
686
+ def is_doc_discarded(key, score):
687
+ if key[2]: # max cutoff
688
+ return score > key[1]
689
+ else:
690
+ return score < key[1]
691
+
692
+ if personal_doc:
693
+
694
+ st.markdown("Statistics of the document:")
695
+
696
+ for key in self.keys:
697
+ if key[0] == "number_words":
698
+ words = ModifyingDocuments.get_words_from_document(
699
+ personal_doc,
700
+ self.sentencepiece_model_tok,
701
+ lower_case=False,
702
+ strip_characters=self.param["strip_characters"],
703
+ )
704
+ if key[2]:
705
+ st.markdown(f"Number of words: {len(words)}")
706
+ if is_doc_discarded(key, len(words)):
707
+ is_discarded = True
708
+
709
+ elif key[0] == "character_repetition_ratio":
710
+ character_repetition_ratio = (
711
+ Filtering.compute_character_repetition_ratio(
712
+ personal_doc, int(key[3])
713
+ )
714
+ )
715
+ character_repetition_ratio = round(
716
+ character_repetition_ratio, 3
717
+ )
718
+ st.markdown(
719
+ f"Character repetition ratio: {character_repetition_ratio}"
720
+ )
721
+ if is_doc_discarded(key, character_repetition_ratio):
722
+ is_discarded = True
723
+
724
+ elif key[0] == "word_repetition_ratio":
725
+ word_repetition_ratio = Filtering.compute_word_repetition_ratio(
726
+ personal_doc,
727
+ self.sentencepiece_model_tok,
728
+ self.param["strip_characters"],
729
+ int(key[3]),
730
+ )
731
+ word_repetition_ratio = round(word_repetition_ratio, 3)
732
+ st.markdown(f"Word repetition ratio: {word_repetition_ratio}")
733
+ if is_doc_discarded(key, word_repetition_ratio):
734
+ is_discarded = True
735
+
736
+ elif key[0] == "special_characters_ratio":
737
+ special_characters_ratio = (
738
+ Filtering.compute_special_characters_ratio(
739
+ personal_doc, self.param["special_characters"]
740
+ )
741
+ )
742
+ special_characters_ratio = round(special_characters_ratio, 3)
743
+ st.markdown(
744
+ f"Special characters ratio: {special_characters_ratio}"
745
+ )
746
+ if is_doc_discarded(key, special_characters_ratio):
747
+ is_discarded = True
748
+
749
+ elif key[0] == "stopwords_ratio":
750
+ stopwords_ratio = Filtering.compute_stopwords_ratio(
751
+ personal_doc,
752
+ self.sentencepiece_model_tok,
753
+ self.param["strip_characters"],
754
+ self.param["cond_words_augmentation"],
755
+ self.param["words_augmentation_group_sizes"],
756
+ self.param["words_augmentation_join_char"],
757
+ self.stopwords,
758
+ )
759
+ stopwords_ratio = round(stopwords_ratio, 3)
760
+ st.markdown(f"Stop words ratio: {stopwords_ratio}")
761
+ if is_doc_discarded(key, stopwords_ratio):
762
+ is_discarded = True
763
+
764
+ elif key[0] == "flagged_words_ratio":
765
+ flagged_words_ratio = Filtering.compute_flagged_words_ratio(
766
+ personal_doc,
767
+ self.sentencepiece_model_tok,
768
+ self.param["strip_characters"],
769
+ self.param["cond_words_augmentation"],
770
+ self.param["words_augmentation_group_sizes"],
771
+ self.param["words_augmentation_join_char"],
772
+ self.flagged_words,
773
+ )
774
+ flagged_words_ratio = round(flagged_words_ratio, 3)
775
+ st.markdown(f"Flagged words ratio: {flagged_words_ratio}")
776
+ if is_doc_discarded(key, flagged_words_ratio):
777
+ is_discarded = True
778
+
779
+ elif key[0] == "lang_id_score":
780
+ (
781
+ lang_pred_dataset_id,
782
+ lang_id_score,
783
+ ) = Filtering.compute_lang_id_pred_score(
784
+ personal_doc, self.model_lang_id
785
+ )
786
+ lang_id_score = round(lang_id_score, 3)
787
+ st.markdown(
788
+ f"Language identification confidence score: {lang_id_score}"
789
+ )
790
+ if is_doc_discarded(key, lang_id_score) or (
791
+ self.lang_dataset_id != lang_pred_dataset_id
792
+ ):
793
+ is_discarded = True
794
+
795
+ elif key[0] == "perplexity_score":
796
+ perplexity_score = Filtering.compute_perplexity_score(
797
+ personal_doc,
798
+ self.sentencepiece_model,
799
+ self.kenlm_model,
800
+ )
801
+ perplexity_score = round(perplexity_score, 3)
802
+ st.markdown(f"Perplexity score: {perplexity_score}")
803
+ if is_doc_discarded(key, perplexity_score):
804
+ is_discarded = True
805
+
806
+ is_discarded = "" if is_discarded else "not "
807
+ st.markdown(
808
+ f"With the current filtering parameters, this document **is {is_discarded}discarded**."
809
+ )
810
+
811
+ def visualization_for_lang(self):
812
+ self.set_title()
813
+ self.open_data()
814
+ self.filtering_of_docs()
815
+ self.filtering_of_words()
816
+ self.download_parameters()
817
+ self.analyse_personal_doc()
818
+
819
+
820
+ class Visualization:
821
+ def __init__(self, path_instructions, param_visu_langs):
822
+ self.path_instructions = path_instructions
823
+ self.param_visu_langs = param_visu_langs
824
+
825
+ def preamble(self):
826
+ def get_binary_file_downloader_html(bin_file, file_label="File"):
827
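+ # Embed the file as a base64 data URI so it can be offered as an inline download link (used here for the explanation PDF).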
+ with open(bin_file, "rb") as f:
828
+ data = f.read()
829
+ bin_str = base64.b64encode(data).decode()
830
+ href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
831
+ return href
832
+
833
+ st.markdown(
834
+ "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this "
835
+ + get_binary_file_downloader_html(
836
+ self.path_instructions,
837
+ "pdf",
838
+ )
839
+ + ".",
840
+ unsafe_allow_html=True,
841
+ )
842
+
843
+ def warning_preamble(self):
844
+ st.markdown(
845
+ "This demo can be a little slow, and only allows you to process up to 5000 documents "
846
+ "for a decent speed. If you want to display up to three times more documents and have "
847
+ "a faster visualization, we invite you to run this "
848
+ "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
849
+ "on your computer."
850
+ )
851
+
852
+ def choose_lang(self):
853
+ options = [
854
+ self.param_visu_langs[lang_dataset_id]["lang"]
855
+ for lang_dataset_id in self.param_visu_langs
856
+ ]
857
+ index = options.index("English") if ("English" in options) else 0
858
+ lang_chosen = st.selectbox(
859
+ label="Select the language for visualization",
860
+ options=options,
861
+ index=index,
862
+ )
863
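+ # Map the selected language name back to its dataset id to retrieve the matching per-language configuration.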
+ if lang_chosen != "None":
864
+ lang_chosen_dataset_id = langs_id.loc[
865
+ langs_id["lang"] == lang_chosen, "dataset_id"
866
+ ].iloc[0]
867
+ visualization_for_lang = Visualization_for_lang(
868
+ path_data=self.param_visu_langs[lang_chosen_dataset_id]["path_data"],
869
+ lang=self.param_visu_langs[lang_chosen_dataset_id]["lang"],
870
+ num_docs=self.param_visu_langs[lang_chosen_dataset_id]["num_docs"],
871
+ num_docs_for_words=self.param_visu_langs[lang_chosen_dataset_id][
872
+ "num_docs_for_words"
873
+ ],
874
+ max_len_text_display=self.param_visu_langs[lang_chosen_dataset_id][
875
+ "max_len_text_display"
876
+ ],
877
+ lang_dataset_id=self.param_visu_langs[lang_chosen_dataset_id][
878
+ "lang_dataset_id"
879
+ ],
880
+ path_fasttext_model=self.param_visu_langs[lang_chosen_dataset_id][
881
+ "path_fasttext_model"
882
+ ],
883
+ path_sentencepiece_model=self.param_visu_langs[lang_chosen_dataset_id][
884
+ "path_sentencepiece_model"
885
+ ],
886
+ path_kenlm_model=self.param_visu_langs[lang_chosen_dataset_id][
887
+ "path_kenlm_model"
888
+ ],
889
+ )
890
+ visualization_for_lang.visualization_for_lang()
891
+
892
+ def visualization(self):
893
+ self.preamble()
894
+ self.warning_preamble()
895
+ self.choose_lang()
896
+
897
+
898
+ path_instructions = "./explanation_filtering_pipeline.pdf"
899
+
900
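+ # One configuration per language: paths to the statistics file, fastText language-ID model, SentencePiece tokenizer and KenLM model, plus display limits.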
+ param_visu_langs = {
901
+ lang_dataset_id: {
902
+ "path_data": f"./{lang_dataset_id}_examples_with_stats.json",
903
+ "lang": langs_id.loc[langs_id["dataset_id"] == lang_dataset_id, "lang"].iloc[0],
904
+ "num_docs": 5000,
905
+ "num_docs_for_words": 500,
906
+ "max_len_text_display": 10000,
907
+ "lang_dataset_id": lang_dataset_id,
908
+ "path_fasttext_model": "./lid.176.bin",
909
+ "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model",
910
+ "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin",
911
+ }
912
+ for lang_dataset_id in ["eu", "ca", "zh", "en", "fr", "id", "es"]
913
+ }
914
+
915
+ visualization = Visualization(path_instructions, param_visu_langs)
916
+ visualization.visualization()
ca.arpa.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ece1e503d4b44409069ea9c5c5125b74792b575143169e08cf9a27248f9a78e
3
+ size 2809368958
ca.sp.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abc6936e2ff5dcdc86962ffaeef48ef66f567d568ef7090d28123ed6618b455c
3
+ size 946977
ca_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4207b45aa366ece2763a06565fcb771b86e433f2a6190248017f97e7534fa4a
3
+ size 103605036
en.arpa.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04923fccbb4e63005c40f01d66112659416de01accd80d16e366a592289ee07a
3
+ size 4444690658
en.sp.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf8147a573770b4e6c0d4df1dcb75453baa88190706dab406be7711b84f059de
3
+ size 931348
en_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dccf03710e9dc7ec68c676175e711be815bc29a50260f5d334156b03fe2e6d1
3
+ size 241408394
es.arpa.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26964ff8185eb105021fc0e9eaa0a1de590c4a12f8aa3fe12112b29d42281cf3
3
+ size 3828418653
es.sp.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aae545566a995d3374fbc8ac1d4e0c7073008da8ae32acfe7f176136a8efcf37
3
+ size 961535
es_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d52760c4c961ebfe419a603a6d837619ca146656f563f5abbd140dec8fbe28e
3
+ size 148378888
eu.arpa.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d04c4d1233b40044e2facc978987ecd4a6d4f84032f2af3f85f7079676fa08b
3
+ size 774011873
eu.sp.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447cbd1714e51e6a7b4dd8ff55b7bd975fdb7f6ba873cb6f8a1fe36b5867dbb6
3
+ size 955869
eu_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10a06ac7ed9b4c444f35fb9a3e3636a22689c198a6bdd4fd358b0eec50aa924d
3
+ size 66358003
explanation_filtering_pipeline.pdf ADDED
Binary file (218 kB).
filtering.py ADDED
@@ -0,0 +1,957 @@
1
+ import re
2
+
3
+ import numpy as np
4
+
5
+ import fasttext
6
+
7
+ import sentencepiece
8
+ import kenlm
9
+
10
+ import pathlib
11
+
12
+ from languages_id import langs_id
13
+ from parameters_filtering import parameters_filtering
14
+ from normalization import normalization
15
+ from stopwords import stopwords
16
+ from flagged_words import flagged_words
17
+
18
+
19
+ class LoadParameters:
20
+ @staticmethod
21
+ def load_parameters(lang_dataset_id):
22
+ if lang_dataset_id in parameters_filtering:
23
+ param = parameters_filtering[lang_dataset_id]
24
+ else:
25
+ param = parameters_filtering["default"]
26
+ return param
27
+
28
+ @staticmethod
29
+ def load_stopwords(lang_dataset_id):
30
+ stopwords_lang_id = langs_id.loc[
31
+ langs_id["dataset_id"] == lang_dataset_id, "stopwords_id"
32
+ ].iloc[0]
33
+ if stopwords_lang_id:
34
+ stopwords_lang = set(stopwords[stopwords_lang_id])
35
+ else:
36
+ stopwords_lang = None
37
+ return stopwords_lang
38
+
39
+ @staticmethod
40
+ def load_flagged_words(lang_dataset_id):
41
+ flagged_words_lang_id = langs_id.loc[
42
+ langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id"
43
+ ].iloc[0]
44
+ if flagged_words_lang_id:
45
+ flagged_words_lang = set(flagged_words[flagged_words_lang_id])
46
+ else:
47
+ flagged_words_lang = None
48
+ return flagged_words_lang
49
+
50
+ @staticmethod
51
+ def load_model_lang_id(lang_dataset_id, path_fasttext_model):
52
+ fasttext_lang_id = langs_id.loc[
53
+ langs_id["dataset_id"] == lang_dataset_id, "fasttext_id"
54
+ ].iloc[0]
55
+ if fasttext_lang_id:
56
+ model_lang_id = fasttext.load_model(path_fasttext_model)
57
+ else:
58
+ model_lang_id = None
59
+ return model_lang_id
60
+
61
+ @staticmethod
62
+ def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model):
63
+ sentencepiece_lang_id = langs_id.loc[
64
+ langs_id["dataset_id"] == lang_dataset_id, "sentencepiece_id"
65
+ ].iloc[0]
66
+ if sentencepiece_lang_id:
67
+ sentencepiece_model = sentencepiece.SentencePieceProcessor()
68
+ sentencepiece_model.load(path_sentencepiece_model)
69
+ else:
70
+ sentencepiece_model = None
71
+ return sentencepiece_model
72
+
73
+ @staticmethod
74
+ def load_kenlm_model(lang_dataset_id, path_kenlm_model):
75
+ kenlm_lang_id = langs_id.loc[
76
+ langs_id["dataset_id"] == lang_dataset_id, "kenlm_id"
77
+ ].iloc[0]
78
+ if kenlm_lang_id:
79
+ kenlm_model = kenlm.Model(path_kenlm_model)
80
+ else:
81
+ kenlm_model = None
82
+ return kenlm_model
83
+
84
+
85
+ class ModifyingDocuments:
86
+ @staticmethod
87
+ def remove_empty_el_from_list(list_):
88
+ return [el for el in list_ if el]
89
+
90
+ @staticmethod
91
+ def remove_non_printing_characters(document, non_printing_characters_re):
92
+ return non_printing_characters_re.sub("", document)
93
+
94
+ @staticmethod
95
+ def uniform_whitespace(
96
+ document,
97
+ whitespace=[
98
+ " ",
99
+ " ",
100
+ " ",
101
+ " ",
102
+ " ",
103
+ " ",
104
+ " ",
105
+ " ",
106
+ " ",
107
+ " ",
108
+ "",
109
+ "„",
110
+ ],
111
+ ):
112
+ """There are different whitespace characters."""
113
+ whitespace = set(whitespace)
114
+ document = "".join(
115
+ [char if char not in whitespace else " " for char in document]
116
+ )
117
+ return document
118
+
119
+ @staticmethod
120
+ def replace_digits_with_zeros(document, digits_re):
121
+ return digits_re.sub("0", document)
122
+
123
+ @staticmethod
124
+ def replace_unicode_punctuation(document, unicode_punctuation):
125
+ return "".join(unicode_punctuation.get(c, c) for c in document)
126
+
127
+ @staticmethod
128
+ def normalization(
129
+ document,
130
+ remove_non_printing_characters,
131
+ strip,
132
+ lower_case,
133
+ uniform_whitespace,
134
+ replace_digits_with_zeros,
135
+ replace_unicode_punctuation,
136
+ non_printing_characters_re=normalization["non_printing_characters_re"],
137
+ digits_re=normalization["digits_re"],
138
+ unicode_punctuation=normalization["unicode_punctuation"],
139
+ ):
140
+ if remove_non_printing_characters:
141
+ document = ModifyingDocuments.remove_non_printing_characters(
142
+ document, non_printing_characters_re
143
+ )
144
+ if strip:
145
+ document = document.strip()
146
+ if not document:
147
+ return document
148
+ if lower_case:
149
+ document = document.lower()
150
+ if uniform_whitespace:
151
+ document = ModifyingDocuments.uniform_whitespace(document)
152
+ if replace_digits_with_zeros:
153
+ document = ModifyingDocuments.replace_digits_with_zeros(document, digits_re)
154
+ if replace_unicode_punctuation:
155
+ document = ModifyingDocuments.replace_unicode_punctuation(
156
+ document, unicode_punctuation
157
+ )
158
+ return document
159
+
160
+ @staticmethod
161
+ def tokenization(document, sentencepiece_model, join_on_whitespace):
162
+ document_tokenized = sentencepiece_model.encode_as_pieces(document)
163
+ if join_on_whitespace:
164
+ document_tokenized = " ".join(document_tokenized)
165
+ return document_tokenized
166
+
167
+ @staticmethod
168
+ def split_on_whitespace(
169
+ document,
170
+ new_line=False,
171
+ tab=False,
172
+ ):
173
+ """This method also removes concatenated spaces."""
174
+ sep = [" "] + new_line * ["\n"] + tab * ["\t"]
175
+ sep = "|".join(sep)
176
+ split_document = re.split(sep, document)
177
+ split_document = ModifyingDocuments.remove_empty_el_from_list(split_document)
178
+ return split_document
179
+
180
+ @staticmethod
181
+ def strip(document, strip_characters):
182
+ """Way faster than document.strip(strip_characters)
183
+ since strip_characters is now a set instead of a str,
184
+ and it contains a lot of elements (all the emojis)."""
185
+ if not document:
186
+ return document
187
+ beg_ind = 0
188
+ end_ind = len(document)
189
+ for i in range(len(document)):
190
+ if document[i] in strip_characters:
191
+ beg_ind += 1
192
+ else:
193
+ break
194
+ for i in range(1, len(document) + 1):
195
+ if document[-i] in strip_characters:
196
+ end_ind -= 1
197
+ else:
198
+ break
199
+ document_stripped = document[beg_ind:end_ind]
200
+ return document_stripped
201
+
202
+ @staticmethod
203
+ def get_words_from_document(
204
+ document, sentencepiece_model_tok, lower_case, strip_characters
205
+ ):
206
+ """Get words from a document. Non reversible since the document
207
+ is split on multiple characters, words are stripped of
208
+ special characters and characters are converted to lower case.
209
+ Useful to compute ratios, like the stopwords ratio."""
210
+ if sentencepiece_model_tok:
211
+ document_normalized = ModifyingDocuments.normalization(
212
+ document=document,
213
+ remove_non_printing_characters=True,
214
+ strip=True,
215
+ lower_case=True,
216
+ uniform_whitespace=True,
217
+ replace_digits_with_zeros=True,
218
+ replace_unicode_punctuation=True,
219
+ )
220
+ words = ModifyingDocuments.tokenization(
221
+ document_normalized, sentencepiece_model_tok, join_on_whitespace=False
222
+ )
223
+ else:
224
+ words = ModifyingDocuments.split_on_whitespace(
225
+ document, new_line=True, tab=True
226
+ )
227
+ if lower_case:
228
+ words = [word.lower() for word in words]
229
+ if strip_characters:
230
+ words = [ModifyingDocuments.strip(word, strip_characters) for word in words]
231
+ words = ModifyingDocuments.remove_empty_el_from_list(words)
232
+ return words
233
+
234
+ @staticmethod
235
+ def words_augmentation(words, group_size, join_char):
236
+ """Augment words, especially for Chinese (without a space between words)
237
+ and Vietnamese (with a space between syllables)."""
238
+ augmentation = [
239
+ join_char.join(words[i : i + group_size])
240
+ for i in range(len(words) - group_size + 1)
241
+ ]
242
+ return augmentation
243
+
244
+ @staticmethod
245
+ def split_on_newline_tab_whitespace(document):
246
+ """First split on "\n", then on "\t", then on " "."""
247
+ sentences = document.split("\n")
248
+ sentences = [sentence.split("\t") for sentence in sentences]
249
+ sentences = [
250
+ [
251
+ ModifyingDocuments.split_on_whitespace(subsentence)
252
+ for subsentence in sentence
253
+ ]
254
+ for sentence in sentences
255
+ ]
256
+ return sentences
257
+
258
+ @staticmethod
259
+ def merge_on_whitespace_tab_newline(sentences):
260
+ """Invert the method split_on_newline_tab_whitespace.
261
+ Removes concatenated separators."""
262
+ sentences = [
263
+ [" ".join(subsentence) for subsentence in sentence if subsentence]
264
+ for sentence in sentences
265
+ ]
266
+ sentences = ["\t".join(sentence) for sentence in sentences if sentence]
267
+ if not sentences:
268
+ return ""
269
+ document = "\n".join(sentences)
270
+ return document
271
+
272
+ @staticmethod
273
+ def should_keep_word_with_incorrect_substrings(
274
+ word, strip_characters, incorrect_word_substrings
275
+ ):
276
+ word = ModifyingDocuments.strip(word, strip_characters)
277
+ should_keep = all(
278
+ [(i_substr not in word) for i_substr in incorrect_word_substrings]
279
+ )
280
+ return should_keep
281
+
282
+ @staticmethod
283
+ def remove_words_with_incorrect_substrings(
284
+ document,
285
+ strip_characters,
286
+ incorrect_word_substrings,
287
+ ):
288
+ sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document)
289
+ sentences = [
290
+ [
291
+ [
292
+ word
293
+ for word in subsentence
294
+ if ModifyingDocuments.should_keep_word_with_incorrect_substrings(
295
+ word, strip_characters, incorrect_word_substrings
296
+ )
297
+ ]
298
+ for subsentence in sentence
299
+ ]
300
+ for sentence in sentences
301
+ ]
302
+ document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences)
303
+ return document
304
+
305
+ @staticmethod
306
+ def should_keep_long_word(word, strip_characters, length_word_max_cutoff):
307
+ """If the word is too long but it contains only one
308
+ special character, it might be a concatenation of one word,
309
+ a punctuation, and another word, with no space between them.
310
+ In this case, we give the word a pass."""
311
+ if len(word) <= length_word_max_cutoff:
312
+ return True
313
+ word = ModifyingDocuments.strip(word, strip_characters)
314
+ if not word: # The word consisted only of strip characters
315
+ return False
316
+ if len(word) <= length_word_max_cutoff:
317
+ return True
318
+ return False
319
+
320
+ @staticmethod
+ def remove_long_words(
321
+ document,
322
+ strip_characters,
323
+ length_word_max_cutoff,
324
+ ):
325
+ sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document)
326
+ sentences = [
327
+ [
328
+ [
329
+ word
330
+ for word in subsentence
331
+ if ModifyingDocuments.should_keep_long_word(
332
+ word,
333
+ strip_characters,
334
+ length_word_max_cutoff,
335
+ )
336
+ ]
337
+ for subsentence in sentence
338
+ ]
339
+ for sentence in sentences
340
+ ]
341
+ document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences)
342
+ return document
343
+
344
+ @staticmethod
345
+ def modifying_documents(
346
+ document,
347
+ cond_uniform_whitespace,
348
+ cond_replace_unicode_punctuation,
349
+ cond_remove_words_with_incorrect_substrings,
350
+ strip_characters,
351
+ incorrect_word_substrings,
352
+ cond_remove_long_words,
353
+ length_word_max_cutoff,
354
+ ):
355
+ document = ModifyingDocuments.normalization(
356
+ document=document,
357
+ remove_non_printing_characters=False,
358
+ strip=True,
359
+ lower_case=False,
360
+ uniform_whitespace=cond_uniform_whitespace,
361
+ replace_digits_with_zeros=False,
362
+ replace_unicode_punctuation=cond_replace_unicode_punctuation,
363
+ )
364
+ if cond_remove_words_with_incorrect_substrings:
365
+ document = ModifyingDocuments.remove_words_with_incorrect_substrings(
366
+ document,
367
+ strip_characters,
368
+ incorrect_word_substrings,
369
+ )
370
+ if cond_remove_long_words:
371
+ document = ModifyingDocuments.remove_long_words(
372
+ document,
373
+ strip_characters,
374
+ length_word_max_cutoff,
375
+ )
376
+ return document
377
+
378
+
379
+ class FunctionDatasetModifyingDocuments:
380
+ def __init__(self, lang_dataset_id):
381
+ self.lang_dataset_id = lang_dataset_id
382
+ self.param = LoadParameters.load_parameters(lang_dataset_id)
383
+
384
+ def __call__(self, example):
385
+ example["text"] = ModifyingDocuments.modifying_documents(
386
+ document=example["text"],
387
+ cond_uniform_whitespace=self.param["cond_uniform_whitespace"],
388
+ cond_replace_unicode_punctuation=self.param[
389
+ "cond_replace_unicode_punctuation"
390
+ ],
391
+ cond_remove_words_with_incorrect_substrings=self.param[
392
+ "cond_remove_words_with_incorrect_substrings"
393
+ ],
394
+ strip_characters=self.param["strip_characters"],
395
+ incorrect_word_substrings=self.param["incorrect_word_substrings"],
396
+ cond_remove_long_words=self.param["cond_remove_long_words"],
397
+ length_word_max_cutoff=self.param["length_word_max_cutoff"],
398
+ )
399
+ return example
400
+
401
+ def __reduce__(self):
402
+ return (self.__class__, (self.lang_dataset_id,))
403
+
404
+
405
+ class Filtering:
406
+ @staticmethod
407
+ def check_number_words(
408
+ document,
409
+ sentencepiece_model_tok,
410
+ strip_characters,
411
+ number_words_min_cutoff,
412
+ number_words_max_cutoff,
413
+ ):
414
+ words = ModifyingDocuments.get_words_from_document(
415
+ document,
416
+ sentencepiece_model_tok,
417
+ lower_case=False,
418
+ strip_characters=strip_characters,
419
+ )
420
+ cond = (len(words) >= number_words_min_cutoff) and (
421
+ len(words) <= number_words_max_cutoff
422
+ )
423
+ return cond
424
+
425
+ @staticmethod
426
+ def compute_character_repetition_ratio(document, character_repetition_length):
427
+ def get_freq_character_ngrams(document, n):
428
+ character_ngrams = [
429
+ document[i : i + n] for i in range(len(document) - n + 1)
430
+ ]
431
+ freq_character_ngrams = {}
432
+ for character_ngram in character_ngrams:
433
+ freq_character_ngrams[character_ngram] = (
434
+ freq_character_ngrams.get(character_ngram, 0) + 1
435
+ )
436
+ return freq_character_ngrams
437
+
438
+ freq_character_ngrams = get_freq_character_ngrams(
439
+ document, character_repetition_length
440
+ )
441
+ if len(freq_character_ngrams) == 0:
442
+ return 0
443
+ freq_character_ngrams = list(freq_character_ngrams.values())
444
+ freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
445
+ val_less_than_one = len([el for el in freq_character_ngrams if el > 1])
446
+ num_rep_character_ngrams = min(
447
+ int(np.sqrt(len(freq_character_ngrams))),
448
+ len(freq_character_ngrams) - val_less_than_one,
449
+ )
450
+ character_repetition_ratio = sum(
451
+ freq_character_ngrams[:num_rep_character_ngrams]
452
+ ) / sum(freq_character_ngrams)
453
+ return character_repetition_ratio
454
+
455
+ @staticmethod
456
+ def check_character_repetition_removal(
457
+ document,
458
+ character_repetition_length,
459
+ character_repetition_max_cutoff,
460
+ ):
461
+ character_repetition_ratio = Filtering.compute_character_repetition_ratio(
462
+ document, character_repetition_length
463
+ )
464
+ cond = character_repetition_ratio <= character_repetition_max_cutoff
465
+ return cond
466
+
467
+ @staticmethod
468
+ def compute_word_repetition_ratio(
469
+ document, sentencepiece_model_tok, strip_characters, word_repetition_length
470
+ ):
471
+ def get_freq_word_ngrams(
472
+ document, sentencepiece_model_tok, strip_characters, n
473
+ ):
474
+ words = ModifyingDocuments.get_words_from_document(
475
+ document,
476
+ sentencepiece_model_tok,
477
+ lower_case=True,
478
+ strip_characters=strip_characters,
479
+ )
480
+ word_ngrams = [
481
+ " ".join(words[i : i + n]) for i in range(len(words) - n + 1)
482
+ ]
483
+ freq_word_ngrams = {}
484
+ for word_ngram in word_ngrams:
485
+ freq_word_ngrams[word_ngram] = freq_word_ngrams.get(word_ngram, 0) + 1
486
+ return freq_word_ngrams
487
+
488
+ freq_word_ngrams = get_freq_word_ngrams(
489
+ document, sentencepiece_model_tok, strip_characters, word_repetition_length
490
+ )
491
+ if len(freq_word_ngrams) == 0:
492
+ return 0
493
+ freq_word_ngrams = list(freq_word_ngrams.values())
494
+ word_repetition_ratio = sum(
495
+ freq for freq in freq_word_ngrams if freq > 1
496
+ ) / sum(freq_word_ngrams)
497
+ return word_repetition_ratio
498
+
499
+ @staticmethod
500
+ def check_word_repetition_removal(
501
+ document,
502
+ sentencepiece_model_tok,
503
+ strip_characters,
504
+ word_repetition_length,
505
+ word_repetition_max_cutoff,
506
+ ):
507
+ word_repetition_ratio = Filtering.compute_word_repetition_ratio(
508
+ document, sentencepiece_model_tok, strip_characters, word_repetition_length
509
+ )
510
+ cond = word_repetition_ratio <= word_repetition_max_cutoff
511
+ return cond
512
+
513
+ @staticmethod
514
+ def compute_special_characters_ratio(document, special_characters):
515
+ if len(document) == 0:
516
+ return 0
517
+ special_characters_ratio = len(
518
+ [char for char in document if char in special_characters]
519
+ ) / len(document)
520
+ return special_characters_ratio
521
+
522
+ @staticmethod
523
+ def check_special_characters(
524
+ document,
525
+ special_characters,
526
+ special_characters_max_cutoff,
527
+ ):
528
+ special_characters_ratio = Filtering.compute_special_characters_ratio(
529
+ document, special_characters
530
+ )
531
+ cond = special_characters_ratio <= special_characters_max_cutoff
532
+ return cond
533
+
534
+ @staticmethod
535
+ def compute_stopwords_ratio(
536
+ document,
537
+ sentencepiece_model_tok,
538
+ strip_characters,
539
+ cond_words_augmentation,
540
+ words_augmentation_group_sizes,
541
+ words_augmentation_join_char,
542
+ stopwords,
543
+ ):
544
+ words = ModifyingDocuments.get_words_from_document(
545
+ document,
546
+ sentencepiece_model_tok,
547
+ lower_case=True,
548
+ strip_characters=strip_characters,
549
+ )
550
+ if not words:
551
+ return 0
552
+ augmentation = []
553
+ if cond_words_augmentation:
554
+ augmentation = [
555
+ ModifyingDocuments.words_augmentation(
556
+ words, group_size, words_augmentation_join_char
557
+ )
558
+ for group_size in words_augmentation_group_sizes
559
+ ]
560
+ augmentation = [word for augm in augmentation for word in augm]
561
+ stopwords_ratio = len(
562
+ [word for word in words + augmentation if word in stopwords]
563
+ ) / len(words)
564
+ if stopwords_ratio > 1.0:
565
+ stopwords_ratio = 1.0
566
+ return stopwords_ratio
567
+
568
+ @staticmethod
569
+ def check_stopwords(
570
+ document,
571
+ sentencepiece_model_tok,
572
+ strip_characters,
573
+ cond_words_augmentation,
574
+ words_augmentation_group_sizes,
575
+ words_augmentation_join_char,
576
+ stopwords,
577
+ stopwords_min_cutoff,
578
+ ):
579
+ cond = True
580
+ if stopwords:
581
+ stopwords_ratio = Filtering.compute_stopwords_ratio(
582
+ document,
583
+ sentencepiece_model_tok,
584
+ strip_characters,
585
+ cond_words_augmentation,
586
+ words_augmentation_group_sizes,
587
+ words_augmentation_join_char,
588
+ stopwords,
589
+ )
590
+ cond = stopwords_ratio >= stopwords_min_cutoff
591
+ return cond
592
+
593
+ @staticmethod
594
+ def compute_flagged_words_ratio(
595
+ document,
596
+ sentencepiece_model_tok,
597
+ strip_characters,
598
+ cond_words_augmentation,
599
+ words_augmentation_group_sizes,
600
+ words_augmentation_join_char,
601
+ flagged_words,
602
+ ):
603
+ words = ModifyingDocuments.get_words_from_document(
604
+ document,
605
+ sentencepiece_model_tok,
606
+ lower_case=True,
607
+ strip_characters=strip_characters,
608
+ )
609
+ if not words:
610
+ return 0
611
+ augmentation = []
612
+ if cond_words_augmentation:
613
+ augmentation = [
614
+ ModifyingDocuments.words_augmentation(
615
+ words, group_size, words_augmentation_join_char
616
+ )
617
+ for group_size in words_augmentation_group_sizes
618
+ ]
619
+ augmentation = [word for augm in augmentation for word in augm]
620
+ flagged_words_ratio = len(
621
+ [word for word in words + augmentation if word in flagged_words]
622
+ ) / len(words)
623
+ if flagged_words_ratio > 1.0:
624
+ flagged_words_ratio = 1.0
625
+ return flagged_words_ratio
626
+
627
+ @staticmethod
628
+ def check_flagged_words(
629
+ document,
630
+ sentencepiece_model_tok,
631
+ strip_characters,
632
+ cond_words_augmentation,
633
+ words_augmentation_group_sizes,
634
+ words_augmentation_join_char,
635
+ flagged_words,
636
+ flagged_words_max_cutoff,
637
+ ):
638
+ cond = True
639
+ if flagged_words:
640
+ flagged_words_ratio = Filtering.compute_flagged_words_ratio(
641
+ document,
642
+ sentencepiece_model_tok,
643
+ strip_characters,
644
+ cond_words_augmentation,
645
+ words_augmentation_group_sizes,
646
+ words_augmentation_join_char,
647
+ flagged_words,
648
+ )
649
+ cond = flagged_words_ratio <= flagged_words_max_cutoff
650
+ return cond
651
+
652
+ @staticmethod
653
+ def compute_lang_id_pred_score(document, model_lang_id):
654
+ document = document.lower().replace("\n", " ")
655
+ pred = model_lang_id.predict(document)
656
+ lang_pred_fasttext_id = pred[0][0].replace("__label__", "")
657
+ score_pred = pred[1][0]
658
+ lang_pred_dataset_id = langs_id.loc[
659
+ langs_id["fasttext_id"] == lang_pred_fasttext_id, "dataset_id"
660
+ ]
661
+ if len(lang_pred_dataset_id) > 0:
662
+ lang_pred_dataset_id = lang_pred_dataset_id.iloc[0]
663
+ else:
664
+ lang_pred_dataset_id = "unknown"
665
+ return lang_pred_dataset_id, score_pred
666
+
667
+ @staticmethod
668
+ def check_lang_id(
669
+ document,
670
+ lang_dataset_id,
671
+ model_lang_id,
672
+ lang_id_min_cutoff,
673
+ ):
674
+ cond = True
675
+ if model_lang_id:
676
+ lang_pred_dataset_id, score_pred = Filtering.compute_lang_id_pred_score(
677
+ document, model_lang_id
678
+ )
679
+ cond = (lang_pred_dataset_id == lang_dataset_id) and (
680
+ score_pred >= lang_id_min_cutoff
681
+ )
682
+ return cond
683
+
684
+ @staticmethod
685
+ def compute_perplexity_score(document, sentencepiece_model, kenlm_model):
686
+ document = ModifyingDocuments.normalization(
687
+ document=document,
688
+ remove_non_printing_characters=True,
689
+ strip=True,
690
+ lower_case=False,
691
+ uniform_whitespace=True,
692
+ replace_digits_with_zeros=True,
693
+ replace_unicode_punctuation=True,
694
+ )
695
+ document = ModifyingDocuments.tokenization(
696
+ document, sentencepiece_model, join_on_whitespace=True
697
+ )
698
+ doc_log_score, doc_length = 0, 0
699
+ for line in document.split("\n"):
700
+ log_score = kenlm_model.score(line)
701
+ length = len(line.split()) + 1
702
+ doc_log_score += log_score
703
+ doc_length += length
704
+ pp_score = 10.0 ** (-doc_log_score / doc_length)
705
+ pp_score = round(pp_score, 1)
706
+ return pp_score
707
+
708
+ @staticmethod
709
+ def check_perplexity(
710
+ document,
711
+ sentencepiece_model,
712
+ kenlm_model,
713
+ perplexity_max_cutoff,
714
+ ):
715
+ cond = True
716
+ if kenlm_model:
717
+ score = Filtering.compute_perplexity_score(
718
+ document, sentencepiece_model, kenlm_model
719
+ )
720
+ cond = score <= perplexity_max_cutoff
721
+ return cond
722
+
723
+ @staticmethod
724
+ def filtering(
725
+ document,
726
+ cond_check_number_words,
727
+ sentencepiece_model_tok,
728
+ strip_characters,
729
+ number_words_min_cutoff,
730
+ number_words_max_cutoff,
731
+ cond_check_character_repetition_removal,
732
+ character_repetition_length,
733
+ character_repetition_max_cutoff,
734
+ cond_check_word_repetition_removal,
735
+ word_repetition_length,
736
+ word_repetition_max_cutoff,
737
+ cond_check_special_characters,
738
+ special_characters,
739
+ special_characters_max_cutoff,
740
+ cond_words_augmentation,
741
+ words_augmentation_group_sizes,
742
+ words_augmentation_join_char,
743
+ cond_check_stopwords,
744
+ stopwords,
745
+ stopwords_min_cutoff,
746
+ cond_check_flagged_words,
747
+ flagged_words,
748
+ flagged_words_max_cutoff,
749
+ cond_check_lang_id,
750
+ lang_dataset_id,
751
+ model_lang_id,
752
+ lang_id_min_cutoff,
753
+ cond_check_perplexity,
754
+ sentencepiece_model,
755
+ kenlm_model,
756
+ perplexity_max_cutoff,
757
+ ):
758
+ if cond_check_number_words:
759
+ if not Filtering.check_number_words(
760
+ document,
761
+ sentencepiece_model_tok,
762
+ strip_characters,
763
+ number_words_min_cutoff,
764
+ number_words_max_cutoff,
765
+ ):
766
+ return False
767
+ if cond_check_character_repetition_removal:
768
+ if not Filtering.check_character_repetition_removal(
769
+ document,
770
+ character_repetition_length,
771
+ character_repetition_max_cutoff,
772
+ ):
773
+ return False
774
+ if cond_check_word_repetition_removal:
775
+ if not Filtering.check_word_repetition_removal(
776
+ document,
777
+ sentencepiece_model_tok,
778
+ strip_characters,
779
+ word_repetition_length,
780
+ word_repetition_max_cutoff,
781
+ ):
782
+ return False
783
+ if cond_check_special_characters:
784
+ if not Filtering.check_special_characters(
785
+ document,
786
+ special_characters,
787
+ special_characters_max_cutoff,
788
+ ):
789
+ return False
790
+ if cond_check_stopwords:
791
+ if not Filtering.check_stopwords(
792
+ document,
793
+ sentencepiece_model_tok,
794
+ strip_characters,
795
+ cond_words_augmentation,
796
+ words_augmentation_group_sizes,
797
+ words_augmentation_join_char,
798
+ stopwords,
799
+ stopwords_min_cutoff,
800
+ ):
801
+ return False
802
+ if cond_check_flagged_words:
803
+ if not Filtering.check_flagged_words(
804
+ document,
805
+ sentencepiece_model_tok,
806
+ strip_characters,
807
+ cond_words_augmentation,
808
+ words_augmentation_group_sizes,
809
+ words_augmentation_join_char,
810
+ flagged_words,
811
+ flagged_words_max_cutoff,
812
+ ):
813
+ return False
814
+ if cond_check_lang_id:
815
+ if not Filtering.check_lang_id(
816
+ document,
817
+ lang_dataset_id,
818
+ model_lang_id,
819
+ lang_id_min_cutoff,
820
+ ):
821
+ return False
822
+ if cond_check_perplexity:
823
+ if not Filtering.check_perplexity(
824
+ document,
825
+ sentencepiece_model,
826
+ kenlm_model,
827
+ perplexity_max_cutoff,
828
+ ):
829
+ return False
830
+ return True
831
+
832
+
833
+ class FunctionDatasetFiltering:
834
+ def __init__(
835
+ self,
836
+ lang_dataset_id,
837
+ path_fasttext_model,
838
+ path_sentencepiece_model,
839
+ path_kenlm_model,
840
+ ):
841
+ self.lang_dataset_id = lang_dataset_id
842
+ self.path_fasttext_model = path_fasttext_model
843
+ self.path_sentencepiece_model = path_sentencepiece_model
844
+ self.path_kenlm_model = path_kenlm_model
845
+
846
+ self.param = LoadParameters.load_parameters(lang_dataset_id)
847
+ self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
848
+ self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
849
+ self.model_lang_id = LoadParameters.load_model_lang_id(
850
+ lang_dataset_id, path_fasttext_model
851
+ )
852
+ self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
853
+ lang_dataset_id, path_sentencepiece_model
854
+ )
855
+ self.sentencepiece_model_tok = (
856
+ self.sentencepiece_model if self.param["tokenization"] else None
857
+ )
858
+ self.kenlm_model = LoadParameters.load_kenlm_model(
859
+ lang_dataset_id, path_kenlm_model
860
+ )
861
+
862
+ def __call__(self, example):
863
+ keep_example = Filtering.filtering(
864
+ document=example["text"],
865
+ cond_check_number_words=self.param["cond_check_number_words"],
866
+ sentencepiece_model_tok=self.sentencepiece_model_tok,
867
+ strip_characters=self.param["strip_characters"],
868
+ number_words_min_cutoff=self.param["number_words_min_cutoff"],
869
+ number_words_max_cutoff=self.param["number_words_max_cutoff"],
870
+ cond_check_character_repetition_removal=self.param[
871
+ "cond_check_character_repetition_removal"
872
+ ],
873
+ character_repetition_length=self.param["character_repetition_length"],
874
+ character_repetition_max_cutoff=self.param[
875
+ "character_repetition_max_cutoff"
876
+ ],
877
+ cond_check_word_repetition_removal=self.param[
878
+ "cond_check_word_repetition_removal"
879
+ ],
880
+ word_repetition_length=self.param["word_repetition_length"],
881
+ word_repetition_max_cutoff=self.param["word_repetition_max_cutoff"],
882
+ cond_check_special_characters=self.param["cond_check_special_characters"],
883
+ special_characters=self.param["special_characters"],
884
+ special_characters_max_cutoff=self.param["special_characters_max_cutoff"],
885
+ cond_words_augmentation=self.param["cond_words_augmentation"],
886
+ words_augmentation_group_sizes=self.param["words_augmentation_group_sizes"],
887
+ words_augmentation_join_char=self.param["words_augmentation_join_char"],
888
+ cond_check_stopwords=self.param["cond_check_stopwords"],
889
+ stopwords=self.stopwords,
890
+ stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
891
+ cond_check_flagged_words=self.param["cond_check_flagged_words"],
892
+ flagged_words=self.flagged_words,
893
+ flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"],
894
+ cond_check_lang_id=self.param["cond_check_lang_id"],
895
+ lang_dataset_id=self.lang_dataset_id,
896
+ model_lang_id=self.model_lang_id,
897
+ lang_id_min_cutoff=self.param["lang_id_min_cutoff"],
898
+ cond_check_perplexity=self.param["cond_check_perplexity"],
899
+ sentencepiece_model=self.sentencepiece_model,
900
+ kenlm_model=self.kenlm_model,
901
+ perplexity_max_cutoff=self.param["perplexity_max_cutoff"],
902
+ )
903
+ return keep_example
904
+
905
+ def __reduce__(self):
906
+ return (
907
+ self.__class__,
908
+ (
909
+ self.lang_dataset_id,
910
+ self.path_fasttext_model,
911
+ self.path_sentencepiece_model,
912
+ self.path_kenlm_model,
913
+ ),
914
+ )
915
+
916
+
917
+ class DatasetFiltering:
918
+ def __init__(
919
+ self,
920
+ dataset,
921
+ lang_dataset_id,
922
+ path_fasttext_model,
923
+ path_sentencepiece_model,
924
+ path_kenlm_model,
925
+ num_proc,
926
+ path_dir_save_dataset,
927
+ ):
928
+ self.ds = dataset
929
+ self.lang_dataset_id = lang_dataset_id
930
+ self.path_fasttext_model = path_fasttext_model
931
+ self.path_sentencepiece_model = path_sentencepiece_model
932
+ self.path_kenlm_model = path_kenlm_model
933
+ self.num_proc = num_proc
934
+ self.path_dir_save_dataset = path_dir_save_dataset
935
+
936
+ def modifying_documents(self):
937
+ func_dataset_modifying_documents = FunctionDatasetModifyingDocuments(
938
+ self.lang_dataset_id
939
+ )
940
+ self.ds = self.ds.map(func_dataset_modifying_documents, num_proc=self.num_proc)
941
+
942
+ def filtering(self):
943
+ func_dataset_filtering = FunctionDatasetFiltering(
944
+ self.lang_dataset_id,
945
+ self.path_fasttext_model,
946
+ self.path_sentencepiece_model,
947
+ self.path_kenlm_model,
948
+ )
949
+ self.ds = self.ds.filter(func_dataset_filtering, num_proc=self.num_proc)
950
+
951
+ def save_dataset(self):
952
+ pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
953
+ path_dir_save_dataset = pathlib.PurePath(
954
+ self.path_dir_save_dataset, self.lang_dataset_id
955
+ )
956
+ pathlib.Path(path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
957
+ self.ds.save_to_disk(path_dir_save_dataset)
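
A minimal usage sketch of the pipeline above, assuming the module is importable as "filtering" (an assumed name, since the file header is not repeated here). The toy dataset and output directory are only illustrative; the model paths are the files added in this commit.

    from datasets import Dataset
    from filtering import DatasetFiltering  # assumed module name for the file above

    # Toy in-memory dataset with the "text" column the pipeline expects.
    ds = Dataset.from_dict(
        {"text": ["Ceci est un exemple de document.", "Un autre document."]}
    )

    pipeline = DatasetFiltering(
        dataset=ds,
        lang_dataset_id="fr",
        path_fasttext_model="lid.176.bin",
        path_sentencepiece_model="fr.sp.model",
        path_kenlm_model="fr.arpa.bin",
        num_proc=1,
        path_dir_save_dataset="./filtered_datasets",  # illustrative output directory
    )
    pipeline.modifying_documents()  # document-level cleaning (ModifyingDocuments)
    pipeline.filtering()            # per-document keep/drop decision (Filtering.filtering)
    pipeline.save_dataset()         # saved under ./filtered_datasets/fr
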
flagged_words.py ADDED
@@ -0,0 +1,1055 @@
1
+ # Merge
2
+ # https://github.com/zacanger/profane-words
3
+ # and
4
+ # https://github.com/thisandagain/washyourmouthoutwithsoap/blob/develop/data/build.json
5
+ # and
6
+ # https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
7
+
8
+
9
+ english_flagged_words = [
10
+ "anal",
11
+ "bareback",
12
+ "bbw",
13
+ "bdsm",
14
+ "blowjob",
15
+ "blowjobs",
16
+ "brazzers",
17
+ "bukkake",
18
+ "camgirl",
19
+ "camwhore",
20
+ "cocksucking",
21
+ "cougar",
22
+ "creampie",
23
+ "cuckold",
24
+ "cum",
25
+ "cumming",
26
+ "cums",
27
+ "cumshot",
28
+ "cumshots",
29
+ "cumslut",
30
+ "cunnilingus",
31
+ "deepthroat",
32
+ "deepthroating",
33
+ "dildo",
34
+ "dildos",
35
+ "dogging",
36
+ "doggystyle",
37
+ "dominatrix",
38
+ "erotic",
39
+ "fellatio",
40
+ "femdom",
41
+ "fingering",
42
+ "fisting",
43
+ "footjob",
44
+ "gangbang",
45
+ "handjob",
46
+ "hentai",
47
+ "horney",
48
+ "horniest",
49
+ "horny",
50
+ "jism",
51
+ "jizz",
52
+ "masterbating",
53
+ "masturbate",
54
+ "masturbating",
55
+ "masturbation",
56
+ "milf",
57
+ "orgies",
58
+ "orgy",
59
+ "pegging",
60
+ "porn",
61
+ "pornhub",
62
+ "porno",
63
+ "pornos",
64
+ "pornstar",
65
+ "pornstars",
66
+ "redtube",
67
+ "rimming",
68
+ "slutty",
69
+ "squirting",
70
+ "strapon",
71
+ "threesome",
72
+ "vibrator",
73
+ "xhamster",
74
+ "xnxx",
75
+ "xvideos",
76
+ "xxx",
77
+ "youporn",
78
+ ]
79
+
80
+
81
+ flagged_words = {
82
+ "ar": english_flagged_words
83
+ + [
84
+ "إباحي",
85
+ "احتلام",
86
+ "است",
87
+ "استمناء",
88
+ "اغتصاب",
89
+ "أورغازم",
90
+ "إيروتيك",
91
+ "أير",
92
+ "بز",
93
+ "بزاز",
94
+ "بظر",
95
+ "بورن",
96
+ "بيضان",
97
+ "مص",
98
+ "ثدي",
99
+ "جماع",
100
+ "جنس",
101
+ "حلمة",
102
+ "خلاعة",
103
+ "خنثي",
104
+ "خول",
105
+ "دعارة",
106
+ "زب",
107
+ "سحاق",
108
+ "سحاقية",
109
+ "سكس",
110
+ "سيكس",
111
+ "شاذ",
112
+ "شبق",
113
+ "شرج",
114
+ "شرموطة",
115
+ "شهواني",
116
+ "شهوة",
117
+ "طيز",
118
+ "عادة السرية",
119
+ "عاهرة",
120
+ "عرص",
121
+ "فاسقة",
122
+ "فرج",
123
+ "قحبة",
124
+ "قضيب",
125
+ "كس",
126
+ "لحس",
127
+ "لعق",
128
+ "لواط",
129
+ "لوطي",
130
+ "مؤخرة",
131
+ "متناك",
132
+ "متناكة",
133
+ "مومس",
134
+ "مثير",
135
+ "مص",
136
+ "مضاجعة",
137
+ "مفلقسة",
138
+ "مني",
139
+ "مهتاج",
140
+ "نشوة",
141
+ "نكاح",
142
+ "نيك",
143
+ ],
144
+ "ca": english_flagged_words
145
+ + [
146
+ "cagarro",
147
+ "cagarros",
148
+ "cipote",
149
+ "cipotes",
150
+ "collons",
151
+ "colló",
152
+ "consolador",
153
+ "consoladors",
154
+ "cony",
155
+ "conys",
156
+ "corre's",
157
+ "corre't",
158
+ "corregut",
159
+ "cunillingus",
160
+ "córrer-se",
161
+ "escorreguda",
162
+ "escorregudes",
163
+ "escorregut",
164
+ "escrot",
165
+ "escrots",
166
+ "escórre's",
167
+ "escórre't",
168
+ "escórrer-se",
169
+ "mamada",
170
+ "mamadera",
171
+ "mamaderes",
172
+ "mamades",
173
+ "masturba",
174
+ "masturbacions",
175
+ "masturbació",
176
+ "masturbant",
177
+ "masturbar",
178
+ "masturbar-se",
179
+ "masturbat",
180
+ "masturbats",
181
+ "masturbes",
182
+ "orgasme",
183
+ "orgasmes",
184
+ "ou",
185
+ "ous",
186
+ "palla",
187
+ "palles",
188
+ "pornografia",
189
+ "semen",
190
+ "semens",
191
+ "verga",
192
+ "vergues",
193
+ "xxx",
194
+ ],
195
+ "en": english_flagged_words,
196
+ "es": english_flagged_words
197
+ + [
198
+ "chupar el coño",
199
+ "chupar la concha",
200
+ "chupar la polla",
201
+ "chupar la verga",
202
+ "comer el coño",
203
+ "comer la concha",
204
+ "comer la polla",
205
+ "comer la verga",
206
+ "coprofagía",
207
+ "correrse",
208
+ "cunillingus",
209
+ "fagging",
210
+ "felación",
211
+ "felching",
212
+ "follada",
213
+ "follador de culo",
214
+ "folladores",
215
+ "fudge packer",
216
+ "hacer una paja",
217
+ "hacerse una paja",
218
+ "hore",
219
+ "kock",
220
+ "macizorra",
221
+ "madre folladora",
222
+ "mamada",
223
+ "perro follador",
224
+ "pisser",
225
+ "pornografía",
226
+ "sado",
227
+ "sadomasoquismo",
228
+ "sadomasoquista",
229
+ "sexo anal",
230
+ "skank",
231
+ "smegma",
232
+ "x clasificado",
233
+ ],
234
+ "eu": english_flagged_words + [],
235
+ "fr": english_flagged_words
236
+ + [
237
+ "baiseurs",
238
+ "baiseur",
239
+ "baiseuse",
240
+ "baiseuses",
241
+ "branlette",
242
+ "branlettes",
243
+ "branleuse",
244
+ "branleuses",
245
+ "cunillingus",
246
+ "cunilingus",
247
+ "enculée",
248
+ "enculées",
249
+ "enculation",
250
+ "enculations",
251
+ "enculement",
252
+ "enculements",
253
+ "fellation",
254
+ "fellations",
255
+ "porno",
256
+ "pornos",
257
+ "pornographie",
258
+ "pornographique",
259
+ "pornographiques",
260
+ "salope",
261
+ "salopes",
262
+ "suceuse",
263
+ "suceuses",
264
+ "xxx",
265
+ ],
266
+ "ha": english_flagged_words
267
+ + [
268
+ "bf",
269
+ "bura",
270
+ "burar",
271
+ "cin gindi",
272
+ "duri",
273
+ "durin",
274
+ "gindi",
275
+ "gindin",
276
+ "guntsu",
277
+ "guntsun",
278
+ "karuwa",
279
+ "karuwai",
280
+ "karuwar",
281
+ "maniyyi",
282
+ "maɗigo",
283
+ "maɗugo",
284
+ "nonuwa",
285
+ "shan gindi",
286
+ "tsuliya",
287
+ "tsuliyariskanci",
288
+ "ɗuwaiwai",
289
+ "ɗuwaiwan",
290
+ "ɗuwawu",
291
+ "ɗuwawun",
292
+ ],
293
+ "hi": english_flagged_words
294
+ + [
295
+ "अंडकोश की थैली",
296
+ "एक्स रेटेड",
297
+ "ओगाज़्म",
298
+ "कामोद्दीपक चित्र",
299
+ "कालीन का चूरा",
300
+ "कून",
301
+ "कॉक",
302
+ "गेंद का थैला",
303
+ "चाकलेट का रंग",
304
+ "चूची",
305
+ "चूतड़",
306
+ "झटका बंद",
307
+ "ठगना पैकर",
308
+ "डिल्डो",
309
+ "नितंब",
310
+ "पिछाड़ी",
311
+ "पीड़न कामुक",
312
+ "पॉर्न",
313
+ "फटना",
314
+ "फूहड़",
315
+ "बट",
316
+ "बहुत मदहोश",
317
+ "बेल अंत",
318
+ "भगवान-शापित",
319
+ "भगशेफ",
320
+ "माँ कमीने",
321
+ "मुखमैथुन",
322
+ "मुर्गा चूसने वाला",
323
+ "रक्तरंजित",
324
+ "लेबिया",
325
+ "वहशी",
326
+ "वहशीता",
327
+ "वैंग",
328
+ "शिश्नमल",
329
+ "संभोग सुख",
330
+ "सह शॉट",
331
+ "सींग का बना हुआ",
332
+ "होर",
333
+ "घपा घप",
334
+ "चुदाई",
335
+ "चुदक्कड़",
336
+ ],
337
+ "id": english_flagged_words
338
+ + [
339
+ "bokep",
340
+ "coli",
341
+ "colmek",
342
+ "grepe",
343
+ "horni",
344
+ "janda",
345
+ "jembut",
346
+ "jilat memek",
347
+ "jilmek",
348
+ "kontol",
349
+ "masturbasi",
350
+ "memek",
351
+ "ngentot",
352
+ "ngewe",
353
+ "peju",
354
+ "pepek",
355
+ "pornografi",
356
+ "sange",
357
+ "sepong",
358
+ "tusbol",
359
+ ],
360
+ "kn": english_flagged_words
361
+ + [
362
+ "ಗರ್ಭಪಾತ",
363
+ "ಗುದ",
364
+ "ಗುದದ್ವಾರ",
365
+ "ಕತ್ತೆ",
366
+ "ಆಶ್-ಫಕರ್",
367
+ "ಅಸ್ಹೋಲ್",
368
+ "ಅಸೋಲೆಸ್",
369
+ "ಬಾಲ್ಬಾಗ್",
370
+ "ಚೆಂಡುಗಳು",
371
+ "ಬಾಸ್ಟರ್ಡ್",
372
+ "ಬೆಲೆಂಡ್",
373
+ "ಮೃದ್ವಂಗಿ",
374
+ "ಪ್ರಾಣಿಜನ್ಯತೆ",
375
+ "ಬಿಚ್",
376
+ "ಬಿಟ್ಚಿಸ್",
377
+ "ಬೆಚಿಂಗ್",
378
+ "ರಕ್ತಸಿಕ್ತ",
379
+ "ಬ್ಲೋಜಾಬ್",
380
+ "ಬೊಲ್ಲೊಕ್",
381
+ "ಕುರುಚಲು ಗಿಡ",
382
+ "ಬೂಬಿಗಳು",
383
+ "ಸ್ತನಗಳನ್ನು",
384
+ "ಬುಕೆಟಾ",
385
+ "ತಿಕ",
386
+ "ಬಟ್",
387
+ "ಕಾರ್ಪೆಟ್ ಮಂಚರ್",
388
+ "ಚಿಂಕ್",
389
+ "ಸಿಪಾ",
390
+ "ಚಂದ್ರನಾಡಿ",
391
+ "ಕೋಳಿ",
392
+ "ಕೋಳಿ ಸಕ್ಕರ್",
393
+ "ಕಾಕ್ಸ್",
394
+ "ಕೂನ್",
395
+ "ಅಮೇಧ್ಯ",
396
+ "ಕಮ್",
397
+ "ಕಮ್ಶಾಟ್",
398
+ "ಕುನಿಲ್ಲಸ್",
399
+ "ಕಂಟ್",
400
+ "ಡ್ಯಾಮ್",
401
+ "ಡಿಕ್",
402
+ "ದ್ವಿಧ್ರುವಿ",
403
+ "dildos",
404
+ "ಡಿಂಕ್",
405
+ "ನಾಯಿ-ಫಕರ್",
406
+ "ಡಚೆ",
407
+ "ಡೈಕ್",
408
+ "ಹೊರಹೊಮ್ಮಿಸು",
409
+ "ಸ್ಫೂರ್ತಿ",
410
+ "ಎಜಾಕ್ಯುಲೇಟ್ಸ್",
411
+ "ಇಜಲಲೇಟಿಂಗ್",
412
+ "ಉದ್ಗಾರ",
413
+ "ತಮಾಷೆ",
414
+ "ಮಂದಗತಿ",
415
+ "ಮಬ್ಬು",
416
+ "fagots",
417
+ "ಫ್ಯಾನಿ",
418
+ "ಹೊಡೆತ",
419
+ "ಪತನ",
420
+ "ಚಾಚುಪಟ್ಟಿ",
421
+ "ಫಕ್",
422
+ "ನಾಶವಾಗಿದ್ದನು",
423
+ "ಫಕರ್",
424
+ "fuckers",
425
+ "ಫಕಿಂಗ್",
426
+ "ಫಕಿಂಗ್ಸ್",
427
+ "ಇಷ್ಟಪಡುತ್ತಾನೆ",
428
+ "ಮಿಠಾಯಿ ಪ್ಯಾಕರ್",
429
+ "ದೇವರನ್ನು ಹಾನಿಗೊಳಗಾಯಿತು",
430
+ "ಗಾಡ್ಡಮ್",
431
+ "ನರಕ",
432
+ "ಹೋರ್",
433
+ "ಮೊನಚಾದ",
434
+ "ಜರ್ಕ್-ಆಫ್",
435
+ "ಕೋಕ್",
436
+ "ಯೋನಿಯ",
437
+ "ಕಾಮ",
438
+ "ಕಾಮುಕ",
439
+ "ಮಾಸೋಚಿಸ್ಟ್",
440
+ "ಹಸ್ತಮೈಥುನ ಮಾಡು",
441
+ "ತಾಯಿ ಫಕರ್",
442
+ "ನಾಜಿ",
443
+ "ನಿಗರ್",
444
+ "ನಿಗ್ಗರ್ಗಳು",
445
+ "ಒರಾಸಿಮ್",
446
+ "ಪರಾಕಾಷ್ಠೆ",
447
+ "ಪರಾಕಾಷ್ಠೆಗಳನ್ನು",
448
+ "ಪೆಕರ್",
449
+ "ಶಿಶ್ನ",
450
+ "ಮೂತ್ರ ವಿಸರ್ಜಿಸು",
451
+ "ನಿರುತ್ಸಾಹಗೊಂಡಿದೆ",
452
+ "ಪಿಸರ್",
453
+ "ಮೂತ್ರಪಿಂಡಗಳು",
454
+ "pissing",
455
+ "ಪಿಸ್ಸಾಫ್",
456
+ "ಪೂಪ್",
457
+ "ಅಶ್ಲೀಲತೆ",
458
+ "ಅಶ್ಲೀಲ",
459
+ "ಚುಚ್ಚು",
460
+ "ಪ್ರಿಕ್ಸ್",
461
+ "ಪಬ್",
462
+ "ಪುಸಿಗಳು",
463
+ "ಪುಸಿ",
464
+ "ಅತ್ಯಾಚಾರ",
465
+ "ಅತ್ಯಾಚಾರಿ",
466
+ "ಗುದನಾಳದ",
467
+ "ರಿಟಾರ್ಡ್",
468
+ "ಹಚ್ಚುವುದು",
469
+ "ದುಃಖಗಾರ",
470
+ "ತಿರುಗಿಸುವುದು",
471
+ "ಸ್ಕ್ರೋಟಮ್",
472
+ "ವೀರ್ಯ",
473
+ "ಲೈಂಗಿಕತೆ",
474
+ "ಶಾಗ್",
475
+ "ಶಾಗ್ಗಿಂಗ್",
476
+ "ಶೆಮೇಲ್",
477
+ "ಶಿಟ್",
478
+ "ಷೈಟ್",
479
+ "ಶಿಟ್ಸ್",
480
+ "shitted",
481
+ "ಅಲುಗಾಡುವಿಕೆ",
482
+ "ಅಸಹ್ಯ",
483
+ "ಸ್ಕಾಂಕ್",
484
+ "ಸೂಳೆ",
485
+ "ಸ್ಲಟ್ಗಳು",
486
+ "ಸ್ಮೆಗ್ಮಾ",
487
+ "ಕೊಳೆತ",
488
+ "ಸ್ನ್ಯಾಚ್",
489
+ "ಮಗ-ಆಫ್-ಬಿಚ್",
490
+ "spac",
491
+ "ಉಬ್ಬು",
492
+ "ವೃಷಣ",
493
+ "ಟಿಟ್",
494
+ "ಚೇಕಡಿ ಹಕ್ಕಿಗಳು",
495
+ "turd",
496
+ "ಯೋನಿ",
497
+ "ವಯಾಗ್ರ",
498
+ "ವಾಂಗ್",
499
+ "ಮುಷ್ಕರ",
500
+ "x ರೇಟೆಡ್",
501
+ "xxx",
502
+ ],
503
+ "ml": english_flagged_words
504
+ + [
505
+ "ഗർഭഛിദ്രം",
506
+ "വിശപ്പ്",
507
+ "മലദ്വാരം",
508
+ "കഴുത",
509
+ "അസി ഫക്കർ",
510
+ "കഴുതകളെ",
511
+ "ആസ്ഹോൾ",
512
+ "അശ്ളീലങ്ങൾ",
513
+ "ബോൾബാഗ്",
514
+ "പന്തുകൾ",
515
+ "തന്തയില്ലാത്തവൻ",
516
+ "ബെല്ലെൻഡ്",
517
+ "മൃഗീയമായ",
518
+ "മൃഗീയത",
519
+ "ബിച്ച്",
520
+ "ബിച്ചുകൾ",
521
+ "ബിപിഡിംഗ്",
522
+ "രക്തരൂക്ഷിതമായ",
523
+ "ആശ്വാസം",
524
+ "ബലോക്ക്",
525
+ "ബോബ്",
526
+ "പൂക്കൾ",
527
+ "സ്തനങ്ങൾ",
528
+ "ബ്യൂട്ടാ",
529
+ "ബം",
530
+ "മയക്കുമരുന്ന്",
531
+ "പരവതാനി മാൻച്ചർ",
532
+ "ചുംബ്",
533
+ "സിപാ",
534
+ "ക്ലോറിസിസ്",
535
+ "കോക്ക്",
536
+ "കോക്ക് സക്കർ",
537
+ "കോക്സ്",
538
+ "കോൺ",
539
+ "ക്രാപ്പ്",
540
+ "ശുക്ലം",
541
+ "പുരുഷാരം",
542
+ "സി",
543
+ "മുഷിഞ്ഞ",
544
+ "കഷ്ടം",
545
+ "ഡിക്ക്",
546
+ "ഡിൽഡോ",
547
+ "dildos",
548
+ "ഡൈൻ",
549
+ "നായ-ഫക്കർ",
550
+ "ഡച്ച്",
551
+ "ഡൈകെ",
552
+ "ശമിപ്പിക്കുക",
553
+ "മോഷ്ടിച്ചു",
554
+ "വികാരങ്ങൾ",
555
+ "വിരസത",
556
+ "മടി",
557
+ "ക്ഷീണിപ്പിക്കുക",
558
+ "fagot",
559
+ "വഞ്ചന",
560
+ "ഫാനി",
561
+ "വേദന",
562
+ "flange",
563
+ "ഊമ്പി",
564
+ "സംഭോഗം ചെയ്യുക",
565
+ "ഫക്കർ",
566
+ "നർമ്മം",
567
+ "ഫഡ്ജ് പാക്കർ",
568
+ "ദൈവം-കൊള്ളിത",
569
+ "ഗോഡ്ഡം",
570
+ "നരകം",
571
+ "വയ്ക്കുക",
572
+ "വൃത്തികെട്ട",
573
+ "ജെർക് ഓഫ്",
574
+ "കിക്ക്",
575
+ "ലാബിയ",
576
+ "മോഹം",
577
+ "മോഹഭംഗം",
578
+ "മാസോച്ചിസ്റ്റ്",
579
+ "സ്വയംഭോഗം ചെയ്യുക",
580
+ "അമ്മ ഫക്കർ",
581
+ "നാസി",
582
+ "നിഗർ",
583
+ "മ��ക്കുമരുന്നുകൾ",
584
+ "രതിമൂർച്ഛ",
585
+ "പെക്കർ",
586
+ "ലിംഗം",
587
+ "മൂത്രമൊഴിക്കുക",
588
+ "കുഴഞ്ഞുവീഴുന്നു",
589
+ "പിസ്സർ",
590
+ "പിസ്സകൾ",
591
+ "pissing",
592
+ "പിസ്സോഫ്",
593
+ "poop",
594
+ "അശ്ലീലം",
595
+ "അശ്ലീലത",
596
+ "പ്രാവി",
597
+ "വിസർജ്യങ്ങൾ",
598
+ "പ്യൂബ്",
599
+ "pussies",
600
+ "pussy",
601
+ "ബലാൽസംഗം",
602
+ "ബലാത്സംഗം",
603
+ "മലാശയം",
604
+ "തുടരുക",
605
+ "റിമ്മിംഗ്",
606
+ "സചിസ്റ്റ്",
607
+ "വഞ്ചി",
608
+ "പുല്ല്",
609
+ "ബീജം",
610
+ "ശവം",
611
+ "ഷാഗിംഗ്",
612
+ "അവൾ",
613
+ "ഷീറ്റ്",
614
+ "ഷെയ്റ്റ്",
615
+ "shits",
616
+ "തിന്നിട്ടില്ല",
617
+ "ഷോർട്ട്",
618
+ "ഷൈറ്റി",
619
+ "സ്കാൻ",
620
+ "മന്ദഹസരം",
621
+ "സ്നെഗമാ",
622
+ "പുഞ്ചിരി",
623
+ "പിടിക്കുക",
624
+ "വെറുക്കപ്പെട്ടയാൾ",
625
+ "സ്പെയ്ക്",
626
+ "തുളച്ച്",
627
+ "വൃഷണം",
628
+ "പേ",
629
+ "ടിത്ത്",
630
+ "കുഴപ്പമില്ല",
631
+ "യോനി",
632
+ "വരാഗ്ര",
633
+ "വാൽവ",
634
+ "വാങ്",
635
+ "വാൻ",
636
+ "വേശ്യ",
637
+ "x റേറ്റുചെയ്തു",
638
+ "xxx",
639
+ ],
640
+ "mr": english_flagged_words
641
+ + [
642
+ "गर्भपात",
643
+ "गुदा",
644
+ "गाढव",
645
+ "गांडुळ",
646
+ "asses",
647
+ "asshole",
648
+ "assholes",
649
+ "ballbag",
650
+ "चेंडू",
651
+ "बॅस्टर्ड",
652
+ "बेलेंड",
653
+ "बेस्टियल",
654
+ "प्राण्यांबरोबर",
655
+ "कुत्री",
656
+ "बिट्स",
657
+ "खूनी",
658
+ "blowjob",
659
+ "बोलोक",
660
+ "बोब",
661
+ "स्तन",
662
+ "बसीटा",
663
+ "बम",
664
+ "बट",
665
+ "कार्पेट मुन्चर",
666
+ "चिंक",
667
+ "सिपा",
668
+ "क्लिटोरिस",
669
+ "मुर्ख",
670
+ "मांसाहारी",
671
+ "कॉक्स",
672
+ "कॉनन",
673
+ "बकवास",
674
+ "सह",
675
+ "cumshot",
676
+ "कनिलिंगस",
677
+ "कांट",
678
+ "धिक्कार",
679
+ "डिक",
680
+ "dildo",
681
+ "डिल्डो",
682
+ "डंक",
683
+ "duche",
684
+ "डाईक",
685
+ "उद्गार",
686
+ "उत्साही",
687
+ "ejaculates",
688
+ "उत्सुकता",
689
+ "स्खलन",
690
+ "फॅग",
691
+ "फॅगिंग",
692
+ "फॅगॉट",
693
+ "फॅगॉट्स",
694
+ "फॅनी",
695
+ "फेलिंग",
696
+ "फॅलेटीओ",
697
+ "निकला",
698
+ "fucked",
699
+ "गुप्तचर",
700
+ "fuckers",
701
+ "fucking",
702
+ "fuckings",
703
+ "fucks",
704
+ "फडगे पॅकर",
705
+ "देव-शापित",
706
+ "देव",
707
+ "नरक",
708
+ "होरे",
709
+ "शिंग",
710
+ "झटका बंद",
711
+ "कॉक",
712
+ "लॅबिया",
713
+ "वासना",
714
+ "मासोचिस्ट",
715
+ "हस्तमैथुन करा",
716
+ "आई माकड",
717
+ "नाझी",
718
+ "निगर",
719
+ "निगार",
720
+ "ऑर्गॅसिम",
721
+ "संभोग",
722
+ "orgasms",
723
+ "चापटी",
724
+ "पुरुषाचे जननेंद्रिय",
725
+ "पेशी",
726
+ "pissed",
727
+ "पिसर",
728
+ "pisses",
729
+ "पिसिंग",
730
+ "पिसोफ",
731
+ "घाट",
732
+ "अश्लील",
733
+ "पोर्नोग्राफी",
734
+ "मुरुम",
735
+ "प्रिक्स",
736
+ "प्यूब",
737
+ "pussies",
738
+ "मांजर",
739
+ "बलात्कार",
740
+ "गुदाशय",
741
+ "मंद",
742
+ "rimming",
743
+ "दुःखी",
744
+ "screwing",
745
+ "स्क्रोटम",
746
+ "वीर्य",
747
+ "लिंग",
748
+ "शेग",
749
+ "shagging",
750
+ "शेमले",
751
+ "विचित्र",
752
+ "shite",
753
+ "shits",
754
+ "shitted",
755
+ "shitting",
756
+ "shitty",
757
+ "घाणेरडा",
758
+ "फट",
759
+ "sluts",
760
+ "सुगंध",
761
+ "स्मट",
762
+ "छेडछाड",
763
+ "मुलगा-एक-कुत्री",
764
+ "spac",
765
+ "तिरस्कार",
766
+ "परीक्षक",
767
+ "शीर्षक",
768
+ "टिट",
769
+ "टर्ड",
770
+ "योनी",
771
+ "वियाग्रा",
772
+ "वल्वा",
773
+ "वांग",
774
+ "विंक",
775
+ "वेश्या",
776
+ "एक्स रेट केले",
777
+ "xxx",
778
+ ],
779
+ "pt": english_flagged_words
780
+ + [
781
+ "balalao",
782
+ "bate uma",
783
+ "beijo grego",
784
+ "boceta",
785
+ "boquete",
786
+ "buceta",
787
+ "caralho",
788
+ "chochota",
789
+ "coito",
790
+ "cona",
791
+ "consolo",
792
+ "corno",
793
+ "cu",
794
+ "dar a bunda",
795
+ "dar o rabo",
796
+ "dildo",
797
+ "dildos",
798
+ "esporrar",
799
+ "estrovenga",
800
+ "felação",
801
+ "filho da puta",
802
+ "filhos da puta",
803
+ "gozada",
804
+ "jeba",
805
+ "perereca",
806
+ "pica",
807
+ "piru",
808
+ "porno",
809
+ "pornografia",
810
+ "pornô",
811
+ "porra",
812
+ "prostituta",
813
+ "pube",
814
+ "punheta",
815
+ "punheteiro",
816
+ "putaria",
817
+ "queca",
818
+ "sexo",
819
+ "siririca",
820
+ "tesão",
821
+ "trepada",
822
+ "verga",
823
+ "vibrador",
824
+ "xana",
825
+ "xochota",
826
+ "xoxota",
827
+ ],
828
+ "ta": english_flagged_words
829
+ + [
830
+ "ஓதா",
831
+ "ஒத்தா",
832
+ "புண்டை",
833
+ "ஒம்மாளே",
834
+ "பக்கி",
835
+ "கூமுட்டை",
836
+ "கருமம்",
837
+ "சனியன்",
838
+ "கஸ்மாலம்",
839
+ "சூத்து",
840
+ ],
841
+ "te": english_flagged_words
842
+ + [
843
+ "గర్భస్రావం",
844
+ "అంగ",
845
+ "పాయువు",
846
+ "గాడిద",
847
+ "గాడిద-fucker",
848
+ "asses",
849
+ "assholes",
850
+ "బాల్బ్యాగ్",
851
+ "బంతుల్లో",
852
+ "బాస్టర్డ్",
853
+ "బెల్లెండ్",
854
+ "మృగ",
855
+ "బెస్టియాలిటీ",
856
+ "బిచ్",
857
+ "bitches",
858
+ "బిట్చింగ్",
859
+ "బ్లడీ",
860
+ "blowjob",
861
+ "బోల్లక",
862
+ "బూబ్",
863
+ "వక్షోజాలను",
864
+ "ఛాతీ",
865
+ "buceta",
866
+ "బం",
867
+ "బట్",
868
+ "కార్పెట్ ముంచర్",
869
+ "చింక్",
870
+ "cipa",
871
+ "స్త్రీగుహ్యాంకురము",
872
+ "ఆత్మవిశ్వాసం",
873
+ "కాక్-సక్కర్",
874
+ "కాక్స్",
875
+ "కూన్",
876
+ "చెత్త",
877
+ "కం",
878
+ "cumshot",
879
+ "క్యునిల్లింగస్",
880
+ "కంట్",
881
+ "తిట్టు",
882
+ "డిక్",
883
+ "లైంగిక సంతృప్తి కోసం స్త్రీలు ఉపయోగించే పురుషాంగము వంటి పరికరము",
884
+ "డిల్డోస్",
885
+ "dink",
886
+ "కుక్క-fucker",
887
+ "డూష్",
888
+ "డైక్",
889
+ "స్ఖలించు",
890
+ "ఎజాక్యులేటెడ్",
891
+ "ఎజాక్యులేట్స్",
892
+ "ఎరాక్యులేటింగ్",
893
+ "స్ఖలనం",
894
+ "నవుకరు",
895
+ "ఫాగ్గింగ్",
896
+ "ఫాగాట్",
897
+ "ఫగాట్స్",
898
+ "fanny",
899
+ "ఫెల్చింగ్",
900
+ "కుడుచుట",
901
+ "అచ్చు",
902
+ "ఫక్",
903
+ "ఇబ్బంది పెట్టాడు",
904
+ "fucker",
905
+ "ఫకర్స్",
906
+ "ఫకింగ్",
907
+ "ఫకింగ్స్",
908
+ "ఫక్స్",
909
+ "ఫడ్జ్ ప్యాకర్",
910
+ "దేవతలా మంచిది",
911
+ "గాడ్డామ్",
912
+ "నరకం",
913
+ "హోర్",
914
+ "horny",
915
+ "జెర్క్-ఆఫ్",
916
+ "కాక్",
917
+ "పెదవి",
918
+ "కామం",
919
+ "మనసు పడ్డట్లు చిత్రించారు",
920
+ "masochist",
921
+ "హస్తప్రయోగం",
922
+ "తల్లి ఫెకర్",
923
+ "నాజీ",
924
+ "నిగ్గర్",
925
+ "నిగ్గర్స్",
926
+ "ఆర్గాసిమ్",
927
+ "స్కలనం",
928
+ "orgasms",
929
+ "pecker",
930
+ "పురుషాంగం",
931
+ "విసర్జన",
932
+ "pissed",
933
+ "పిస్సర్",
934
+ "పిస్సీస్",
935
+ "పిస్సింగ్",
936
+ "పిస్సాఫ్",
937
+ "poop",
938
+ "శృంగార",
939
+ "పోర్నో",
940
+ "అశ్లీల",
941
+ "బుడతడు",
942
+ "ప్రిక్స్",
943
+ "ప్యూబ్",
944
+ "pussies",
945
+ "పుస్సీ",
946
+ "రేప్",
947
+ "ఉన్నప్పటికీ బలాత్కారం",
948
+ "పురీషనాళం",
949
+ "రిటార్డ్",
950
+ "రిమ్మింగ్",
951
+ "పీడన కాముకత",
952
+ "screwing",
953
+ "స్క్రోటమ్",
954
+ "వీర్యం",
955
+ "సెక్స్",
956
+ "బొచ్చు",
957
+ "షగ్గింగ్",
958
+ "షీమేల్",
959
+ "ఒంటి",
960
+ "షైట్",
961
+ "షిట్స్",
962
+ "షిట్టెడ్",
963
+ "షిట్టింగ్",
964
+ "shitty",
965
+ "స్కాన్క్",
966
+ "నీతి",
967
+ "స్లట్స్",
968
+ "శిశ్న",
969
+ "స్మట్",
970
+ "స్నాచ్",
971
+ "ఒక బిచ్ కుమారుడు ఆఫ్",
972
+ "spac",
973
+ "స్పంక్",
974
+ "వృషణాలు",
975
+ "తునక",
976
+ "టిట్స్",
977
+ "టిట్",
978
+ "turd",
979
+ "యోని",
980
+ "వయాగ్రా",
981
+ "జననాంగం",
982
+ "వాంగ్",
983
+ "వ్యాంక్",
984
+ "వేశ్య",
985
+ "x రేట్",
986
+ "xxx",
987
+ ],
988
+ "vi": english_flagged_words
989
+ + [
990
+ "cặc",
991
+ "lồn",
992
+ "địt",
993
+ "buồi",
994
+ "đụ",
995
+ "nứng",
996
+ "phò",
997
+ "bú lol",
998
+ "bú lon",
999
+ "bú cac",
1000
+ "bú cak",
1001
+ "bú cax",
1002
+ "bú buoi",
1003
+ "bú mút",
1004
+ "húp lol",
1005
+ "húp lon",
1006
+ "húp sò",
1007
+ "nung lol",
1008
+ "nung lon",
1009
+ "nug lon",
1010
+ "nung cac",
1011
+ "nung buoi",
1012
+ "thèm dit",
1013
+ "thèm lon",
1014
+ "thèm lol",
1015
+ "thèm buoi",
1016
+ "thèm dit",
1017
+ "thèm chich",
1018
+ "thèm chịch",
1019
+ "thèm cax",
1020
+ "thèm cak",
1021
+ "móc lol",
1022
+ "móc lon",
1023
+ "buscu",
1024
+ "lol non",
1025
+ "bướm non",
1026
+ "vú to",
1027
+ "vú bự",
1028
+ "lon to",
1029
+ "phim sex",
1030
+ "phim xex",
1031
+ "phim xxx",
1032
+ ],
1033
+ "zh": english_flagged_words
1034
+ + [
1035
+ "三级片",
1036
+ "乳交",
1037
+ "卖B",
1038
+ "卖比",
1039
+ "卖淫",
1040
+ "口交",
1041
+ "姣西",
1042
+ "打炮",
1043
+ "打飞机",
1044
+ "抽插",
1045
+ "招妓",
1046
+ "插你",
1047
+ "插死你",
1048
+ "淫水",
1049
+ "肉壁",
1050
+ "肉棍子",
1051
+ "肉缝",
1052
+ "肏",
1053
+ "肛交",
1054
+ ],
1055
+ }
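
A minimal sketch of how these word lists are consumed, again assuming the filtering module above is importable as "filtering": the flagged-word ratio of a document is computed over its lower-cased, stripped words and compared to flagged_words_max_cutoff by check_flagged_words.

    from filtering import Filtering, LoadParameters  # assumed module name

    param = LoadParameters.load_parameters("en")
    flagged = LoadParameters.load_flagged_words("en")

    ratio = Filtering.compute_flagged_words_ratio(
        document="An ordinary, perfectly clean sentence.",
        sentencepiece_model_tok=None,  # fall back to whitespace tokenization
        strip_characters=param["strip_characters"],
        cond_words_augmentation=False,
        words_augmentation_group_sizes=[],
        words_augmentation_join_char="",
        flagged_words=flagged,
    )
    print(ratio)  # 0.0 here; documents above the cutoff are dropped
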
fr.arpa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301c82d52a8e34f63937afc12970794c8783244c8c0b085a8bbfb0d54dcb9374
3
+ size 2829042764
fr.sp.model ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1b70d5e6556ad245e02ac76919a714ad0b7d288955df65ecd3831a42950b653
3
+ size 942639
fr_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dd605b140e7a4c20a00e06c8c70d90333d2559434acd9c182de054d6b53b13b
3
+ size 140859096
id.arpa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e099b6216a558d6c6f6108895e2e13fbc6ffd00b59791d16d6a5f85103ac0be
3
+ size 1847280248
id.sp.model ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b217615a7b185e5e0c967ea5b7156fe149145221e32a54b96dfed15d98b3c807
3
+ size 926624
id_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1c05dfc6f847bccf2e79cdb90c0dbb05a7266ae77673cd9f6c3cb811dace8e8
3
+ size 89435039
languages_id.py ADDED
@@ -0,0 +1,222 @@
1
+ import pandas as pd
2
+
3
+
4
+ langs_id = [
5
+ {
6
+ "lang": "Afrikaans",
7
+ "dataset_id": "af",
8
+ "stopwords_id": "af",
9
+ "flagged_words_id": None,
10
+ "fasttext_id": "af",
11
+ "sentencepiece_id": "af",
12
+ "kenlm_id": "af",
13
+ },
14
+ {
15
+ "lang": "Arabic",
16
+ "dataset_id": "ar",
17
+ "stopwords_id": "ar",
18
+ "flagged_words_id": "ar",
19
+ "fasttext_id": "ar",
20
+ "sentencepiece_id": "ar",
21
+ "kenlm_id": "ar",
22
+ },
23
+ {
24
+ "lang": "Egyptian Arabic",
25
+ "dataset_id": "arz",
26
+ "stopwords_id": None,
27
+ "flagged_words_id": None,
28
+ "fasttext_id": "arz",
29
+ "sentencepiece_id": "arz",
30
+ "kenlm_id": "arz",
31
+ },
32
+ {
33
+ "lang": "Assamese",
34
+ "dataset_id": "as",
35
+ "stopwords_id": None,
36
+ "flagged_words_id": None,
37
+ "fasttext_id": "as",
38
+ "sentencepiece_id": "as",
39
+ "kenlm_id": "as",
40
+ },
41
+ {
42
+ "lang": "Bengali",
43
+ "dataset_id": "bn",
44
+ "stopwords_id": "bn",
45
+ "flagged_words_id": None,
46
+ "fasttext_id": "bn",
47
+ "sentencepiece_id": "bn",
48
+ "kenlm_id": "bn",
49
+ },
50
+ {
51
+ "lang": "Catalan",
52
+ "dataset_id": "ca",
53
+ "stopwords_id": "ca",
54
+ "flagged_words_id": "ca",
55
+ "fasttext_id": "ca",
56
+ "sentencepiece_id": "ca",
57
+ "kenlm_id": "ca",
58
+ },
59
+ {
60
+ "lang": "English",
61
+ "dataset_id": "en",
62
+ "stopwords_id": "en",
63
+ "flagged_words_id": "en",
64
+ "fasttext_id": "en",
65
+ "sentencepiece_id": "en",
66
+ "kenlm_id": "en",
67
+ },
68
+ {
69
+ "lang": "Spanish",
70
+ "dataset_id": "es",
71
+ "stopwords_id": "es",
72
+ "flagged_words_id": "es",
73
+ "fasttext_id": "es",
74
+ "sentencepiece_id": "es",
75
+ "kenlm_id": "es",
76
+ },
77
+ {
78
+ "lang": "Basque",
79
+ "dataset_id": "eu",
80
+ "stopwords_id": "eu",
81
+ "flagged_words_id": "eu",
82
+ "fasttext_id": "eu",
83
+ "sentencepiece_id": "eu",
84
+ "kenlm_id": "eu",
85
+ },
86
+ {
87
+ "lang": "French",
88
+ "dataset_id": "fr",
89
+ "stopwords_id": "fr",
90
+ "flagged_words_id": "fr",
91
+ "fasttext_id": "fr",
92
+ "sentencepiece_id": "fr",
93
+ "kenlm_id": "fr",
94
+ },
95
+ {
96
+ "lang": "Gujarati",
97
+ "dataset_id": "gu",
98
+ "stopwords_id": None,
99
+ "flagged_words_id": None,
100
+ "fasttext_id": "gu",
101
+ "sentencepiece_id": "gu",
102
+ "kenlm_id": "gu",
103
+ },
104
+ {
105
+ "lang": "Hindi",
106
+ "dataset_id": "hi",
107
+ "stopwords_id": "hi",
108
+ "flagged_words_id": "hi",
109
+ "fasttext_id": "hi",
110
+ "sentencepiece_id": "hi",
111
+ "kenlm_id": "hi",
112
+ },
113
+ {
114
+ "lang": "Indonesian",
115
+ "dataset_id": "id",
116
+ "stopwords_id": "id",
117
+ "flagged_words_id": "id",
118
+ "fasttext_id": "id",
119
+ "sentencepiece_id": "id",
120
+ "kenlm_id": "id",
121
+ },
122
+ {
123
+ "lang": "Kannada",
124
+ "dataset_id": "kn",
125
+ "stopwords_id": None,
126
+ "flagged_words_id": "kn",
127
+ "fasttext_id": "kn",
128
+ "sentencepiece_id": "kn",
129
+ "kenlm_id": "kn",
130
+ },
131
+ {
132
+ "lang": "Malayalam",
133
+ "dataset_id": "ml",
134
+ "stopwords_id": None,
135
+ "flagged_words_id": "ml",
136
+ "fasttext_id": "ml",
137
+ "sentencepiece_id": "ml",
138
+ "kenlm_id": "ml",
139
+ },
140
+ {
141
+ "lang": "Marathi",
142
+ "dataset_id": "mr",
143
+ "stopwords_id": "mr",
144
+ "flagged_words_id": "mr",
145
+ "fasttext_id": "mr",
146
+ "sentencepiece_id": "mr",
147
+ "kenlm_id": "mr",
148
+ },
149
+ {
150
+ "lang": "Portuguese",
151
+ "dataset_id": "pt",
152
+ "stopwords_id": "pt",
153
+ "flagged_words_id": "pt",
154
+ "fasttext_id": "pt",
155
+ "sentencepiece_id": "pt",
156
+ "kenlm_id": "pt",
157
+ },
158
+ {
159
+ "lang": "Swahili",
160
+ "dataset_id": "sw",
161
+ "stopwords_id": "sw",
162
+ "flagged_words_id": None,
163
+ "fasttext_id": "sw",
164
+ "sentencepiece_id": "sw",
165
+ "kenlm_id": "sw",
166
+ },
167
+ {
168
+ "lang": "Tamil",
169
+ "dataset_id": "ta",
170
+ "stopwords_id": None,
171
+ "flagged_words_id": "ta",
172
+ "fasttext_id": "ta",
173
+ "sentencepiece_id": "ta",
174
+ "kenlm_id": "ta",
175
+ },
176
+ {
177
+ "lang": "Telugu",
178
+ "dataset_id": "te",
179
+ "stopwords_id": None,
180
+ "flagged_words_id": "te",
181
+ "fasttext_id": "te",
182
+ "sentencepiece_id": "te",
183
+ "kenlm_id": "te",
184
+ },
185
+ {
186
+ "lang": "Urdu",
187
+ "dataset_id": "ur",
188
+ "stopwords_id": "ur",
189
+ "flagged_words_id": None,
190
+ "fasttext_id": "ur",
191
+ "sentencepiece_id": "ur",
192
+ "kenlm_id": "ur",
193
+ },
194
+ {
195
+ "lang": "Vietnamese",
196
+ "dataset_id": "vi",
197
+ "stopwords_id": "vi",
198
+ "flagged_words_id": "vi",
199
+ "fasttext_id": "vi",
200
+ "sentencepiece_id": "vi",
201
+ "kenlm_id": "vi",
202
+ },
203
+ {
204
+ "lang": "Yoruba",
205
+ "dataset_id": "yo",
206
+ "stopwords_id": "yo",
207
+ "flagged_words_id": None,
208
+ "fasttext_id": "yo",
209
+ "sentencepiece_id": "yo",
210
+ "kenlm_id": "yo",
211
+ },
212
+ {
213
+ "lang": "Chinese",
214
+ "dataset_id": "zh",
215
+ "stopwords_id": "zh",
216
+ "flagged_words_id": "zh",
217
+ "fasttext_id": "zh",
218
+ "sentencepiece_id": "zh",
219
+ "kenlm_id": "zh",
220
+ },
221
+ ]
222
+ langs_id = pd.DataFrame(langs_id)
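
A minimal sketch of how this table is queried, mirroring LoadParameters in the filtering module above: each component looks up its language-specific identifier by dataset_id, and a None entry disables that resource for the language.

    from languages_id import langs_id

    row = langs_id.loc[langs_id["dataset_id"] == "fr"].iloc[0]
    print(row["fasttext_id"], row["sentencepiece_id"], row["kenlm_id"])  # fr fr fr
    print(row["flagged_words_id"])  # fr

    # For Afrikaans, flagged_words_id is None, so the flagged-words check is skipped.
    print(langs_id.loc[langs_id["dataset_id"] == "af", "flagged_words_id"].iloc[0])  # None
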
lid.176.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
3
+ size 131266198
normalization.py ADDED
@@ -0,0 +1,52 @@
1
+ import re
2
+ from typing import Dict
3
+
4
+
5
+ non_printing_characters_re = re.compile(
6
+ f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
7
+ )
8
+
9
+ digits_re: re.Pattern = re.compile(r"\d")
10
+
11
+ unicode_punctuation: Dict[str, str] = {
12
+ ",": ",",
13
+ "。": ".",
14
+ "、": ",",
15
+ "„": '"',
16
+ "”": '"',
17
+ "“": '"',
18
+ "«": '"',
19
+ "»": '"',
20
+ "1": '"',
21
+ "」": '"',
22
+ "「": '"',
23
+ "《": '"',
24
+ "》": '"',
25
+ "´": "'",
26
+ "∶": ":",
27
+ ":": ":",
28
+ "?": "?",
29
+ "!": "!",
30
+ "(": "(",
31
+ ")": ")",
32
+ ";": ";",
33
+ "–": "-",
34
+ "—": " - ",
35
+ ".": ". ",
36
+ "~": "~",
37
+ "’": "'",
38
+ "…": "...",
39
+ "━": "-",
40
+ "〈": "<",
41
+ "〉": ">",
42
+ "【": "[",
43
+ "】": "]",
44
+ "%": "%",
45
+ "►": "-",
46
+ }
47
+
48
+ normalization = {
49
+ "non_printing_characters_re": non_printing_characters_re,
50
+ "digits_re": digits_re,
51
+ "unicode_punctuation": unicode_punctuation,
52
+ }
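
A minimal sketch of how the exported normalization dict is applied, mirroring ModifyingDocuments.normalization in the filtering module above; the sample string is only illustrative.

    from normalization import normalization

    text = "Voici\x07 «un» exemple, n° 42…"
    text = normalization["non_printing_characters_re"].sub("", text)   # drop control characters
    text = normalization["digits_re"].sub("0", text)                   # replace digits with zeros
    text = "".join(
        normalization["unicode_punctuation"].get(c, c) for c in text   # ASCII-fold punctuation
    )
    print(text)  # Voici "un" exemple, n° 00...
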
parameters_filtering.py ADDED
@@ -0,0 +1,895 @@
1
+ import string
2
+ import emoji
3
+
4
+
5
+ main_special_characters = string.punctuation + string.digits + string.whitespace
6
+ other_special_characters = (
7
+ "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
8
+ "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
9
+ "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
10
+ "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
11
+ "」﴾》"
12
+ )
13
+ emoji = list(emoji.UNICODE_EMOJI["en"].keys())
14
+
15
+ special_characters_default = set(main_special_characters + other_special_characters)
16
+ special_characters_default.update(emoji)
17
+
18
+
19
+ parameters_filtering_default = {
20
+ "cond_uniform_whitespace": True,
21
+ "cond_replace_unicode_punctuation": False,
22
+ "cond_remove_words_with_incorrect_substrings": False,
23
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
24
+ "cond_remove_long_words": False,
25
+ "length_word_max_cutoff": 50,
26
+ "cond_check_number_words": True,
27
+ "tokenization": False,
28
+ "strip_characters": special_characters_default,
29
+ "number_words_min_cutoff": 1,
30
+ "number_words_max_cutoff": 100000,
31
+ "cond_check_character_repetition_removal": True,
32
+ "character_repetition_length": 10,
33
+ "character_repetition_max_cutoff": 0.106,
34
+ "cond_check_word_repetition_removal": True,
35
+ "word_repetition_length": 5,
36
+ "word_repetition_max_cutoff": 0.19,
37
+ "cond_check_special_characters": True,
38
+ "special_characters": special_characters_default,
39
+ "special_characters_max_cutoff": 0.4,
40
+ "cond_words_augmentation": False,
41
+ "words_augmentation_group_sizes": [],
42
+ "words_augmentation_join_char": "",
43
+ "cond_check_stopwords": False,
44
+ "stopwords_min_cutoff": 0,
45
+ "cond_check_flagged_words": False,
46
+ "flagged_words_max_cutoff": 0.2,
47
+ "cond_check_lang_id": True,
48
+ "lang_id_min_cutoff": 0.70,
49
+ "cond_check_perplexity": False,
50
+ "perplexity_max_cutoff": 3000000,
51
+ }
52
+
53
+ parameters_filtering_af = {
54
+ "cond_uniform_whitespace": True,
55
+ "cond_replace_unicode_punctuation": False,
56
+ "cond_remove_words_with_incorrect_substrings": False,
57
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
58
+ "cond_remove_long_words": True,
59
+ "length_word_max_cutoff": 25,
60
+ "cond_check_number_words": True,
61
+ "tokenization": False,
62
+ "strip_characters": special_characters_default,
63
+ "number_words_min_cutoff": 1,
64
+ "number_words_max_cutoff": 100000,
65
+ "cond_check_character_repetition_removal": True,
66
+ "character_repetition_length": 10,
67
+ "character_repetition_max_cutoff": 0.106,
68
+ "cond_check_word_repetition_removal": True,
69
+ "word_repetition_length": 5,
70
+ "word_repetition_max_cutoff": 0.19,
71
+ "cond_check_special_characters": True,
72
+ "special_characters": special_characters_default,
73
+ "special_characters_max_cutoff": 0.3,
74
+ "cond_words_augmentation": False,
75
+ "words_augmentation_group_sizes": [],
76
+ "words_augmentation_join_char": "",
77
+ "cond_check_stopwords": True,
78
+ "stopwords_min_cutoff": 0,
79
+ "cond_check_flagged_words": False,
80
+ "flagged_words_max_cutoff": 0.2,
81
+ "cond_check_lang_id": True,
82
+ "lang_id_min_cutoff": 0.6,
83
+ "cond_check_perplexity": True,
84
+ "perplexity_max_cutoff": 3000000,
85
+ }
86
+
87
+ parameters_filtering_ar = {
88
+ "cond_uniform_whitespace": True,
89
+ "cond_replace_unicode_punctuation": False,
90
+ "cond_remove_words_with_incorrect_substrings": False,
91
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
92
+ "cond_remove_long_words": True,
93
+ "length_word_max_cutoff": 25,
94
+ "cond_check_number_words": True,
95
+ "tokenization": False,
96
+ "strip_characters": special_characters_default,
97
+ "number_words_min_cutoff": 1,
98
+ "number_words_max_cutoff": 100000,
99
+ "cond_check_character_repetition_removal": True,
100
+ "character_repetition_length": 10,
101
+ "character_repetition_max_cutoff": 0.106,
102
+ "cond_check_word_repetition_removal": True,
103
+ "word_repetition_length": 5,
104
+ "word_repetition_max_cutoff": 0.19,
105
+ "cond_check_special_characters": True,
106
+ "special_characters": special_characters_default,
107
+ "special_characters_max_cutoff": 0.45,
108
+ "cond_words_augmentation": False,
109
+ "words_augmentation_group_sizes": [],
110
+ "words_augmentation_join_char": "",
111
+ "cond_check_stopwords": True,
112
+ "stopwords_min_cutoff": 0,
113
+ "cond_check_flagged_words": False,
114
+ "flagged_words_max_cutoff": 0.2,
115
+ "cond_check_lang_id": True,
116
+ "lang_id_min_cutoff": 0.75,
117
+ "cond_check_perplexity": True,
118
+ "perplexity_max_cutoff": 1000000,
119
+ }
120
+
121
+ parameters_filtering_arz = {
122
+ "cond_uniform_whitespace": True,
123
+ "cond_replace_unicode_punctuation": False,
124
+ "cond_remove_words_with_incorrect_substrings": False,
125
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
126
+ "cond_remove_long_words": True,
127
+ "length_word_max_cutoff": 25,
128
+ "cond_check_number_words": True,
129
+ "tokenization": False,
130
+ "strip_characters": special_characters_default,
131
+ "number_words_min_cutoff": 1,
132
+ "number_words_max_cutoff": 100000,
133
+ "cond_check_character_repetition_removal": True,
134
+ "character_repetition_length": 10,
135
+ "character_repetition_max_cutoff": 0.106,
136
+ "cond_check_word_repetition_removal": True,
137
+ "word_repetition_length": 5,
138
+ "word_repetition_max_cutoff": 0.19,
139
+ "cond_check_special_characters": True,
140
+ "special_characters": special_characters_default,
141
+ "special_characters_max_cutoff": 0.5,
142
+ "cond_words_augmentation": False,
143
+ "words_augmentation_group_sizes": [],
144
+ "words_augmentation_join_char": "",
145
+ "cond_check_stopwords": True,
146
+ "stopwords_min_cutoff": 0,
147
+ "cond_check_flagged_words": False,
148
+ "flagged_words_max_cutoff": 0.2,
149
+ "cond_check_lang_id": True,
150
+ "lang_id_min_cutoff": 0.75,
151
+ "cond_check_perplexity": False,
152
+ "perplexity_max_cutoff": 3000000,
153
+ }
154
+
155
+ parameters_filtering_as = {
156
+ "cond_uniform_whitespace": True,
157
+ "cond_replace_unicode_punctuation": False,
158
+ "cond_remove_words_with_incorrect_substrings": False,
159
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
160
+ "cond_remove_long_words": True,
161
+ "length_word_max_cutoff": 25,
162
+ "cond_check_number_words": True,
163
+ "tokenization": False,
164
+ "strip_characters": special_characters_default,
165
+ "number_words_min_cutoff": 1,
166
+ "number_words_max_cutoff": 100000,
167
+ "cond_check_character_repetition_removal": True,
168
+ "character_repetition_length": 10,
169
+ "character_repetition_max_cutoff": 0.106,
170
+ "cond_check_word_repetition_removal": True,
171
+ "word_repetition_length": 5,
172
+ "word_repetition_max_cutoff": 0.19,
173
+ "cond_check_special_characters": True,
174
+ "special_characters": special_characters_default,
175
+ "special_characters_max_cutoff": 0.25,
176
+ "cond_words_augmentation": False,
177
+ "words_augmentation_group_sizes": [],
178
+ "words_augmentation_join_char": "",
179
+ "cond_check_stopwords": True,
180
+ "stopwords_min_cutoff": 0,
181
+ "cond_check_flagged_words": False,
182
+ "flagged_words_max_cutoff": 0.2,
183
+ "cond_check_lang_id": True,
184
+ "lang_id_min_cutoff": 0.75,
185
+ "cond_check_perplexity": False,
186
+ "perplexity_max_cutoff": 3000000,
187
+ }
188
+
189
+ parameters_filtering_bn = {
190
+ "cond_uniform_whitespace": True,
191
+ "cond_replace_unicode_punctuation": False,
192
+ "cond_remove_words_with_incorrect_substrings": False,
193
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
194
+ "cond_remove_long_words": True,
195
+ "length_word_max_cutoff": 30,
196
+ "cond_check_number_words": True,
197
+ "tokenization": False,
198
+ "strip_characters": special_characters_default,
199
+ "number_words_min_cutoff": 1,
200
+ "number_words_max_cutoff": 100000,
201
+ "cond_check_character_repetition_removal": True,
202
+ "character_repetition_length": 10,
203
+ "character_repetition_max_cutoff": 0.106,
204
+ "cond_check_word_repetition_removal": True,
205
+ "word_repetition_length": 5,
206
+ "word_repetition_max_cutoff": 0.19,
207
+ "cond_check_special_characters": True,
208
+ "special_characters": special_characters_default,
209
+ "special_characters_max_cutoff": 0.275,
210
+ "cond_words_augmentation": False,
211
+ "words_augmentation_group_sizes": [],
212
+ "words_augmentation_join_char": "",
213
+ "cond_check_stopwords": True,
214
+ "stopwords_min_cutoff": 0.05,
215
+ "cond_check_flagged_words": False,
216
+ "flagged_words_max_cutoff": 0.2,
217
+ "cond_check_lang_id": True,
218
+ "lang_id_min_cutoff": 0.75,
219
+ "cond_check_perplexity": False,
220
+ "perplexity_max_cutoff": 575000,
221
+ }
222
+
223
+ parameters_filtering_ca = {
224
+ "cond_uniform_whitespace": True,
225
+ "cond_replace_unicode_punctuation": False,
226
+ "cond_remove_words_with_incorrect_substrings": False,
227
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
228
+ "cond_remove_long_words": True,
229
+ "length_word_max_cutoff": 30,
230
+ "cond_check_number_words": True,
231
+ "tokenization": False,
232
+ "strip_characters": special_characters_default,
233
+ "number_words_min_cutoff": 1,
234
+ "number_words_max_cutoff": 100000,
235
+ "cond_check_character_repetition_removal": True,
236
+ "character_repetition_length": 10,
237
+ "character_repetition_max_cutoff": 0.106,
238
+ "cond_check_word_repetition_removal": True,
239
+ "word_repetition_length": 5,
240
+ "word_repetition_max_cutoff": 0.19,
241
+ "cond_check_special_characters": True,
242
+ "special_characters": special_characters_default,
243
+ "special_characters_max_cutoff": 0.35,
244
+ "cond_words_augmentation": False,
245
+ "words_augmentation_group_sizes": [],
246
+ "words_augmentation_join_char": "",
247
+ "cond_check_stopwords": True,
248
+ "stopwords_min_cutoff": 0,
249
+ "cond_check_flagged_words": False,
250
+ "flagged_words_max_cutoff": 0.2,
251
+ "cond_check_lang_id": True,
252
+ "lang_id_min_cutoff": 0.75,
253
+ "cond_check_perplexity": True,
254
+ "perplexity_max_cutoff": 1750000,
255
+ }
256
+
257
+ parameters_filtering_en = {
258
+ "cond_uniform_whitespace": True,
259
+ "cond_replace_unicode_punctuation": False,
260
+ "cond_remove_words_with_incorrect_substrings": True,
261
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
262
+ "cond_remove_long_words": True,
263
+ "length_word_max_cutoff": 25,
264
+ "cond_check_number_words": True,
265
+ "tokenization": False,
266
+ "strip_characters": special_characters_default,
267
+ "number_words_min_cutoff": 20,
268
+ "number_words_max_cutoff": 100000,
269
+ "cond_check_character_repetition_removal": True,
270
+ "character_repetition_length": 10,
271
+ "character_repetition_max_cutoff": 0.106,
272
+ "cond_check_word_repetition_removal": True,
273
+ "word_repetition_length": 5,
274
+ "word_repetition_max_cutoff": 0.19,
275
+ "cond_check_special_characters": True,
276
+ "special_characters": special_characters_default,
277
+ "special_characters_max_cutoff": 0.4,
278
+ "cond_words_augmentation": False,
279
+ "words_augmentation_group_sizes": [],
280
+ "words_augmentation_join_char": "",
281
+ "cond_check_stopwords": True,
282
+ "stopwords_min_cutoff": 0.3,
283
+ "cond_check_flagged_words": True,
284
+ "flagged_words_max_cutoff": 0.045,
285
+ "cond_check_lang_id": True,
286
+ "lang_id_min_cutoff": 0.80,
287
+ "cond_check_perplexity": True,
288
+ "perplexity_max_cutoff": 2500,
289
+ }
290
+
291
+ parameters_filtering_es = {
292
+ "cond_uniform_whitespace": True,
293
+ "cond_replace_unicode_punctuation": False,
294
+ "cond_remove_words_with_incorrect_substrings": False,
295
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
296
+ "cond_remove_long_words": True,
297
+ "length_word_max_cutoff": 30,
298
+ "cond_check_number_words": True,
299
+ "tokenization": False,
300
+ "strip_characters": special_characters_default,
301
+ "number_words_min_cutoff": 1,
302
+ "number_words_max_cutoff": 100000,
303
+ "cond_check_character_repetition_removal": True,
304
+ "character_repetition_length": 10,
305
+ "character_repetition_max_cutoff": 0.106,
306
+ "cond_check_word_repetition_removal": True,
307
+ "word_repetition_length": 5,
308
+ "word_repetition_max_cutoff": 0.19,
309
+ "cond_check_special_characters": True,
310
+ "special_characters": special_characters_default,
311
+ "special_characters_max_cutoff": 0.3,
312
+ "cond_words_augmentation": False,
313
+ "words_augmentation_group_sizes": [],
314
+ "words_augmentation_join_char": "",
315
+ "cond_check_stopwords": True,
316
+ "stopwords_min_cutoff": 0.2,
317
+ "cond_check_flagged_words": False,
318
+ "flagged_words_max_cutoff": 0.2,
319
+ "cond_check_lang_id": True,
320
+ "lang_id_min_cutoff": 0.75,
321
+ "cond_check_perplexity": True,
322
+ "perplexity_max_cutoff": 2500000,
323
+ }
324
+
325
+ parameters_filtering_eu = {
326
+ "cond_uniform_whitespace": True,
327
+ "cond_replace_unicode_punctuation": False,
328
+ "cond_remove_words_with_incorrect_substrings": False,
329
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
330
+ "cond_remove_long_words": True,
331
+ "length_word_max_cutoff": 35,
332
+ "cond_check_number_words": True,
333
+ "tokenization": False,
334
+ "strip_characters": special_characters_default,
335
+ "number_words_min_cutoff": 1,
336
+ "number_words_max_cutoff": 100000,
337
+ "cond_check_character_repetition_removal": True,
338
+ "character_repetition_length": 10,
339
+ "character_repetition_max_cutoff": 0.106,
340
+ "cond_check_word_repetition_removal": True,
341
+ "word_repetition_length": 5,
342
+ "word_repetition_max_cutoff": 0.19,
343
+ "cond_check_special_characters": True,
344
+ "special_characters": special_characters_default,
345
+ "special_characters_max_cutoff": 0.3,
346
+ "cond_words_augmentation": False,
347
+ "words_augmentation_group_sizes": [],
348
+ "words_augmentation_join_char": "",
349
+ "cond_check_stopwords": True,
350
+ "stopwords_min_cutoff": 0,
351
+ "cond_check_flagged_words": False,
352
+ "flagged_words_max_cutoff": 0.2,
353
+ "cond_check_lang_id": True,
354
+ "lang_id_min_cutoff": 0.75,
355
+ "cond_check_perplexity": False,
356
+ "perplexity_max_cutoff": 3000000,
357
+ }
358
+
359
+ parameters_filtering_fr = {
360
+ "cond_uniform_whitespace": True,
361
+ "cond_replace_unicode_punctuation": False,
362
+ "cond_remove_words_with_incorrect_substrings": False,
363
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
364
+ "cond_remove_long_words": True,
365
+ "length_word_max_cutoff": 30,
366
+ "cond_check_number_words": True,
367
+ "tokenization": False,
368
+ "strip_characters": special_characters_default,
369
+ "number_words_min_cutoff": 1,
370
+ "number_words_max_cutoff": 100000,
371
+ "cond_check_character_repetition_removal": True,
372
+ "character_repetition_length": 10,
373
+ "character_repetition_max_cutoff": 0.106,
374
+ "cond_check_word_repetition_removal": True,
375
+ "word_repetition_length": 5,
376
+ "word_repetition_max_cutoff": 0.19,
377
+ "cond_check_special_characters": True,
378
+ "special_characters": special_characters_default,
379
+ "special_characters_max_cutoff": 0.35,
380
+ "cond_words_augmentation": False,
381
+ "words_augmentation_group_sizes": [],
382
+ "words_augmentation_join_char": "",
383
+ "cond_check_stopwords": True,
384
+ "stopwords_min_cutoff": 0.15,
385
+ "cond_check_flagged_words": False,
386
+ "flagged_words_max_cutoff": 0.2,
387
+ "cond_check_lang_id": True,
388
+ "lang_id_min_cutoff": 0.75,
389
+ "cond_check_perplexity": True,
390
+ "perplexity_max_cutoff": 3000000,
391
+ }
392
+
393
+ parameters_filtering_gu = {
394
+ "cond_uniform_whitespace": True,
395
+ "cond_replace_unicode_punctuation": False,
396
+ "cond_remove_words_with_incorrect_substrings": False,
397
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
398
+ "cond_remove_long_words": True,
399
+ "length_word_max_cutoff": 30,
400
+ "cond_check_number_words": True,
401
+ "tokenization": False,
402
+ "strip_characters": special_characters_default,
403
+ "number_words_min_cutoff": 1,
404
+ "number_words_max_cutoff": 100000,
405
+ "cond_check_character_repetition_removal": True,
406
+ "character_repetition_length": 10,
407
+ "character_repetition_max_cutoff": 0.106,
408
+ "cond_check_word_repetition_removal": True,
409
+ "word_repetition_length": 5,
410
+ "word_repetition_max_cutoff": 0.19,
411
+ "cond_check_special_characters": True,
412
+ "special_characters": special_characters_default,
413
+ "special_characters_max_cutoff": 0.3,
414
+ "cond_words_augmentation": False,
415
+ "words_augmentation_group_sizes": [],
416
+ "words_augmentation_join_char": "",
417
+ "cond_check_stopwords": True,
418
+ "stopwords_min_cutoff": 0,
419
+ "cond_check_flagged_words": False,
420
+ "flagged_words_max_cutoff": 0.2,
421
+ "cond_check_lang_id": True,
422
+ "lang_id_min_cutoff": 0.75,
423
+ "cond_check_perplexity": True,
424
+ "perplexity_max_cutoff": 250000,
425
+ }
426
+
427
+ parameters_filtering_hi = {
428
+ "cond_uniform_whitespace": True,
429
+ "cond_replace_unicode_punctuation": False,
430
+ "cond_remove_words_with_incorrect_substrings": False,
431
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
432
+ "cond_remove_long_words": True,
433
+ "length_word_max_cutoff": 25,
434
+ "cond_check_number_words": True,
435
+ "tokenization": False,
436
+ "strip_characters": special_characters_default,
437
+ "number_words_min_cutoff": 1,
438
+ "number_words_max_cutoff": 100000,
439
+ "cond_check_character_repetition_removal": True,
440
+ "character_repetition_length": 10,
441
+ "character_repetition_max_cutoff": 0.106,
442
+ "cond_check_word_repetition_removal": True,
443
+ "word_repetition_length": 5,
444
+ "word_repetition_max_cutoff": 0.19,
445
+ "cond_check_special_characters": True,
446
+ "special_characters": special_characters_default,
447
+ "special_characters_max_cutoff": 0.35,
448
+ "cond_words_augmentation": False,
449
+ "words_augmentation_group_sizes": [],
450
+ "words_augmentation_join_char": "",
451
+ "cond_check_stopwords": True,
452
+ "stopwords_min_cutoff": 0,
453
+ "cond_check_flagged_words": False,
454
+ "flagged_words_max_cutoff": 0.2,
455
+ "cond_check_lang_id": True,
456
+ "lang_id_min_cutoff": 0.75,
457
+ "cond_check_perplexity": True,
458
+ "perplexity_max_cutoff": 600000,
459
+ }
460
+
461
+ parameters_filtering_id = {
462
+ "cond_uniform_whitespace": True,
463
+ "cond_replace_unicode_punctuation": False,
464
+ "cond_remove_words_with_incorrect_substrings": False,
465
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
466
+ "cond_remove_long_words": True,
467
+ "length_word_max_cutoff": 30,
468
+ "cond_check_number_words": True,
469
+ "tokenization": False,
470
+ "strip_characters": special_characters_default,
471
+ "number_words_min_cutoff": 1,
472
+ "number_words_max_cutoff": 100000,
473
+ "cond_check_character_repetition_removal": True,
474
+ "character_repetition_length": 10,
475
+ "character_repetition_max_cutoff": 0.106,
476
+ "cond_check_word_repetition_removal": True,
477
+ "word_repetition_length": 5,
478
+ "word_repetition_max_cutoff": 0.19,
479
+ "cond_check_special_characters": True,
480
+ "special_characters": special_characters_default,
481
+ "special_characters_max_cutoff": 0.25,
482
+ "cond_words_augmentation": False,
483
+ "words_augmentation_group_sizes": [],
484
+ "words_augmentation_join_char": "",
485
+ "cond_check_stopwords": True,
486
+ "stopwords_min_cutoff": 0.25,
487
+ "cond_check_flagged_words": False,
488
+ "flagged_words_max_cutoff": 0.2,
489
+ "cond_check_lang_id": True,
490
+ "lang_id_min_cutoff": 0.75,
491
+ "cond_check_perplexity": True,
492
+ "perplexity_max_cutoff": 2500000,
493
+ }
494
+
495
+ parameters_filtering_kn = {
496
+ "cond_uniform_whitespace": True,
497
+ "cond_replace_unicode_punctuation": False,
498
+ "cond_remove_words_with_incorrect_substrings": False,
499
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
500
+ "cond_remove_long_words": True,
501
+ "length_word_max_cutoff": 50,
502
+ "cond_check_number_words": True,
503
+ "tokenization": False,
504
+ "strip_characters": special_characters_default,
505
+ "number_words_min_cutoff": 1,
506
+ "number_words_max_cutoff": 100000,
507
+ "cond_check_character_repetition_removal": True,
508
+ "character_repetition_length": 10,
509
+ "character_repetition_max_cutoff": 0.106,
510
+ "cond_check_word_repetition_removal": True,
511
+ "word_repetition_length": 5,
512
+ "word_repetition_max_cutoff": 0.19,
513
+ "cond_check_special_characters": True,
514
+ "special_characters": special_characters_default,
515
+ "special_characters_max_cutoff": 0.25,
516
+ "cond_words_augmentation": False,
517
+ "words_augmentation_group_sizes": [],
518
+ "words_augmentation_join_char": "",
519
+ "cond_check_stopwords": True,
520
+ "stopwords_min_cutoff": 0,
521
+ "cond_check_flagged_words": False,
522
+ "flagged_words_max_cutoff": 0.2,
523
+ "cond_check_lang_id": True,
524
+ "lang_id_min_cutoff": 0.75,
525
+ "cond_check_perplexity": True,
526
+ "perplexity_max_cutoff": 400000,
527
+ }
528
+
529
+ parameters_filtering_ml = {
530
+ "cond_uniform_whitespace": True,
531
+ "cond_replace_unicode_punctuation": False,
532
+ "cond_remove_words_with_incorrect_substrings": False,
533
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
534
+ "cond_remove_long_words": True,
535
+ "length_word_max_cutoff": 50,
536
+ "cond_check_number_words": True,
537
+ "tokenization": False,
538
+ "strip_characters": special_characters_default,
539
+ "number_words_min_cutoff": 1,
540
+ "number_words_max_cutoff": 100000,
541
+ "cond_check_character_repetition_removal": True,
542
+ "character_repetition_length": 10,
543
+ "character_repetition_max_cutoff": 0.106,
544
+ "cond_check_word_repetition_removal": True,
545
+ "word_repetition_length": 5,
546
+ "word_repetition_max_cutoff": 0.19,
547
+ "cond_check_special_characters": True,
548
+ "special_characters": special_characters_default,
549
+ "special_characters_max_cutoff": 0.2,
550
+ "cond_words_augmentation": False,
551
+ "words_augmentation_group_sizes": [],
552
+ "words_augmentation_join_char": "",
553
+ "cond_check_stopwords": True,
554
+ "stopwords_min_cutoff": 0,
555
+ "cond_check_flagged_words": False,
556
+ "flagged_words_max_cutoff": 0.2,
557
+ "cond_check_lang_id": True,
558
+ "lang_id_min_cutoff": 0.75,
559
+ "cond_check_perplexity": True,
560
+ "perplexity_max_cutoff": 1600000,
561
+ }
562
+
563
+ parameters_filtering_mr = {
564
+ "cond_uniform_whitespace": True,
565
+ "cond_replace_unicode_punctuation": False,
566
+ "cond_remove_words_with_incorrect_substrings": False,
567
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
568
+ "cond_remove_long_words": True,
569
+ "length_word_max_cutoff": 30,
570
+ "cond_check_number_words": True,
571
+ "tokenization": False,
572
+ "strip_characters": special_characters_default,
573
+ "number_words_min_cutoff": 1,
574
+ "number_words_max_cutoff": 100000,
575
+ "cond_check_character_repetition_removal": True,
576
+ "character_repetition_length": 10,
577
+ "character_repetition_max_cutoff": 0.106,
578
+ "cond_check_word_repetition_removal": True,
579
+ "word_repetition_length": 5,
580
+ "word_repetition_max_cutoff": 0.19,
581
+ "cond_check_special_characters": True,
582
+ "special_characters": special_characters_default,
583
+ "special_characters_max_cutoff": 0.25,
584
+ "cond_words_augmentation": False,
585
+ "words_augmentation_group_sizes": [],
586
+ "words_augmentation_join_char": "",
587
+ "cond_check_stopwords": True,
588
+ "stopwords_min_cutoff": 0,
589
+ "cond_check_flagged_words": False,
590
+ "flagged_words_max_cutoff": 0.2,
591
+ "cond_check_lang_id": True,
592
+ "lang_id_min_cutoff": 0.75,
593
+ "cond_check_perplexity": True,
594
+ "perplexity_max_cutoff": 425000,
595
+ }
596
+
597
+ parameters_filtering_pt = {
598
+ "cond_uniform_whitespace": True,
599
+ "cond_replace_unicode_punctuation": False,
600
+ "cond_remove_words_with_incorrect_substrings": False,
601
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
602
+ "cond_remove_long_words": True,
603
+ "length_word_max_cutoff": 30,
604
+ "cond_check_number_words": True,
605
+ "tokenization": False,
606
+ "strip_characters": special_characters_default,
607
+ "number_words_min_cutoff": 1,
608
+ "number_words_max_cutoff": 100000,
609
+ "cond_check_character_repetition_removal": True,
610
+ "character_repetition_length": 10,
611
+ "character_repetition_max_cutoff": 0.106,
612
+ "cond_check_word_repetition_removal": True,
613
+ "word_repetition_length": 5,
614
+ "word_repetition_max_cutoff": 0.19,
615
+ "cond_check_special_characters": True,
616
+ "special_characters": special_characters_default,
617
+ "special_characters_max_cutoff": 0.3,
618
+ "cond_words_augmentation": False,
619
+ "words_augmentation_group_sizes": [],
620
+ "words_augmentation_join_char": "",
621
+ "cond_check_stopwords": True,
622
+ "stopwords_min_cutoff": 0.15,
623
+ "cond_check_flagged_words": False,
624
+ "flagged_words_max_cutoff": 0.2,
625
+ "cond_check_lang_id": True,
626
+ "lang_id_min_cutoff": 0.75,
627
+ "cond_check_perplexity": True,
628
+ "perplexity_max_cutoff": 3000000,
629
+ }
630
+
631
+ parameters_filtering_sw = {
632
+ "cond_uniform_whitespace": True,
633
+ "cond_replace_unicode_punctuation": False,
634
+ "cond_remove_words_with_incorrect_substrings": False,
635
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
636
+ "cond_remove_long_words": True,
637
+ "length_word_max_cutoff": 30,
638
+ "cond_check_number_words": True,
639
+ "tokenization": False,
640
+ "strip_characters": special_characters_default,
641
+ "number_words_min_cutoff": 1,
642
+ "number_words_max_cutoff": 100000,
643
+ "cond_check_character_repetition_removal": True,
644
+ "character_repetition_length": 10,
645
+ "character_repetition_max_cutoff": 0.106,
646
+ "cond_check_word_repetition_removal": True,
647
+ "word_repetition_length": 5,
648
+ "word_repetition_max_cutoff": 0.19,
649
+ "cond_check_special_characters": True,
650
+ "special_characters": special_characters_default,
651
+ "special_characters_max_cutoff": 0.275,
652
+ "cond_words_augmentation": False,
653
+ "words_augmentation_group_sizes": [],
654
+ "words_augmentation_join_char": "",
655
+ "cond_check_stopwords": True,
656
+ "stopwords_min_cutoff": 0,
657
+ "cond_check_flagged_words": False,
658
+ "flagged_words_max_cutoff": 0.2,
659
+ "cond_check_lang_id": True,
660
+ "lang_id_min_cutoff": 0.75,
661
+ "cond_check_perplexity": False,
662
+ "perplexity_max_cutoff": 3000000,
663
+ }
664
+
665
+ parameters_filtering_ta = {
666
+ "cond_uniform_whitespace": True,
667
+ "cond_replace_unicode_punctuation": False,
668
+ "cond_remove_words_with_incorrect_substrings": False,
669
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
670
+ "cond_remove_long_words": True,
671
+ "length_word_max_cutoff": 50,
672
+ "cond_check_number_words": True,
673
+ "tokenization": False,
674
+ "strip_characters": special_characters_default,
675
+ "number_words_min_cutoff": 1,
676
+ "number_words_max_cutoff": 100000,
677
+ "cond_check_character_repetition_removal": True,
678
+ "character_repetition_length": 10,
679
+ "character_repetition_max_cutoff": 0.106,
680
+ "cond_check_word_repetition_removal": True,
681
+ "word_repetition_length": 5,
682
+ "word_repetition_max_cutoff": 0.19,
683
+ "cond_check_special_characters": True,
684
+ "special_characters": special_characters_default,
685
+ "special_characters_max_cutoff": 0.25,
686
+ "cond_words_augmentation": False,
687
+ "words_augmentation_group_sizes": [],
688
+ "words_augmentation_join_char": "",
689
+ "cond_check_stopwords": True,
690
+ "stopwords_min_cutoff": 0,
691
+ "cond_check_flagged_words": False,
692
+ "flagged_words_max_cutoff": 0.2,
693
+ "cond_check_lang_id": True,
694
+ "lang_id_min_cutoff": 0.75,
695
+ "cond_check_perplexity": False,
696
+ "perplexity_max_cutoff": 3000000,
697
+ }
698
+
699
+ parameters_filtering_te = {
700
+ "cond_uniform_whitespace": True,
701
+ "cond_replace_unicode_punctuation": False,
702
+ "cond_remove_words_with_incorrect_substrings": False,
703
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
704
+ "cond_remove_long_words": True,
705
+ "length_word_max_cutoff": 35,
706
+ "cond_check_number_words": True,
707
+ "tokenization": False,
708
+ "strip_characters": special_characters_default,
709
+ "number_words_min_cutoff": 1,
710
+ "number_words_max_cutoff": 100000,
711
+ "cond_check_character_repetition_removal": True,
712
+ "character_repetition_length": 10,
713
+ "character_repetition_max_cutoff": 0.106,
714
+ "cond_check_word_repetition_removal": True,
715
+ "word_repetition_length": 5,
716
+ "word_repetition_max_cutoff": 0.19,
717
+ "cond_check_special_characters": True,
718
+ "special_characters": special_characters_default,
719
+ "special_characters_max_cutoff": 0.25,
720
+ "cond_words_augmentation": False,
721
+ "words_augmentation_group_sizes": [],
722
+ "words_augmentation_join_char": "",
723
+ "cond_check_stopwords": True,
724
+ "stopwords_min_cutoff": 0,
725
+ "cond_check_flagged_words": False,
726
+ "flagged_words_max_cutoff": 0.2,
727
+ "cond_check_lang_id": True,
728
+ "lang_id_min_cutoff": 0.75,
729
+ "cond_check_perplexity": False,
730
+ "perplexity_max_cutoff": 3000000,
731
+ }
732
+
733
+ parameters_filtering_ur = {
734
+ "cond_uniform_whitespace": True,
735
+ "cond_replace_unicode_punctuation": False,
736
+ "cond_remove_words_with_incorrect_substrings": False,
737
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
738
+ "cond_remove_long_words": True,
739
+ "length_word_max_cutoff": 30,
740
+ "cond_check_number_words": True,
741
+ "tokenization": False,
742
+ "strip_characters": special_characters_default,
743
+ "number_words_min_cutoff": 1,
744
+ "number_words_max_cutoff": 100000,
745
+ "cond_check_character_repetition_removal": True,
746
+ "character_repetition_length": 10,
747
+ "character_repetition_max_cutoff": 0.106,
748
+ "cond_check_word_repetition_removal": True,
749
+ "word_repetition_length": 5,
750
+ "word_repetition_max_cutoff": 0.19,
751
+ "cond_check_special_characters": True,
752
+ "special_characters": special_characters_default,
753
+ "special_characters_max_cutoff": 0.4,
754
+ "cond_words_augmentation": False,
755
+ "words_augmentation_group_sizes": [],
756
+ "words_augmentation_join_char": "",
757
+ "cond_check_stopwords": True,
758
+ "stopwords_min_cutoff": 0,
759
+ "cond_check_flagged_words": False,
760
+ "flagged_words_max_cutoff": 0.2,
761
+ "cond_check_lang_id": True,
762
+ "lang_id_min_cutoff": 0.75,
763
+ "cond_check_perplexity": False,
764
+ "perplexity_max_cutoff": 3000000,
765
+ }
766
+
767
+ parameters_filtering_vi = {
768
+ "cond_uniform_whitespace": True,
769
+ "cond_replace_unicode_punctuation": False,
770
+ "cond_remove_words_with_incorrect_substrings": False,
771
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
772
+ "cond_remove_long_words": True,
773
+ "length_word_max_cutoff": 30,
774
+ "cond_check_number_words": True,
775
+ "tokenization": False,
776
+ "strip_characters": special_characters_default,
777
+ "number_words_min_cutoff": 1,
778
+ "number_words_max_cutoff": 100000,
779
+ "cond_check_character_repetition_removal": True,
780
+ "character_repetition_length": 10,
781
+ "character_repetition_max_cutoff": 0.106,
782
+ "cond_check_word_repetition_removal": True,
783
+ "word_repetition_length": 5,
784
+ "word_repetition_max_cutoff": 0.19,
785
+ "cond_check_special_characters": True,
786
+ "special_characters": special_characters_default,
787
+ "special_characters_max_cutoff": 0.35,
788
+ "cond_words_augmentation": True,
789
+ "words_augmentation_group_sizes": [2],
790
+ "words_augmentation_join_char": " ",
791
+ "cond_check_stopwords": True,
792
+ "stopwords_min_cutoff": 0,
793
+ "cond_check_flagged_words": False,
794
+ "flagged_words_max_cutoff": 0.2,
795
+ "cond_check_lang_id": True,
796
+ "lang_id_min_cutoff": 0.75,
797
+ "cond_check_perplexity": False,
798
+ "perplexity_max_cutoff": 3000000,
799
+ }
800
+
801
+ parameters_filtering_yo = {
802
+ "cond_uniform_whitespace": True,
803
+ "cond_replace_unicode_punctuation": False,
804
+ "cond_remove_words_with_incorrect_substrings": False,
805
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
806
+ "cond_remove_long_words": True,
807
+ "length_word_max_cutoff": 30,
808
+ "cond_check_number_words": True,
809
+ "tokenization": False,
810
+ "strip_characters": special_characters_default,
811
+ "number_words_min_cutoff": 1,
812
+ "number_words_max_cutoff": 100000,
813
+ "cond_check_character_repetition_removal": True,
814
+ "character_repetition_length": 10,
815
+ "character_repetition_max_cutoff": 0.106,
816
+ "cond_check_word_repetition_removal": True,
817
+ "word_repetition_length": 5,
818
+ "word_repetition_max_cutoff": 0.19,
819
+ "cond_check_special_characters": True,
820
+ "special_characters": special_characters_default,
821
+ "special_characters_max_cutoff": 0.3,
822
+ "cond_words_augmentation": False,
823
+ "words_augmentation_group_sizes": [],
824
+ "words_augmentation_join_char": "",
825
+ "cond_check_stopwords": True,
826
+ "stopwords_min_cutoff": 0,
827
+ "cond_check_flagged_words": False,
828
+ "flagged_words_max_cutoff": 0.2,
829
+ "cond_check_lang_id": True,
830
+ "lang_id_min_cutoff": 0.75,
831
+ "cond_check_perplexity": False,
832
+ "perplexity_max_cutoff": 3000000,
833
+ }
834
+
835
+ parameters_filtering_zh = {
836
+ "cond_uniform_whitespace": True,
837
+ "cond_replace_unicode_punctuation": False,
838
+ "cond_remove_words_with_incorrect_substrings": False,
839
+ "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
840
+ "cond_remove_long_words": False,
841
+ "length_word_max_cutoff": 1000,
842
+ "cond_check_number_words": True,
843
+ "tokenization": True,
844
+ "strip_characters": special_characters_default,
845
+ "number_words_min_cutoff": 1,
846
+ "number_words_max_cutoff": 100000,
847
+ "cond_check_character_repetition_removal": True,
848
+ "character_repetition_length": 10,
849
+ "character_repetition_max_cutoff": 0.106,
850
+ "cond_check_word_repetition_removal": True,
851
+ "word_repetition_length": 5,
852
+ "word_repetition_max_cutoff": 0.19,
853
+ "cond_check_special_characters": True,
854
+ "special_characters": special_characters_default,
855
+ "special_characters_max_cutoff": 0.4,
856
+ "cond_words_augmentation": True,
857
+ "words_augmentation_group_sizes": [2],
858
+ "words_augmentation_join_char": "",
859
+ "cond_check_stopwords": False,
860
+ "stopwords_min_cutoff": 0,
861
+ "cond_check_flagged_words": False,
862
+ "flagged_words_max_cutoff": 0.2,
863
+ "cond_check_lang_id": True,
864
+ "lang_id_min_cutoff": 0.75,
865
+ "cond_check_perplexity": False,
866
+ "perplexity_max_cutoff": 3000000,
867
+ }
868
+
869
+ parameters_filtering = {
870
+ "default": parameters_filtering_default,
871
+ "af": parameters_filtering_af,
872
+ "ar": parameters_filtering_ar,
873
+ "arz": parameters_filtering_arz,
874
+ "as": parameters_filtering_as,
875
+ "bn": parameters_filtering_bn,
876
+ "ca": parameters_filtering_ca,
877
+ "en": parameters_filtering_en,
878
+ "es": parameters_filtering_es,
879
+ "eu": parameters_filtering_eu,
880
+ "fr": parameters_filtering_fr,
881
+ "gu": parameters_filtering_gu,
882
+ "hi": parameters_filtering_hi,
883
+ "id": parameters_filtering_id,
884
+ "kn": parameters_filtering_kn,
885
+ "ml": parameters_filtering_ml,
886
+ "mr": parameters_filtering_mr,
887
+ "pt": parameters_filtering_pt,
888
+ "sw": parameters_filtering_sw,
889
+ "ta": parameters_filtering_ta,
890
+ "te": parameters_filtering_te,
891
+ "ur": parameters_filtering_ur,
892
+ "vi": parameters_filtering_vi,
893
+ "yo": parameters_filtering_yo,
894
+ "zh": parameters_filtering_zh,
895
+ }
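parameters_filtering maps a language's dataset_id to its tuned thresholds, with the "default" entry covering every other language. A short sketch of the lookup; get_filtering_parameters is an illustrative name, not a function defined in this commit:

    def get_filtering_parameters(lang_dataset_id: str) -> dict:
        """Per-language thresholds if tuned, otherwise the default set."""
        return parameters_filtering.get(lang_dataset_id, parameters_filtering["default"])

    # get_filtering_parameters("en")["perplexity_max_cutoff"] == 2500, while an
    # untuned code such as "de" falls back to the default thresholds.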
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ fasttext
+ sentencepiece
+ https://github.com/kpu/kenlm/archive/master.zip
+ emoji
stopwords.py ADDED
The diff for this file is too large to render. See raw diff
zh.arpa.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:240f156d70a4b04cb078b4f127ae0103378454143a77442c18e5e24b93404e56
+ size 3635106545
zh.sp.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2189b2cc84a513a76d24f9a0154e52f0afaf3010dc5fd1034ed37c9d2b5970
+ size 876286
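zh.arpa.bin (a KenLM language model) and zh.sp.model (its SentencePiece tokenizer) work as a pair: text is tokenized with SentencePiece before being scored by KenLM, and the resulting perplexity is compared against perplexity_max_cutoff. A minimal sketch, assuming kenlm and sentencepiece are installed and both files are local; the exact scoring used by the filtering code may differ in detail:

    import kenlm
    import sentencepiece

    sp = sentencepiece.SentencePieceProcessor()
    sp.load("zh.sp.model")
    lm = kenlm.Model("zh.arpa.bin")

    def perplexity(document: str) -> float:
        tokenized = " ".join(sp.encode_as_pieces(document))
        log10_prob = lm.score(tokenized)          # total log10 probability of the sequence
        n_tokens = len(tokenized.split()) + 1     # + 1 for the end-of-sentence token
        return 10.0 ** (-log10_prob / n_tokens)

    # Documents whose perplexity exceeds the language's perplexity_max_cutoff
    # would be dropped when cond_check_perplexity is enabled for that language.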
zh_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:318cf4641a46c9c7c16fc77171f28475cb8e96935201d3541d493b5231e8d53a
+ size 63524762