Commit da13b29 by HugoLaurencon (HF staff)
Parent: 5d56c36

distributions for the filters on words and discarded words by filter

Files changed (2):
  1. app.py (+139, -66)
  2. en_examples_with_stats.json (+2, -2)
app.py CHANGED
@@ -112,6 +112,12 @@ class Visualization:
     def set_title(self):
         st.title(f"Filtering visualization")

+    @staticmethod
+    def print_discarded_by_cond(cond):
+        st.caption(
+            f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
+        )
+
     @staticmethod
     def plot_hist(dataframe, key, num_bins=50):
         checkbox = st.checkbox(
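
Note: the new print_discarded_by_cond helper turns a Boolean keep-mask into the share of examples a filter would drop. A minimal standalone sketch of the same calculation (plain numpy, no Streamlit; the toy mask is illustrative):

    import numpy as np

    # keep-mask produced by one filter: True = kept, False = discarded
    cond = np.array([True, True, False, True])

    # same formula as in print_discarded_by_cond
    pct = (len(cond) - np.sum(1 * cond)) / len(cond) * 100
    print(f"{pct:.2f}% of the total is discarded with this filter.")  # 25.00%
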
@@ -130,6 +136,17 @@ class Visualization:
             ax.axvline(x=key[1], color="r", linestyle="dashed")
             st.pyplot(fig)

+    @staticmethod
+    def display_dataset(dataframe, cond, description, type_of_examples):
+        displayed_examples = dataframe.loc[cond]
+        st.subheader(
+            f"{description}: {len(displayed_examples)} {type_of_examples} ({len(displayed_examples) / len(dataframe.index) * 100:.2f}%)"
+        )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
+        )
+        st.dataframe(displayed_examples)
+
     def filtering_of_docs(self):
         st.sidebar.subheader("Parameters of the filtering on documents")

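
Note: display_dataset generalizes the documents-only closure deleted in a later hunk; the dataframe is now an argument, so the same code renders discarded and retained documents as well as words. A sketch of the pattern outside the class (toy dataframe, assumed names; run with "streamlit run"):

    import pandas as pd
    import streamlit as st

    docs = pd.DataFrame({"text": ["a b", "c", "d e f"], "number_words": [2, 1, 3]})
    cond = docs["number_words"] >= 2  # Boolean keep-mask

    # same steps as display_dataset: subset, headline with a percentage, table
    displayed = docs.loc[cond]
    st.subheader(
        f"Retained documents: {len(displayed)} docs ({len(displayed) / len(docs.index) * 100:.2f}%)"
    )
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(displayed)
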
@@ -143,11 +160,6 @@ class Visualization:
                 return self.docs[key] <= cutoff
             return self.docs[key] >= cutoff

-        def print_discared_by_cond(cond):
-            st.caption(
-                f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
-            )
-
         if "number_words" in columns:
             with st.sidebar.expander("Number of words"):
                 cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
@@ -159,7 +171,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond_1)
+                Visualization.print_discarded_by_cond(cond_1)

                 cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
                 cutoff_max_number_words = st.slider(
@@ -168,7 +180,7 @@ class Visualization:
                 new_key = ("number_words", cutoff_max_number_words, True)
                 keys.append(new_key)
                 cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond_2)
+                Visualization.print_discarded_by_cond(cond_2)

                 conds["number_words"] = [cond_1, cond_2]

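
Note: number_words is the only filter with two cutoffs; both keep-masks are stored under one key and AND-ed downstream with np.all(..., axis=0). A small numpy sketch of that composition (toy values):

    import numpy as np

    number_words = np.array([5, 120, 9000, 300])
    cond_1 = number_words >= 10    # minimum cutoff: drop too-short documents
    cond_2 = number_words <= 1000  # maximum cutoff: drop too-long documents
    keep = np.all([cond_1, cond_2], axis=0)
    print(keep)  # [False  True False  True]
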
@@ -216,7 +228,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["repetitions_ratio"] = [cond]

         if "special_characters_ratio" in columns:
@@ -233,7 +245,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["special_characters_ratio"] = [cond]

         if "stopwords_ratio" in columns:
@@ -269,7 +281,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["stopwords_ratio"] = [cond]

         if "flagged_words_ratio" in columns:
@@ -298,14 +310,15 @@ class Visualization:
                     new_flagged_words,
                 )
                 cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
+                max_fwr = np.max(self.docs["flagged_words_ratio"])
                 cutoff_flagged_words_ratio = st.slider(
-                    cutoff_def, 0.0, 1.0, 1.0, step=0.01
+                    cutoff_def, 0.0, max_fwr, max_fwr, step=0.001
                 )
                 new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["flagged_words_ratio"] = [cond]

         if "lang_id_score" in columns:
@@ -318,7 +331,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["lang_id_score"] = [cond]

         if "perplexity_score" in columns:
@@ -330,7 +343,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["perplexity_score"] = [cond]

         return keys, conds
@@ -348,17 +361,9 @@ class Visualization:
                 f"Filtering on documents, for {self.num_docs} {self.lang} documents"
             )

-            def display_dataset(cond, description):
-                displayed_docs = self.docs.loc[cond]
-                st.subheader(
-                    f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
-                )
-                st.markdown(
-                    "Click on a column to sort by it, place the cursor on the text to display it."
-                )
-                st.dataframe(displayed_docs)
-
-            display_dataset(np.invert(all_conds), "Discarded documents")
+            Visualization.display_dataset(
+                self.docs, np.invert(all_conds), "Discarded documents", "docs"
+            )

             # st.subheader("Display discarded documents by filter")
             display_discarded_documents_by_filter = st.checkbox(
@@ -370,58 +375,74 @@ class Visualization:

                 if "number_words" in columns:
                     cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the number of words",
+                        "docs",
                     )

                 if "repetitions_ratio" in columns:
                     cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the repetitions ratio",
+                        "docs",
                     )

                 if "special_characters_ratio" in columns:
                     cond_filter = np.invert(
                         np.all(conds["special_characters_ratio"], axis=0)
                     )
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the special characters ratio",
+                        "docs",
                     )

                 if "stopwords_ratio" in columns:
                     cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the stop words ratio",
+                        "docs",
                     )

                 if "flagged_words_ratio" in columns:
                     cond_filter = np.invert(
                         np.all(conds["flagged_words_ratio"], axis=0)
                     )
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the flagged words ratio",
+                        "docs",
                     )

                 if "lang_id_score" in columns:
                     cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the language identification confidence score",
+                        "docs",
                     )

                 if "perplexity_score" in columns:
                     cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the perplexity score",
+                        "docs",
                     )

-            display_dataset(all_conds, "Retained documents")
+            Visualization.display_dataset(
+                self.docs, all_conds, "Retained documents", "docs"
+            )

             st.header("Download data")

@@ -434,57 +455,109 @@ class Visualization:

     def filtering_of_words(self):
         if not (self.words is None):
+            columns = list(self.words)
+
             st.sidebar.subheader("Parameter of the filtering on words")

-            with st.sidebar.expander("Length of words"):
-                cutoff_def = "If the length of a word is higher than this number, the word is removed."
-                max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
-                cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
-                new_key = ("len_word", cutoff_word, True)
-                self.parameters.append(new_key)
-                Visualization.plot_hist(self.words, new_key)
-
-            with st.sidebar.expander("Words with incorrect substrings"):
-                incorrect_substrings = st.checkbox(
-                    "Remove words with incorrect substrings."
-                )
-                self.parameters.append(("incorrect_substrings", incorrect_substrings))
-
-            cond_words = self.words["len_word"] <= cutoff_word
-            if incorrect_substrings:
-                cond_words = cond_words & np.invert(
-                    self.words["incorrect_substring"]
-                )
+            conds_words = {}
+
+            if "len_word" in columns:
+                with st.sidebar.expander("Length of words"):
+                    cutoff_def = "If the length of a word is higher than this number, the word is removed."
+                    max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
+                    cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
+                    new_key = ("len_word", cutoff_word, True)
+                    self.parameters.append(new_key)
+                    Visualization.plot_hist(self.words, new_key)
+                    cond_len_words = self.words["len_word"] <= cutoff_word
+                    Visualization.print_discarded_by_cond(cond_len_words)
+                    conds_words["len_word"] = cond_len_words
+
+            if "incorrect_substrings" in columns:
+                with st.sidebar.expander("Words with incorrect substrings"):
+                    incorrect_substrings = st.checkbox(
+                        "Remove words with incorrect substrings."
+                    )
+                    self.parameters.append(
+                        ("incorrect_substrings", incorrect_substrings)
+                    )
+
+                    checkbox = st.checkbox(
+                        "Display distribution",
+                        value=True,
+                        key="display_distribution_incorrect_substrings",
+                    )
+                    if checkbox:
+                        incor_sub = np.array(self.words["incorrect_substrings"]) * 1
+                        with_incor_sub = np.sum(incor_sub)
+                        without_incor_sub = len(incor_sub) - with_incor_sub
+                        st.markdown(
+                            f"Number of words with incorrect substrings: {with_incor_sub}"
+                        )
+                        st.markdown(
+                            f"Number of words without incorrect substrings: {without_incor_sub}"
+                        )
+
+                    if incorrect_substrings:
+                        cond_incorrect_substrings = np.invert(
+                            self.words["incorrect_substrings"]
+                        )
+                    else:
+                        cond_incorrect_substrings = np.array(
+                            [
+                                True
+                                for i in range(len(self.words["incorrect_substrings"]))
+                            ]
+                        )
+                    Visualization.print_discarded_by_cond(cond_incorrect_substrings)
+                    conds_words["incorrect_substrings"] = cond_incorrect_substrings
+
+            all_conds_words = np.all(list(conds_words.values()), axis=0)

             with st.expander(
-                f"Filtering on words, for {self.num_docs} {self.lang} documents"
+                f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
             ):
                 st.header(
-                    f"Filtering on words, for {self.num_docs} {self.lang} documents"
+                    f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
                 )

                 st.markdown(
                     f"Since the number of words is way larger than the number of documents, "
-                    f"we consider in this section words for the first {self.num_docs_for_words} documents only."
+                    f"we consider in this section words for only {self.num_docs_for_words} documents."
                 )

-                discarded_words = self.words.loc[np.invert(cond_words)]
-                st.subheader(
-                    f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
-                )
-                st.markdown(
-                    "Click on a column to sort by it, place the cursor on the text to display it."
-                )
-                st.dataframe(discarded_words)
-
-                retained_words = self.words.loc[cond_words]
-                st.subheader(
-                    f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
-                )
-                st.markdown(
-                    "Click on a column to sort by it, place the cursor on the text to display it."
-                )
-                st.dataframe(retained_words)
+                Visualization.display_dataset(
+                    self.words, np.invert(all_conds_words), "Discarded words", "words"
+                )
+
+                # st.subheader("Display discarded words by filter")
+                display_discarded_words_by_filter = st.checkbox(
+                    "Display discarded words by filter"
+                )
+
+                if display_discarded_words_by_filter:
+
+                    if "len_word" in columns:
+                        cond_filter = np.invert(conds_words["len_word"])
+                        Visualization.display_dataset(
+                            self.words,
+                            cond_filter,
+                            "Discarded words for the filter on length",
+                            "words",
+                        )
+
+                    if "incorrect_substrings" in columns:
+                        cond_filter = np.invert(conds_words["incorrect_substrings"])
+                        Visualization.display_dataset(
+                            self.words,
+                            cond_filter,
+                            "Discarded words for the filter on incorrect substrings",
+                            "words",
+                        )
+
+                Visualization.display_dataset(
+                    self.words, all_conds_words, "Retained words", "words"
+                )

     def download_parameters(self):
         st.sidebar.subheader("Download parameters")

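
Note: the rewritten filtering_of_words mirrors the document pipeline: one keep-mask per word filter, collected in conds_words and combined with np.all, with an all-True fallback when the incorrect-substrings checkbox is off. A condensed sketch (toy arrays; np.ones(len(...), dtype=bool) would be an equivalent, slightly more idiomatic fallback than the list comprehension):

    import numpy as np

    len_word = np.array([3, 45, 7])
    incorrect_substrings_col = np.array([False, False, True])
    remove_incorrect = True  # state of the sidebar checkbox

    conds_words = {"len_word": len_word <= 30}
    if remove_incorrect:
        conds_words["incorrect_substrings"] = np.invert(incorrect_substrings_col)
    else:
        conds_words["incorrect_substrings"] = np.array(
            [True for _ in range(len(incorrect_substrings_col))]
        )

    all_conds_words = np.all(list(conds_words.values()), axis=0)
    print(all_conds_words)  # [ True False False]
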
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ffbb8afeba42822e4b10341112999321e0e14a19a5eeebc342dc68a9f65d3c7f
-size 237426014
+oid sha256:29417f05cc029ab24ba89cfc4358dac755411b01f1925c735c2205b68f975fcc
+size 240781004