HugoLaurencon committed
Commit 061d2e4
1 Parent(s): 4809033

add register information

app.py CHANGED
@@ -120,8 +120,6 @@ class Visualization_for_lang:
         st.dataframe(displayed_examples)
 
     def filtering_of_docs(self):
-        st.sidebar.subheader("Parameters of the filtering on documents")
-
         def set_sliders():
             columns = list(self.docs)
             keys = []
@@ -377,12 +375,6 @@ class Visualization_for_lang:
 
             return keys, conds
 
-        self.keys, conds = set_sliders()
-        self.parameters = self.keys * 1
-
-        all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
-        all_conds = np.all(all_conds, axis=0)
-
         with st.expander(
             f"Filtering on documents, for {self.num_docs} {self.lang} documents"
         ):
@@ -390,101 +382,146 @@ class Visualization_for_lang:
                 f"Filtering on documents, for {self.num_docs} {self.lang} documents"
             )
 
-            Visualization_for_lang.display_dataset(
-                self.docs, np.invert(all_conds), "Discarded documents", "docs"
-            )
-
-            # st.subheader("Display discarded documents by filter")
-            display_discarded_documents_by_filter = st.checkbox(
-                "Display discarded documents by filter"
-            )
-
-            if display_discarded_documents_by_filter:
-                columns = list(self.docs)
-
-                if "number_words" in columns:
-                    cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the number of words",
-                        "docs",
-                    )
-
-                if "character_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["character_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the character repetition ratio",
-                        "docs",
-                    )
-
-                if "word_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["word_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the word repetition ratio",
-                        "docs",
-                    )
-
-                if "special_characters_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["special_characters_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the special characters ratio",
-                        "docs",
-                    )
-
-                if "stopwords_ratio" in columns:
-                    cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the stop words ratio",
-                        "docs",
-                    )
-
-                if "flagged_words_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["flagged_words_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the flagged words ratio",
-                        "docs",
-                    )
-
-                if "lang_id_score" in columns:
-                    cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the language identification confidence score",
-                        "docs",
-                    )
-
-                if "perplexity_score" in columns:
-                    cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the perplexity score",
-                        "docs",
-                    )
-
-            Visualization_for_lang.display_dataset(
-                self.docs, all_conds, "Retained documents", "docs"
-            )
+            if "labels" in list(self.docs):
+                chosen_label = st.selectbox(
+                    label="Consider only documents that include the following label",
+                    options=[
+                        "All",
+                        "NA: Narrative",
+                        "IN: Informational Description",
+                        "OP: Opinion",
+                        "ID: Interactive Discussion",
+                        "HI: How-to/Instruction",
+                        "IP: Informational Persuasion",
+                        "LY: Lyrical",
+                        "SP: Spoken",
+                    ],
+                )
+                chosen_label = chosen_label.split(":")[0]
+                if chosen_label != "All":
+                    cond_label = list(
+                        self.docs["labels"].apply(
+                            lambda x: True if chosen_label in x else False
+                        )
+                    )
+                    self.docs = self.docs[cond_label]
+
+            if self.docs.empty:
+                st.markdown(
+                    "No document to display, please try to select a different label."
+                )
+                self.keys = []
+                self.parameters = []
+
+            else:
+                st.sidebar.subheader("Parameters of the filtering on documents")
+                self.keys, conds = set_sliders()
+                self.parameters = self.keys * 1
+
+                all_conds = [
+                    subcond for cond in list(conds.values()) for subcond in cond
+                ]
+                all_conds = np.all(all_conds, axis=0)
+
+                Visualization_for_lang.display_dataset(
+                    self.docs, np.invert(all_conds), "Discarded documents", "docs"
+                )
+
+                # st.subheader("Display discarded documents by filter")
+                display_discarded_documents_by_filter = st.checkbox(
+                    "Display discarded documents by filter"
+                )
+
+                if display_discarded_documents_by_filter:
+                    columns = list(self.docs)
+
+                    if "number_words" in columns:
+                        cond_filter = np.invert(np.all(conds["number_words"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the number of words",
+                            "docs",
+                        )
+
+                    if "character_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["character_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the character repetition ratio",
+                            "docs",
+                        )
+
+                    if "word_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["word_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the word repetition ratio",
+                            "docs",
+                        )
+
+                    if "special_characters_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["special_characters_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the special characters ratio",
+                            "docs",
+                        )
+
+                    if "stopwords_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["stopwords_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the stop words ratio",
+                            "docs",
+                        )
+
+                    if "flagged_words_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["flagged_words_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the flagged words ratio",
+                            "docs",
+                        )
+
+                    if "lang_id_score" in columns:
+                        cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the language identification confidence score",
+                            "docs",
+                        )
+
+                    if "perplexity_score" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["perplexity_score"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the perplexity score",
+                            "docs",
+                        )
+
+                Visualization_for_lang.display_dataset(
+                    self.docs, all_conds, "Retained documents", "docs"
+                )
 
         st.header("Download data")
 
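The new logic in filtering_of_docs can be exercised outside Streamlit. Below is a minimal sketch, assuming a toy pandas DataFrame in place of the app's per-language statistics file and hand-picked thresholds in place of the sidebar sliders; only the `labels`, `conds`, `chosen_label`, and `all_conds` names come from the diff above, everything else is illustrative.

```python
# Minimal sketch of the register filtering this commit adds, without Streamlit.
# The toy DataFrame and the threshold values are assumptions for illustration;
# in app.py the conditions come from the sliders built by set_sliders().
import numpy as np
import pandas as pd

docs = pd.DataFrame(
    {
        "text": ["a news story", "a forum thread", "song lyrics"],
        "labels": [["NA"], ["ID", "OP"], ["LY"]],
        "number_words": [350, 120, 80],
        "perplexity_score": [210.0, 540.0, 980.0],
    }
)

# Keep only documents whose register labels include the chosen code,
# mirroring self.docs["labels"].apply(lambda x: chosen_label in x).
chosen_label = "ID: Interactive Discussion".split(":")[0]  # -> "ID"
if chosen_label != "All":
    docs = docs[docs["labels"].apply(lambda x: chosen_label in x)]

# Each filter contributes one or more boolean arrays over the documents;
# np.all over axis 0 ANDs them into a single "retained" mask.
conds = {
    "number_words": [docs["number_words"] >= 100],          # assumed slider value
    "perplexity_score": [docs["perplexity_score"] <= 600],  # assumed slider value
}
all_conds = [subcond for cond in conds.values() for subcond in cond]
all_conds = np.all(all_conds, axis=0)

print(docs[all_conds])             # retained documents
print(docs[np.invert(all_conds)])  # discarded documents
```

Because the masks are combined with np.all, a document is retained only if it passes every active filter; the label filter, by contrast, shrinks self.docs up front, which is why the empty-DataFrame guard was added before the sliders are built.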
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ac12d82e24642fd0b1d4f6c5b8fbe1edb42dc15a38185ccc8ec95ac0fe687bc2
-size 241407829
+oid sha256:cd798b2bc010480cf0777b41bac9dfde2ab1c0ba17e151400b9e1359aa1a114c
+size 276101032
zh_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85f70e561c971b468ba69963841b73e6a6da0a230f19f191234701e926688feb
-size 63554172
+oid sha256:e8b02e485e2736cc5e407a567adcb09d228ce0e2eb6ed7609749e77028446175
+size 74914733