HugoLaurencon committed
Commit 0add2d4
1 Parent(s): a547ccb
Files changed (2)
  1. app.py +250 -109
  2. en_examples_with_stats.json +3 -0
app.py CHANGED
@@ -1,138 +1,279 @@
 import streamlit as st
 import json
 import pandas as pd
-import math
 import numpy as np
 import matplotlib.pyplot as plt


-def visualization(path_data, lang, num_docs, num_docs_for_words):
-
-    with open(path_data) as json_file:
-        data = json.load(json_file)

-    num_docs = min(num_docs, len(data))

-    st.title(f"{num_docs} {lang} documents from Oscar with their stats.")

-    sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
-    words = set([word for sentence in sentences for word in sentence])
-    words_data = [{"len_word": len(word), "word": word} for word in words]
-    words_data = pd.DataFrame(words_data)

-    data = data[:num_docs]
-    data = pd.DataFrame(data)

-    columns = list(data)
-    keys = []
-    values = {}

-    st.header("Filtering based on document content")

-    if "special_%" in columns:
-        special_ratio = st.sidebar.slider(
-            "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
-        special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"No docs with <{special_cutoff:.1f}% special chars")
-        keys.append(("special_%", special_cutoff, True))
-
-    if "stop_%" in columns:
-        stop_ratio = st.sidebar.slider(
-            "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
-        stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"No docs with >{stop_cutoff:.2f}% stop words")
-        keys.append(("stop_%", stop_cutoff, False))

-    @st.cache(suppress_st_warning=True)
-    def recalculate_flagged_words(file):

-        def flagged_word_ratio(text: str, flagged_word_list):
-            return len([word for word in text.split() if word.lower().strip() in flagged_word_list]) / len(text.split())

-        flagged_word_list = [word.decode().strip() for word in file.readlines()]

-        flagged_word_ratios = [flagged_word_ratio(text, flagged_word_list) * 100 for text in data["text"]]
-        data["flagged_%"] = flagged_word_ratios

-    flagged_word_file = st.sidebar.file_uploader("Upload your own list of flagged words (1 word per line)")

-    if "flagged_%" in columns:
-        flagged_ratio = st.sidebar.slider(
-            "% filtered by flagged words ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        flagged_index = max(0, math.floor((100 - flagged_ratio) * len(data.index) / 100) - 1)
-        flagged_cutoff = np.partition(data["flagged_%"], flagged_index)[flagged_index]
-        st.sidebar.text(f"No docs with >{flagged_cutoff:.2f}% flagged words")
-        keys.append(("flagged_%", flagged_cutoff, True))
-
-    if "perplexity" in columns:
-        ppl_ratio = st.sidebar.slider(
-            "% filtered by perplexity", 0.0, 50.0, 0.0, step=0.1
         )
-        ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
-        ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
-        st.sidebar.text(f"No docs with >{ppl_cutoff:.0f} perplexity")
-        keys.append(("perplexity", ppl_cutoff, True))
-
-    cond = [
-        (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
-        for key, cutoff, max_cutoff in keys
-    ]
-    cond = np.all(cond, axis=0)
-
-    data_not_keep = data.loc[np.invert(cond)]
-    st.subheader(f"Filtered data: {np.invert(cond).sum()} docs")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(data_not_keep)
-
-    data_keep = data.loc[cond]
-    st.subheader(f"Kept data: {cond.sum()} docs")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(data_keep)
-
-    # def plot_hist(dataframe, key, num_bins=50):
-    # st.subheader(" ".join(key.split("_")))
-    # hist_values = dataframe[key].values
-    # max_range = np.max(hist_values)
-    # hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
-    # st.bar_chart(hist_values)
-    # st.markdown(f"Each bin is of size: {max_range/num_bins}.")
-
-    # for key, _, _ in keys:
-    # plot_hist(data, key)
-
-    st.header("Filtering links and concatenated words")
-    max_len_word = int(np.max(words_data["len_word"])) + 1
-    cutoff_word = st.sidebar.slider("Word length cutoff", 0, max_len_word, max_len_word)
-    cond_words = words_data["len_word"] <= cutoff_word
-
-    words_keep = words_data.loc[cond_words]
-    st.subheader(f"Words that we keep (for {num_docs_for_words} documents)")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(words_keep)
-
-    words_not_keep = words_data.loc[np.invert(cond_words)]
-    st.subheader(f"Words that are thrown away (for {num_docs_for_words} documents)")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(words_not_keep)
-
-    st.header("Download data")
-
-    with open(path_data) as json_file:
-        btn = st.download_button(
-            label="Download data as json",
-            data=json_file,
-            file_name="data.json",
         )


-path_data = "./en_examples_with_stats_ldnoob.json"
 lang = "English"
-num_docs = 5000
-num_docs_for_words = 500

-visualization(path_data, lang, num_docs, num_docs_for_words)
+# Run with: streamlit run visualization.py
+
 import streamlit as st
+
 import json
 import pandas as pd
+
 import numpy as np
+
 import matplotlib.pyplot as plt


+class Visualization:
+    def __init__(
+        self, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
+    ):
+        self.path_data = path_data
+        self.lang = lang
+        self.num_docs = num_docs
+        self.num_docs_for_words = num_docs_for_words
+        self.max_len_text_display = max_len_text_display
+
+    def open_data(self):
+        with open(self.path_data) as json_file:
+            data = json.load(json_file)
+
+        self.num_docs = min(self.num_docs, len(data))
+        self.num_docs_for_words = min(self.num_docs_for_words, len(data))
+
+        words = [doc["words"] for doc in data[: self.num_docs_for_words]]
+        words = [word for doc in words for word in doc]
+        self.words = pd.DataFrame(words)
+
+        docs = data[: self.num_docs]
+        for doc in docs:
+            del doc["words"]
+            if len(doc["text"]) > self.max_len_text_display:
+                doc["text"] = (
+                    doc["text"][: self.max_len_text_display]
+                    + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
+                )
+        self.docs = pd.DataFrame(docs)
+
+    def set_title(self):
+        st.title(f"{self.num_docs} {self.lang} documents from Oscar with their stats.")
+
+    def filtering_of_docs(self):
+        st.sidebar.subheader("Parameters of the filtering on documents")
+
+        def set_sliders(docs):
+            columns = list(docs)
+            keys = []
+            conds = []

+            def get_cond(key, cutoff, max_cutoff):
+                if max_cutoff:
+                    return self.docs[key] <= cutoff
+                return self.docs[key] >= cutoff

+            def print_discared_by_cond(cond):
+                st.sidebar.caption(
+                    f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter"
+                )
+                st.sidebar.caption("---------")

+            if "number_words" in columns:
+                max_nb_words = int(np.max(docs["number_words"])) + 1
+                cutoff_min_number_words = st.sidebar.slider(
+                    "Min cutoff number words", 0, max_nb_words, 0
+                )
+                new_key = ("number_words", cutoff_min_number_words, False)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+                cutoff_max_number_words = st.sidebar.slider(
+                    "Max cutoff number words", 0, max_nb_words, max_nb_words
+                )
+                new_key = ("number_words", cutoff_max_number_words, True)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+            if "special_characters_ratio" in columns:
+                cutoff_special_characters_ratio = st.sidebar.slider(
+                    "Max cutoff special characters ratio", 0.0, 1.0, 1.0, step=0.01
+                )
+                new_key = (
+                    "special_characters_ratio",
+                    cutoff_special_characters_ratio,
+                    True,
+                )
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+            if "stopwords_ratio" in columns:
+                cutoff_stopwords_ratio = st.sidebar.slider(
+                    "Min cutoff stopwords ratio", 0.0, 1.0, 0.0, step=0.01
+                )
+                new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+            if "badwords_ratio" in columns:
+                cutoff_badwords_ratio = st.sidebar.slider(
+                    "Max cutoff badwords ratio", 0.0, 1.0, 1.0, step=0.01
+                )
+                new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)
+
+            if "lang_id_score" in columns:
+                cutoff_lang_id_score = st.sidebar.slider(
+                    "Min cutoff lang id score", 0.0, 1.0, 0.0, step=0.01
+                )
+                new_key = ("lang_id_score", cutoff_lang_id_score, False)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)
+
+            if "perplexity_score" in columns:
+                max_pp = int(np.max(docs["perplexity_score"])) + 1
+                cutoff_perplexity_score = st.sidebar.slider(
+                    "Perplexity cutoff perplexity score", 0, max_pp, max_pp
+                )
+                new_key = ("perplexity_score", cutoff_perplexity_score, True)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)
+
+            return keys, conds
+
+        self.keys, conds = set_sliders(self.docs)
+
+        conds = np.all(conds, axis=0)
+
+        st.header("Filtering on documents")
+
+        self.discarded_docs = self.docs.loc[np.invert(conds)]
+        st.subheader(
+            f"Discarded documents: {len(self.discarded_docs)} docs ({len(self.discarded_docs) / self.num_docs * 100:.2f}%)"
         )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
         )
+        st.dataframe(self.discarded_docs)

+        self.retained_docs = self.docs.loc[conds]
+        st.subheader(
+            f"Retained documents: {len(self.retained_docs)} docs ({len(self.retained_docs) / self.num_docs * 100:.2f}%)"
+        )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
+        )
+        st.dataframe(self.retained_docs)

+    def filtering_of_words(self):
+        st.sidebar.subheader("Parameter of the filtering on words")

+        max_len_word = int(np.max(self.words["len_word"])) + 1
+        cutoff_word = st.sidebar.slider(
+            "Max cutoff length word", 0, max_len_word, max_len_word
+        )

+        incorrect_substrings = st.sidebar.checkbox(
+            "Remove words with incorrect substrings"
+        )
+
+        cond_words = self.words["len_word"] <= cutoff_word
+        if incorrect_substrings:
+            cond_words = cond_words & np.invert(self.words["incorrect_substring"])

+        st.header("Filtering on words")

+        st.markdown(
+            f"Since the number of words is way larger than the number of documents, "
+            f"we consider in this section words for the first {self.num_docs_for_words} documents only."
+        )
+
+        discarded_words = self.words.loc[np.invert(cond_words)]
+        st.subheader(
+            f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
+        )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
         )
+        st.dataframe(discarded_words)
+
+        retained_words = self.words.loc[cond_words]
+        st.subheader(
+            f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
         )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
         )
+        st.dataframe(retained_words)
+
+    def plot_distributions_filtering_parameters(self):
+        st.header("Distributions of the filtering parameters")
+
+        display_distributions = st.checkbox("Display distributions")
+
+        if display_distributions:
+
+            def plot_hist(dataframe, key, num_bins=50):
+                st.subheader(" ".join(key.split("_")))
+                hist_values = dataframe[key].values
+                max_range = np.max(hist_values)
+                hist_values = np.histogram(
+                    hist_values, bins=num_bins, range=(0, max_range)
+                )[0]
+                st.bar_chart(hist_values)
+                st.markdown(f"Each bin is of size: {max_range/num_bins}.")
+
+            for key in list({el[0]: None for el in self.keys}):
+                plot_hist(self.docs, key)
+
+            plot_hist(self.words, "len_word")
+
+    def plot_zipf_law(self):
+        st.header("Zipf's Law")
+
+        display_zipf_law = st.checkbox("Display Zipf's Law")
+
+        if display_zipf_law:
+
+            freq_words = {}
+            for _, row in self.words.iterrows():
+                freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1
+            freq_words = np.array(list(freq_words.values()))
+            freq_words = -np.sort(-freq_words)
+
+            fig, ax = plt.subplots()
+            ax.loglog(freq_words)
+            ax.set_title("Zipf's Law")
+            ax.set_xlabel("$i$-th most frequent word")
+            ax.set_ylabel("frequency in the documents")
+            st.pyplot(fig)
+
+    def download_data(self):
+        st.header("Download data")
+
+        with open(self.path_data) as json_file:
+            btn = st.download_button(
+                label="Download data as json",
+                data=json_file,
+                file_name="data.json",
+            )
+
+    def visualization(self):
+        self.open_data()
+        self.set_title()
+        self.filtering_of_docs()
+        self.filtering_of_words()
+        self.plot_distributions_filtering_parameters()
+        self.plot_zipf_law()
+        self.download_data()


+path_data = "./en_examples_with_stats.json"
 lang = "English"
+num_docs = 15000
+num_docs_for_words = 1500
+max_len_text_display = 10000

+visualization = Visualization(
+    path_data, lang, num_docs, num_docs_for_words, max_len_text_display
+)
+visualization.visualization()
en_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63326ed83f24f9afef4cd8149e99c1344ed9338e47a9c48b3b6a45705504e1ca
+size 933098320
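
Note: the added en_examples_with_stats.json is tracked with Git LFS, so only the pointer file appears in the diff; the actual dataset is roughly 933 MB of JSON. As a purely hypothetical sketch, inferred only from the field names the new Visualization class reads in app.py (the values below are illustrative, not taken from the dataset), one record would look roughly like this:

# Hypothetical example record: the field names are the ones app.py reads
# (document-level stats used by the sliders, plus per-word entries);
# the concrete values are illustrative only.
example_doc = {
    "text": "Some raw OSCAR document text ...",
    "number_words": 6,
    "special_characters_ratio": 0.05,
    "stopwords_ratio": 0.40,
    "badwords_ratio": 0.0,
    "lang_id_score": 0.98,
    "perplexity_score": 350.0,
    "words": [
        {"word": "some", "len_word": 4, "incorrect_substring": False},
        {"word": "raw", "len_word": 3, "incorrect_substring": False},
    ],
}
# The file itself would then be a JSON list of such records, since the app
# calls json.load() and slices/iterates the result as a list of dicts.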