HugoLaurencon committed
Commit 0add2d4
1 Parent(s): a547ccb
Files changed (2)
  1. app.py +250 -109
  2. en_examples_with_stats.json +3 -0
app.py CHANGED
@@ -1,138 +1,279 @@
 import streamlit as st
 import json
 import pandas as pd
-import math
 import numpy as np
 import matplotlib.pyplot as plt


-def visualization(path_data, lang, num_docs, num_docs_for_words):
-
-    with open(path_data) as json_file:
-        data = json.load(json_file)

-    num_docs = min(num_docs, len(data))

-    st.title(f"{num_docs} {lang} documents from Oscar with their stats.")

-    sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
-    words = set([word for sentence in sentences for word in sentence])
-    words_data = [{"len_word": len(word), "word": word} for word in words]
-    words_data = pd.DataFrame(words_data)

-    data = data[:num_docs]
-    data = pd.DataFrame(data)

-    columns = list(data)
-    keys = []
-    values = {}

-    st.header("Filtering based on document content")

-    if "special_%" in columns:
-        special_ratio = st.sidebar.slider(
-            "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
-        special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"No docs with <{special_cutoff:.1f}% special chars")
-        keys.append(("special_%", special_cutoff, True))
-
-    if "stop_%" in columns:
-        stop_ratio = st.sidebar.slider(
-            "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
-        stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"No docs with >{stop_cutoff:.2f}% stop words")
-        keys.append(("stop_%", stop_cutoff, False))

-    @st.cache(suppress_st_warning=True)
-    def recalculate_flagged_words(file):

-        def flagged_word_ratio(text: str, flagged_word_list):
-            return len([word for word in text.split() if word.lower().strip() in flagged_word_list]) / len(text.split())

-        flagged_word_list = [word.decode().strip() for word in file.readlines()]

-        flagged_word_ratios = [flagged_word_ratio(text, flagged_word_list) * 100 for text in data["text"]]
-        data["flagged_%"] = flagged_word_ratios

-    flagged_word_file = st.sidebar.file_uploader("Upload your own list of flagged words (1 word per line)")

-    if "flagged_%" in columns:
-        flagged_ratio = st.sidebar.slider(
-            "% filtered by flagged words ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        flagged_index = max(0, math.floor((100 - flagged_ratio) * len(data.index) / 100) - 1)
-        flagged_cutoff = np.partition(data["flagged_%"], flagged_index)[flagged_index]
-        st.sidebar.text(f"No docs with >{flagged_cutoff:.2f}% flagged words")
-        keys.append(("flagged_%", flagged_cutoff, True))
-
-    if "perplexity" in columns:
-        ppl_ratio = st.sidebar.slider(
-            "% filtered by perplexity", 0.0, 50.0, 0.0, step=0.1
         )
-        ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
-        ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
-        st.sidebar.text(f"No docs with >{ppl_cutoff:.0f} perplexity")
-        keys.append(("perplexity", ppl_cutoff, True))
-
-    cond = [
-        (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
-        for key, cutoff, max_cutoff in keys
-    ]
-    cond = np.all(cond, axis=0)
-
-    data_not_keep = data.loc[np.invert(cond)]
-    st.subheader(f"Filtered data: {np.invert(cond).sum()} docs")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(data_not_keep)
-
-    data_keep = data.loc[cond]
-    st.subheader(f"Kept data: {cond.sum()} docs")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(data_keep)
-
-    # def plot_hist(dataframe, key, num_bins=50):
-    # st.subheader(" ".join(key.split("_")))
-    # hist_values = dataframe[key].values
-    # max_range = np.max(hist_values)
-    # hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
-    # st.bar_chart(hist_values)
-    # st.markdown(f"Each bin is of size: {max_range/num_bins}.")
-
-    # for key, _, _ in keys:
-    # plot_hist(data, key)
-
-    st.header("Filtering links and concatenated words")
-    max_len_word = int(np.max(words_data["len_word"])) + 1
-    cutoff_word = st.sidebar.slider("Word length cutoff", 0, max_len_word, max_len_word)
-    cond_words = words_data["len_word"] <= cutoff_word
-
-    words_keep = words_data.loc[cond_words]
-    st.subheader(f"Words that we keep (for {num_docs_for_words} documents)")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(words_keep)
-
-    words_not_keep = words_data.loc[np.invert(cond_words)]
-    st.subheader(f"Words that are thrown away (for {num_docs_for_words} documents)")
-    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
-    st.dataframe(words_not_keep)
-
-    st.header("Download data")
-
-    with open(path_data) as json_file:
-        btn = st.download_button(
-            label="Download data as json",
-            data=json_file,
-            file_name="data.json",
         )


-path_data = "./en_examples_with_stats_ldnoob.json"
 lang = "English"
-num_docs = 5000
-num_docs_for_words = 500

-visualization(path_data, lang, num_docs, num_docs_for_words)
+# Run with: streamlit run visualization.py
+
 import streamlit as st
+
 import json
 import pandas as pd
+
 import numpy as np
+
 import matplotlib.pyplot as plt


+class Visualization:
+    def __init__(
+        self, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
+    ):
+        self.path_data = path_data
+        self.lang = lang
+        self.num_docs = num_docs
+        self.num_docs_for_words = num_docs_for_words
+        self.max_len_text_display = max_len_text_display
+
+    def open_data(self):
+        with open(self.path_data) as json_file:
+            data = json.load(json_file)
+
+        self.num_docs = min(self.num_docs, len(data))
+        self.num_docs_for_words = min(self.num_docs_for_words, len(data))
+
+        words = [doc["words"] for doc in data[: self.num_docs_for_words]]
+        words = [word for doc in words for word in doc]
+        self.words = pd.DataFrame(words)
+
+        docs = data[: self.num_docs]
+        for doc in docs:
+            del doc["words"]
+            if len(doc["text"]) > self.max_len_text_display:
+                doc["text"] = (
+                    doc["text"][: self.max_len_text_display]
+                    + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
+                )
+        self.docs = pd.DataFrame(docs)
+
+    def set_title(self):
+        st.title(f"{self.num_docs} {self.lang} documents from Oscar with their stats.")
+
+    def filtering_of_docs(self):
+        st.sidebar.subheader("Parameters of the filtering on documents")
+
+        def set_sliders(docs):
+            columns = list(docs)
+            keys = []
+            conds = []

+            def get_cond(key, cutoff, max_cutoff):
+                if max_cutoff:
+                    return self.docs[key] <= cutoff
+                return self.docs[key] >= cutoff

+            def print_discared_by_cond(cond):
+                st.sidebar.caption(
+                    f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter"
+                )
+                st.sidebar.caption("---------")

+            if "number_words" in columns:
+                max_nb_words = int(np.max(docs["number_words"])) + 1
+                cutoff_min_number_words = st.sidebar.slider(
+                    "Min cutoff number words", 0, max_nb_words, 0
+                )
+                new_key = ("number_words", cutoff_min_number_words, False)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+                cutoff_max_number_words = st.sidebar.slider(
+                    "Max cutoff number words", 0, max_nb_words, max_nb_words
+                )
+                new_key = ("number_words", cutoff_max_number_words, True)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+            if "special_characters_ratio" in columns:
+                cutoff_special_characters_ratio = st.sidebar.slider(
+                    "Max cutoff special characters ratio", 0.0, 1.0, 1.0, step=0.01
+                )
+                new_key = (
+                    "special_characters_ratio",
+                    cutoff_special_characters_ratio,
+                    True,
+                )
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+            if "stopwords_ratio" in columns:
+                cutoff_stopwords_ratio = st.sidebar.slider(
+                    "Min cutoff stopwords ratio", 0.0, 1.0, 0.0, step=0.01
+                )
+                new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)

+            if "badwords_ratio" in columns:
+                cutoff_badwords_ratio = st.sidebar.slider(
+                    "Max cutoff badwords ratio", 0.0, 1.0, 1.0, step=0.01
+                )
+                new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)
+
+            if "lang_id_score" in columns:
+                cutoff_lang_id_score = st.sidebar.slider(
+                    "Min cutoff lang id score", 0.0, 1.0, 0.0, step=0.01
+                )
+                new_key = ("lang_id_score", cutoff_lang_id_score, False)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)
+
+            if "perplexity_score" in columns:
+                max_pp = int(np.max(docs["perplexity_score"])) + 1
+                cutoff_perplexity_score = st.sidebar.slider(
+                    "Perplexity cutoff perplexity score", 0, max_pp, max_pp
+                )
+                new_key = ("perplexity_score", cutoff_perplexity_score, True)
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                conds.append(cond)
+                print_discared_by_cond(cond)
+
+            return keys, conds
+
+        self.keys, conds = set_sliders(self.docs)
+
+        conds = np.all(conds, axis=0)
+
+        st.header("Filtering on documents")
+
+        self.discarded_docs = self.docs.loc[np.invert(conds)]
+        st.subheader(
+            f"Discarded documents: {len(self.discarded_docs)} docs ({len(self.discarded_docs) / self.num_docs * 100:.2f}%)"
         )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
         )
+        st.dataframe(self.discarded_docs)

+        self.retained_docs = self.docs.loc[conds]
+        st.subheader(
+            f"Retained documents: {len(self.retained_docs)} docs ({len(self.retained_docs) / self.num_docs * 100:.2f}%)"
+        )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
+        )
+        st.dataframe(self.retained_docs)

+    def filtering_of_words(self):
+        st.sidebar.subheader("Parameter of the filtering on words")

+        max_len_word = int(np.max(self.words["len_word"])) + 1
+        cutoff_word = st.sidebar.slider(
+            "Max cutoff length word", 0, max_len_word, max_len_word
+        )

+        incorrect_substrings = st.sidebar.checkbox(
+            "Remove words with incorrect substrings"
+        )
+
+        cond_words = self.words["len_word"] <= cutoff_word
+        if incorrect_substrings:
+            cond_words = cond_words & np.invert(self.words["incorrect_substring"])

+        st.header("Filtering on words")

+        st.markdown(
+            f"Since the number of words is way larger than the number of documents, "
+            f"we consider in this section words for the first {self.num_docs_for_words} documents only."
+        )
+
+        discarded_words = self.words.loc[np.invert(cond_words)]
+        st.subheader(
+            f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
+        )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
         )
+        st.dataframe(discarded_words)
+
+        retained_words = self.words.loc[cond_words]
+        st.subheader(
+            f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
         )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
         )
+        st.dataframe(retained_words)
+
+    def plot_distributions_filtering_parameters(self):
+        st.header("Distributions of the filtering parameters")
+
+        display_distributions = st.checkbox("Display distributions")
+
+        if display_distributions:
+
+            def plot_hist(dataframe, key, num_bins=50):
+                st.subheader(" ".join(key.split("_")))
+                hist_values = dataframe[key].values
+                max_range = np.max(hist_values)
+                hist_values = np.histogram(
+                    hist_values, bins=num_bins, range=(0, max_range)
+                )[0]
+                st.bar_chart(hist_values)
+                st.markdown(f"Each bin is of size: {max_range/num_bins}.")
+
+            for key in list({el[0]: None for el in self.keys}):
+                plot_hist(self.docs, key)
+
+            plot_hist(self.words, "len_word")
+
+    def plot_zipf_law(self):
+        st.header("Zipf's Law")
+
+        display_zipf_law = st.checkbox("Display Zipf's Law")
+
+        if display_zipf_law:
+
+            freq_words = {}
+            for _, row in self.words.iterrows():
+                freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1
+            freq_words = np.array(list(freq_words.values()))
+            freq_words = -np.sort(-freq_words)
+
+            fig, ax = plt.subplots()
+            ax.loglog(freq_words)
+            ax.set_title("Zipf's Law")
+            ax.set_xlabel("$i$-th most frequent word")
+            ax.set_ylabel("frequency in the documents")
+            st.pyplot(fig)
+
+    def download_data(self):
+        st.header("Download data")
+
+        with open(self.path_data) as json_file:
+            btn = st.download_button(
+                label="Download data as json",
+                data=json_file,
+                file_name="data.json",
+            )
+
+    def visualization(self):
+        self.open_data()
+        self.set_title()
+        self.filtering_of_docs()
+        self.filtering_of_words()
+        self.plot_distributions_filtering_parameters()
+        self.plot_zipf_law()
+        self.download_data()


+path_data = "./en_examples_with_stats.json"
 lang = "English"
+num_docs = 15000
+num_docs_for_words = 1500
+max_len_text_display = 10000

+visualization = Visualization(
+    path_data, lang, num_docs, num_docs_for_words, max_len_text_display
+)
+visualization.visualization()
en_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63326ed83f24f9afef4cd8149e99c1344ed9338e47a9c48b3b6a45705504e1ca
+size 933098320
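
Note: the added en_examples_with_stats.json is tracked with Git LFS, so only the pointer file appears in the diff; the actual dataset is roughly 933 MB of JSON. As a purely hypothetical sketch, inferred only from the field names the new Visualization class reads in app.py (the values below are illustrative, not taken from the dataset), one record would look roughly like this:

# Hypothetical example record: the field names are the ones app.py reads
# (document-level stats used by the sliders, plus per-word entries);
# the concrete values are illustrative only.
example_doc = {
    "text": "Some raw OSCAR document text ...",
    "number_words": 6,
    "special_characters_ratio": 0.05,
    "stopwords_ratio": 0.40,
    "badwords_ratio": 0.0,
    "lang_id_score": 0.98,
    "perplexity_score": 350.0,
    "words": [
        {"word": "some", "len_word": 4, "incorrect_substring": False},
        {"word": "raw", "len_word": 3, "incorrect_substring": False},
    ],
}
# The file itself would then be a JSON list of such records, since the app
# calls json.load() and slices/iterates the result as a list of dicts.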