HugoLaurencon committed on
Commit
5d485e5
1 Parent(s): f217a73

display distributions in sidebar and filtering parameters in expanders

Browse files
Files changed (1) hide show
  1. app.py +155 -150
app.py CHANGED
@@ -113,6 +113,19 @@ class Visualization:
113
  def set_title(self):
114
  st.title(f"{self.num_docs} {self.lang} documents with their stats.")
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  def filtering_of_docs(self):
117
  st.sidebar.subheader("Parameters of the filtering on documents")
118
 
@@ -127,135 +140,148 @@ class Visualization:
127
  return self.docs[key] >= cutoff
128
 
129
  def print_discared_by_cond(cond):
130
- st.sidebar.caption(
131
  f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
132
  )
133
- st.sidebar.caption("---------")
134
 
135
  if "number_words" in columns:
136
- cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
137
- max_nb_words = int(np.max(self.docs["number_words"])) + 1
138
- cutoff_min_number_words = st.sidebar.slider(
139
- cutoff_def, 0, min(max_nb_words, 500), 0
140
- )
141
- new_key = ("number_words", cutoff_min_number_words, False)
142
- keys.append(new_key)
143
- cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
144
- print_discared_by_cond(cond_1)
145
-
146
- cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
147
- cutoff_max_number_words = st.sidebar.slider(
148
- cutoff_def, 0, max_nb_words, max_nb_words
149
- )
150
- new_key = ("number_words", cutoff_max_number_words, True)
151
- keys.append(new_key)
152
- cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
153
- print_discared_by_cond(cond_2)
 
 
154
 
155
- conds["number_words"] = [cond_1, cond_2]
156
 
157
  if "repetitions_ratio" in columns:
158
- val_repetitions_lengths = list(
159
- self.docs["repetitions_ratio"].iloc[0].keys()
160
- )
161
- default_index = (
162
- val_repetitions_lengths.index("10")
163
- if "10" in val_repetitions_lengths
164
- else 0
165
- )
166
- label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
167
- repetitions_length = st.sidebar.selectbox(
168
- label=label_selectbox,
169
- options=val_repetitions_lengths,
170
- index=default_index,
171
- )
172
- st.sidebar.caption(
173
- "Choosing a higher or lower number does not mean that the filtering "
174
- "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
175
- "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
176
- "few or no repetitions, simply because their length gives them more diversity, and we do "
177
- "not want to discard such documents."
178
- )
179
- self.docs = self.docs_checkpoint
180
- for i in range(len(self.docs["repetitions_ratio"])):
181
- self.docs["repetitions_ratio"].iloc[i] = self.docs[
182
- "repetitions_ratio"
183
- ].iloc[i][repetitions_length]
184
-
185
- cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
186
- cutoff_repetitions_ratio = st.sidebar.slider(
187
- cutoff_def, 0.0, 1.0, 1.0, step=0.01
188
- )
189
- new_key = (
190
- "repetitions_ratio",
191
- cutoff_repetitions_ratio,
192
- True,
193
- repetitions_length,
194
- )
195
- keys.append(new_key)
196
- cond = get_cond(new_key[0], new_key[1], new_key[2])
197
- print_discared_by_cond(cond)
198
- conds["repetitions_ratio"] = [cond]
 
 
199
 
200
  if "special_characters_ratio" in columns:
201
- cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
202
- cutoff_special_characters_ratio = st.sidebar.slider(
203
- cutoff_def, 0.0, 1.0, 1.0, step=0.01
204
- )
205
- new_key = (
206
- "special_characters_ratio",
207
- cutoff_special_characters_ratio,
208
- True,
209
- )
210
- keys.append(new_key)
211
- cond = get_cond(new_key[0], new_key[1], new_key[2])
212
- print_discared_by_cond(cond)
213
- conds["special_characters_ratio"] = [cond]
 
 
214
 
215
  if "stopwords_ratio" in columns:
216
- cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
217
- cutoff_stopwords_ratio = st.sidebar.slider(
218
- cutoff_def, 0.0, 1.0, 0.0, step=0.01
219
- )
220
- new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
221
- keys.append(new_key)
222
- cond = get_cond(new_key[0], new_key[1], new_key[2])
223
- print_discared_by_cond(cond)
224
- conds["stopwords_ratio"] = [cond]
 
 
225
 
226
  if "flagged_words_ratio" in columns:
227
- cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
228
- cutoff_flagged_words_ratio = st.sidebar.slider(
229
- cutoff_def, 0.0, 1.0, 1.0, step=0.01
230
- )
231
- new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
232
- keys.append(new_key)
233
- cond = get_cond(new_key[0], new_key[1], new_key[2])
234
- print_discared_by_cond(cond)
235
- conds["flagged_words_ratio"] = [cond]
 
 
236
 
237
  if "lang_id_score" in columns:
238
- cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
239
- cutoff_lang_id_score = st.sidebar.slider(
240
- cutoff_def, 0.0, 1.0, 0.0, step=0.01
241
- )
242
- new_key = ("lang_id_score", cutoff_lang_id_score, False)
243
- keys.append(new_key)
244
- cond = get_cond(new_key[0], new_key[1], new_key[2])
245
- print_discared_by_cond(cond)
246
- conds["lang_id_score"] = [cond]
 
 
247
 
248
  if "perplexity_score" in columns:
249
- cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
250
- max_pp = int(np.max(self.docs["perplexity_score"])) + 1
251
- cutoff_perplexity_score = st.sidebar.slider(
252
- cutoff_def, 0, max_pp, max_pp
253
- )
254
- new_key = ("perplexity_score", cutoff_perplexity_score, True)
255
- keys.append(new_key)
256
- cond = get_cond(new_key[0], new_key[1], new_key[2])
257
- print_discared_by_cond(cond)
258
- conds["perplexity_score"] = [cond]
 
 
259
 
260
  return keys, conds
261
 
@@ -344,21 +370,23 @@ class Visualization:
344
  if not (self.words is None):
345
  st.sidebar.subheader("Parameter of the filtering on words")
346
 
347
- cutoff_def = "If the length of a word is higher than this number, the word is removed."
348
- max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
349
- cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
350
- self.parameters.append(("len_word", cutoff_word, True))
351
- st.sidebar.caption("---------")
352
-
353
- incorrect_substrings = st.sidebar.checkbox(
354
- "Remove words with incorrect substrings."
355
- )
356
- self.parameters.append(("incorrect_substrings", incorrect_substrings))
357
- st.sidebar.caption("---------")
 
 
358
 
359
- cond_words = self.words["len_word"] <= cutoff_word
360
- if incorrect_substrings:
361
- cond_words = cond_words & np.invert(self.words["incorrect_substring"])
362
 
363
  st.header("Filtering on words")
364
 
@@ -386,35 +414,13 @@ class Visualization:
386
  st.dataframe(retained_words)
387
 
388
  def download_parameters(self):
 
389
  btn = st.sidebar.download_button(
390
  label="Download current parameters as json",
391
  data=json.dumps(self.parameters),
392
  file_name=f"parameters_{self.lang_dataset_id}.json",
393
  )
394
 
395
- def plot_distributions_filtering_parameters(self):
396
- st.header("Distributions of the filtering parameters")
397
-
398
- display_distributions = st.checkbox("Display distributions")
399
-
400
- if display_distributions:
401
-
402
- def plot_hist(dataframe, key, num_bins=50):
403
- st.subheader(" ".join(key.split("_")))
404
- hist_values = dataframe[key].values
405
- max_range = np.max(hist_values)
406
- hist_values = np.histogram(
407
- hist_values, bins=num_bins, range=(0, max_range)
408
- )[0]
409
- st.bar_chart(hist_values)
410
- st.markdown(f"Each bin is of size: {max_range/num_bins}.")
411
-
412
- for key in list({el[0]: None for el in self.keys}):
413
- plot_hist(self.docs, key)
414
-
415
- if not (self.words is None):
416
- plot_hist(self.words, "len_word")
417
-
418
  def plot_zipf_law(self):
419
  if not (self.words is None):
420
  st.header("Zipf's Law")
@@ -570,7 +576,6 @@ class Visualization:
570
  self.filtering_of_docs()
571
  self.filtering_of_words()
572
  self.download_parameters()
573
- self.plot_distributions_filtering_parameters()
574
  # self.plot_zipf_law()
575
  self.analyse_personal_doc()
576
  self.download_data()
 
113
  def set_title(self):
114
  st.title(f"{self.num_docs} {self.lang} documents with their stats.")
115
 
116
+ @staticmethod
117
+ def plot_hist(dataframe, key, num_bins=50):
118
+ checkbox = st.checkbox("Diplay distribution", value=True, key=f"display_distribution_{key[0]}")
119
+ if checkbox:
120
+ fig, ax = plt.subplots()
121
+ val = dataframe[key[0]].values
122
+ if np.median(val) != 0:
123
+ val = val[abs(val - np.median(val)) < 9 * np.median(np.absolute(val - np.median(val)))]
124
+ ax.hist(val, bins=num_bins, density=True)
125
+ ax.set_title(" ".join(key[0].split("_")))
126
+ ax.axvline(x=key[1], color='r', linestyle='dashed')
127
+ st.pyplot(fig)
128
+
129
  def filtering_of_docs(self):
130
  st.sidebar.subheader("Parameters of the filtering on documents")
131
 
 
140
  return self.docs[key] >= cutoff
141
 
142
  def print_discared_by_cond(cond):
143
+ st.caption(
144
  f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
145
  )
 
146
 
147
  if "number_words" in columns:
148
+ with st.sidebar.expander("Number of words"):
149
+ cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
150
+ max_nb_words = int(np.max(self.docs["number_words"])) + 1
151
+ cutoff_min_number_words = st.slider(
152
+ cutoff_def, 0, min(max_nb_words, 500), 0
153
+ )
154
+ new_key = ("number_words", cutoff_min_number_words, False)
155
+ keys.append(new_key)
156
+ Visualization.plot_hist(self.docs, new_key)
157
+ cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
158
+ print_discared_by_cond(cond_1)
159
+
160
+ cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
161
+ cutoff_max_number_words = st.slider(
162
+ cutoff_def, 0, max_nb_words, max_nb_words
163
+ )
164
+ new_key = ("number_words", cutoff_max_number_words, True)
165
+ keys.append(new_key)
166
+ cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
167
+ print_discared_by_cond(cond_2)
168
 
169
+ conds["number_words"] = [cond_1, cond_2]
170
 
171
  if "repetitions_ratio" in columns:
172
+ with st.sidebar.expander("Repetitions ratio"):
173
+ val_repetitions_lengths = list(
174
+ self.docs["repetitions_ratio"].iloc[0].keys()
175
+ )
176
+ default_index = (
177
+ val_repetitions_lengths.index("10")
178
+ if "10" in val_repetitions_lengths
179
+ else 0
180
+ )
181
+ label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
182
+ repetitions_length = st.selectbox(
183
+ label=label_selectbox,
184
+ options=val_repetitions_lengths,
185
+ index=default_index,
186
+ )
187
+ st.caption(
188
+ "Choosing a higher or lower number does not mean that the filtering "
189
+ "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
190
+ "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
191
+ "few or no repetitions, simply because their length gives them more diversity, and we do "
192
+ "not want to discard such documents."
193
+ )
194
+ self.docs = self.docs_checkpoint
195
+ for i in range(len(self.docs["repetitions_ratio"])):
196
+ self.docs["repetitions_ratio"].iloc[i] = self.docs[
197
+ "repetitions_ratio"
198
+ ].iloc[i][repetitions_length]
199
+
200
+ cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
201
+ cutoff_repetitions_ratio = st.slider(
202
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
203
+ )
204
+ new_key = (
205
+ "repetitions_ratio",
206
+ cutoff_repetitions_ratio,
207
+ True,
208
+ repetitions_length,
209
+ )
210
+ keys.append(new_key)
211
+ Visualization.plot_hist(self.docs, new_key)
212
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
213
+ print_discared_by_cond(cond)
214
+ conds["repetitions_ratio"] = [cond]
215
 
216
  if "special_characters_ratio" in columns:
217
+ with st.sidebar.expander("Special characters ratio"):
218
+ cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
219
+ cutoff_special_characters_ratio = st.slider(
220
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
221
+ )
222
+ new_key = (
223
+ "special_characters_ratio",
224
+ cutoff_special_characters_ratio,
225
+ True,
226
+ )
227
+ keys.append(new_key)
228
+ Visualization.plot_hist(self.docs, new_key)
229
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
230
+ print_discared_by_cond(cond)
231
+ conds["special_characters_ratio"] = [cond]
232
 
233
  if "stopwords_ratio" in columns:
234
+ with st.sidebar.expander("Stop words ratio"):
235
+ cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
236
+ cutoff_stopwords_ratio = st.slider(
237
+ cutoff_def, 0.0, 1.0, 0.0, step=0.01
238
+ )
239
+ new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
240
+ keys.append(new_key)
241
+ Visualization.plot_hist(self.docs, new_key)
242
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
243
+ print_discared_by_cond(cond)
244
+ conds["stopwords_ratio"] = [cond]
245
 
246
  if "flagged_words_ratio" in columns:
247
+ with st.sidebar.expander("Flagged words ratio"):
248
+ cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
249
+ cutoff_flagged_words_ratio = st.slider(
250
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
251
+ )
252
+ new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
253
+ keys.append(new_key)
254
+ Visualization.plot_hist(self.docs, new_key)
255
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
256
+ print_discared_by_cond(cond)
257
+ conds["flagged_words_ratio"] = [cond]
258
 
259
  if "lang_id_score" in columns:
260
+ with st.sidebar.expander("Language ID confidence score"):
261
+ cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
262
+ cutoff_lang_id_score = st.slider(
263
+ cutoff_def, 0.0, 1.0, 0.0, step=0.01
264
+ )
265
+ new_key = ("lang_id_score", cutoff_lang_id_score, False)
266
+ keys.append(new_key)
267
+ Visualization.plot_hist(self.docs, new_key)
268
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
269
+ print_discared_by_cond(cond)
270
+ conds["lang_id_score"] = [cond]
271
 
272
  if "perplexity_score" in columns:
273
+ with st.sidebar.expander("Perplexity score"):
274
+ cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
275
+ max_pp = int(np.max(self.docs["perplexity_score"])) + 1
276
+ cutoff_perplexity_score = st.slider(
277
+ cutoff_def, 0, max_pp, max_pp
278
+ )
279
+ new_key = ("perplexity_score", cutoff_perplexity_score, True)
280
+ keys.append(new_key)
281
+ Visualization.plot_hist(self.docs, new_key)
282
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
283
+ print_discared_by_cond(cond)
284
+ conds["perplexity_score"] = [cond]
285
 
286
  return keys, conds
287
 
 
370
  if not (self.words is None):
371
  st.sidebar.subheader("Parameter of the filtering on words")
372
 
373
+ with st.sidebar.expander("Length of words"):
374
+ cutoff_def = "If the length of a word is higher than this number, the word is removed."
375
+ max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
376
+ cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
377
+ new_key = ("len_word", cutoff_word, True)
378
+ self.parameters.append(new_key)
379
+ Visualization.plot_hist(self.words, new_key)
380
+
381
+ with st.sidebar.expander("Words with incorrect substrings"):
382
+ incorrect_substrings = st.checkbox(
383
+ "Remove words with incorrect substrings."
384
+ )
385
+ self.parameters.append(("incorrect_substrings", incorrect_substrings))
386
 
387
+ cond_words = self.words["len_word"] <= cutoff_word
388
+ if incorrect_substrings:
389
+ cond_words = cond_words & np.invert(self.words["incorrect_substring"])
390
 
391
  st.header("Filtering on words")
392
 
 
414
  st.dataframe(retained_words)
415
 
416
  def download_parameters(self):
417
+ st.sidebar.subheader("Download parameters")
418
  btn = st.sidebar.download_button(
419
  label="Download current parameters as json",
420
  data=json.dumps(self.parameters),
421
  file_name=f"parameters_{self.lang_dataset_id}.json",
422
  )
423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  def plot_zipf_law(self):
425
  if not (self.words is None):
426
  st.header("Zipf's Law")
 
576
  self.filtering_of_docs()
577
  self.filtering_of_words()
578
  self.download_parameters()
 
579
  # self.plot_zipf_law()
580
  self.analyse_personal_doc()
581
  self.download_data()