Commit da13b29 by HugoLaurencon (HF staff)
Parent: 5d56c36

distributions for the filters on words and discarded words by filter

Files changed (2):
  1. app.py (+139, -66)
  2. en_examples_with_stats.json (+2, -2)
app.py CHANGED
@@ -112,6 +112,12 @@ class Visualization:
     def set_title(self):
         st.title(f"Filtering visualization")

+    @staticmethod
+    def print_discarded_by_cond(cond):
+        st.caption(
+            f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
+        )
+
     @staticmethod
     def plot_hist(dataframe, key, num_bins=50):
         checkbox = st.checkbox(
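
Note: the new print_discarded_by_cond helper turns a Boolean keep-mask into the share of examples a filter would drop. A minimal standalone sketch of the same calculation (plain numpy, no Streamlit; the toy mask is illustrative):

    import numpy as np

    # keep-mask produced by one filter: True = kept, False = discarded
    cond = np.array([True, True, False, True])

    # same formula as in print_discarded_by_cond
    pct = (len(cond) - np.sum(1 * cond)) / len(cond) * 100
    print(f"{pct:.2f}% of the total is discarded with this filter.")  # 25.00%
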
@@ -130,6 +136,17 @@ class Visualization:
             ax.axvline(x=key[1], color="r", linestyle="dashed")
             st.pyplot(fig)

+    @staticmethod
+    def display_dataset(dataframe, cond, description, type_of_examples):
+        displayed_examples = dataframe.loc[cond]
+        st.subheader(
+            f"{description}: {len(displayed_examples)} {type_of_examples} ({len(displayed_examples) / len(dataframe.index) * 100:.2f}%)"
+        )
+        st.markdown(
+            "Click on a column to sort by it, place the cursor on the text to display it."
+        )
+        st.dataframe(displayed_examples)
+
     def filtering_of_docs(self):
         st.sidebar.subheader("Parameters of the filtering on documents")

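
Note: display_dataset generalizes the documents-only closure deleted in a later hunk; the dataframe is now an argument, so the same code renders discarded and retained documents as well as words. A sketch of the pattern outside the class (toy dataframe, assumed names; run with "streamlit run"):

    import pandas as pd
    import streamlit as st

    docs = pd.DataFrame({"text": ["a b", "c", "d e f"], "number_words": [2, 1, 3]})
    cond = docs["number_words"] >= 2  # Boolean keep-mask

    # same steps as display_dataset: subset, headline with a percentage, table
    displayed = docs.loc[cond]
    st.subheader(
        f"Retained documents: {len(displayed)} docs ({len(displayed) / len(docs.index) * 100:.2f}%)"
    )
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(displayed)
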
@@ -143,11 +160,6 @@ class Visualization:
                 return self.docs[key] <= cutoff
             return self.docs[key] >= cutoff

-        def print_discared_by_cond(cond):
-            st.caption(
-                f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
-            )
-
         if "number_words" in columns:
             with st.sidebar.expander("Number of words"):
                 cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
@@ -159,7 +171,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond_1)
+                Visualization.print_discarded_by_cond(cond_1)

                 cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
                 cutoff_max_number_words = st.slider(
@@ -168,7 +180,7 @@ class Visualization:
                 new_key = ("number_words", cutoff_max_number_words, True)
                 keys.append(new_key)
                 cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond_2)
+                Visualization.print_discarded_by_cond(cond_2)

                 conds["number_words"] = [cond_1, cond_2]

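
Note: number_words is the only filter with two cutoffs; both keep-masks are stored under one key and AND-ed downstream with np.all(..., axis=0). A small numpy sketch of that composition (toy values):

    import numpy as np

    number_words = np.array([5, 120, 9000, 300])
    cond_1 = number_words >= 10    # minimum cutoff: drop too-short documents
    cond_2 = number_words <= 1000  # maximum cutoff: drop too-long documents
    keep = np.all([cond_1, cond_2], axis=0)
    print(keep)  # [False  True False  True]
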
@@ -216,7 +228,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["repetitions_ratio"] = [cond]

         if "special_characters_ratio" in columns:
@@ -233,7 +245,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["special_characters_ratio"] = [cond]

         if "stopwords_ratio" in columns:
@@ -269,7 +281,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["stopwords_ratio"] = [cond]

         if "flagged_words_ratio" in columns:
@@ -298,14 +310,15 @@ class Visualization:
                     new_flagged_words,
                 )
                 cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
+                max_fwr = np.max(self.docs["flagged_words_ratio"])
                 cutoff_flagged_words_ratio = st.slider(
-                    cutoff_def, 0.0, 1.0, 1.0, step=0.01
+                    cutoff_def, 0.0, max_fwr, max_fwr, step=0.001
                 )
                 new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["flagged_words_ratio"] = [cond]

         if "lang_id_score" in columns:
@@ -318,7 +331,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["lang_id_score"] = [cond]

         if "perplexity_score" in columns:
@@ -330,7 +343,7 @@ class Visualization:
                 keys.append(new_key)
                 Visualization.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                print_discared_by_cond(cond)
+                Visualization.print_discarded_by_cond(cond)
                 conds["perplexity_score"] = [cond]

         return keys, conds
@@ -348,17 +361,9 @@ class Visualization:
                 f"Filtering on documents, for {self.num_docs} {self.lang} documents"
             )

-            def display_dataset(cond, description):
-                displayed_docs = self.docs.loc[cond]
-                st.subheader(
-                    f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
-                )
-                st.markdown(
-                    "Click on a column to sort by it, place the cursor on the text to display it."
-                )
-                st.dataframe(displayed_docs)
-
-            display_dataset(np.invert(all_conds), "Discarded documents")
+            Visualization.display_dataset(
+                self.docs, np.invert(all_conds), "Discarded documents", "docs"
+            )

             # st.subheader("Display discarded documents by filter")
             display_discarded_documents_by_filter = st.checkbox(
@@ -370,58 +375,74 @@ class Visualization:

                 if "number_words" in columns:
                     cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the number of words",
+                        "docs",
                     )

                 if "repetitions_ratio" in columns:
                     cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the repetitions ratio",
+                        "docs",
                     )

                 if "special_characters_ratio" in columns:
                     cond_filter = np.invert(
                         np.all(conds["special_characters_ratio"], axis=0)
                     )
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the special characters ratio",
+                        "docs",
                     )

                 if "stopwords_ratio" in columns:
                     cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the stop words ratio",
+                        "docs",
                     )

                 if "flagged_words_ratio" in columns:
                     cond_filter = np.invert(
                         np.all(conds["flagged_words_ratio"], axis=0)
                     )
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the flagged words ratio",
+                        "docs",
                     )

                 if "lang_id_score" in columns:
                     cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the language identification confidence score",
+                        "docs",
                     )

                 if "perplexity_score" in columns:
                     cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                    display_dataset(
+                    Visualization.display_dataset(
+                        self.docs,
                         cond_filter,
                         "Discarded documents for the filter on the perplexity score",
+                        "docs",
                     )

-            display_dataset(all_conds, "Retained documents")
+            Visualization.display_dataset(
+                self.docs, all_conds, "Retained documents", "docs"
+            )

             st.header("Download data")

@@ -434,57 +455,109 @@ class Visualization:

     def filtering_of_words(self):
         if not (self.words is None):
+            columns = list(self.words)
+
             st.sidebar.subheader("Parameter of the filtering on words")

-            with st.sidebar.expander("Length of words"):
-                cutoff_def = "If the length of a word is higher than this number, the word is removed."
-                max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
-                cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
-                new_key = ("len_word", cutoff_word, True)
-                self.parameters.append(new_key)
-                Visualization.plot_hist(self.words, new_key)
-
-            with st.sidebar.expander("Words with incorrect substrings"):
-                incorrect_substrings = st.checkbox(
-                    "Remove words with incorrect substrings."
-                )
-                self.parameters.append(("incorrect_substrings", incorrect_substrings))
-
-            cond_words = self.words["len_word"] <= cutoff_word
-            if incorrect_substrings:
-                cond_words = cond_words & np.invert(
-                    self.words["incorrect_substring"]
-                )
+            conds_words = {}
+
+            if "len_word" in columns:
+                with st.sidebar.expander("Length of words"):
+                    cutoff_def = "If the length of a word is higher than this number, the word is removed."
+                    max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
+                    cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
+                    new_key = ("len_word", cutoff_word, True)
+                    self.parameters.append(new_key)
+                    Visualization.plot_hist(self.words, new_key)
+                    cond_len_words = self.words["len_word"] <= cutoff_word
+                    Visualization.print_discarded_by_cond(cond_len_words)
+                    conds_words["len_word"] = cond_len_words
+
+            if "incorrect_substrings" in columns:
+                with st.sidebar.expander("Words with incorrect substrings"):
+                    incorrect_substrings = st.checkbox(
+                        "Remove words with incorrect substrings."
+                    )
+                    self.parameters.append(
+                        ("incorrect_substrings", incorrect_substrings)
+                    )
+
+                    checkbox = st.checkbox(
+                        "Display distribution",
+                        value=True,
+                        key="display_distribution_incorrect_substrings",
+                    )
+                    if checkbox:
+                        incor_sub = np.array(self.words["incorrect_substrings"]) * 1
+                        with_incor_sub = np.sum(incor_sub)
+                        without_incor_sub = len(incor_sub) - with_incor_sub
+                        st.markdown(
+                            f"Number of words with incorrect substrings: {with_incor_sub}"
+                        )
+                        st.markdown(
+                            f"Number of words without incorrect substrings: {without_incor_sub}"
+                        )
+
+                    if incorrect_substrings:
+                        cond_incorrect_substrings = np.invert(
+                            self.words["incorrect_substrings"]
+                        )
+                    else:
+                        cond_incorrect_substrings = np.array(
+                            [
+                                True
+                                for i in range(len(self.words["incorrect_substrings"]))
+                            ]
+                        )
+                    Visualization.print_discarded_by_cond(cond_incorrect_substrings)
+                    conds_words["incorrect_substrings"] = cond_incorrect_substrings
+
+            all_conds_words = np.all(list(conds_words.values()), axis=0)

             with st.expander(
-                f"Filtering on words, for {self.num_docs} {self.lang} documents"
+                f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
             ):
                 st.header(
-                    f"Filtering on words, for {self.num_docs} {self.lang} documents"
+                    f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
                 )

                 st.markdown(
                     f"Since the number of words is way larger than the number of documents, "
-                    f"we consider in this section words for the first {self.num_docs_for_words} documents only."
+                    f"we consider in this section words for only {self.num_docs_for_words} documents."
                 )

-                discarded_words = self.words.loc[np.invert(cond_words)]
-                st.subheader(
-                    f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
-                )
-                st.markdown(
-                    "Click on a column to sort by it, place the cursor on the text to display it."
-                )
-                st.dataframe(discarded_words)
-
-                retained_words = self.words.loc[cond_words]
-                st.subheader(
-                    f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
-                )
-                st.markdown(
-                    "Click on a column to sort by it, place the cursor on the text to display it."
-                )
-                st.dataframe(retained_words)
+                Visualization.display_dataset(
+                    self.words, np.invert(all_conds_words), "Discarded words", "words"
+                )
+
+                # st.subheader("Display discarded words by filter")
+                display_discarded_words_by_filter = st.checkbox(
+                    "Display discarded words by filter"
+                )
+
+                if display_discarded_words_by_filter:
+
+                    if "len_word" in columns:
+                        cond_filter = np.invert(conds_words["len_word"])
+                        Visualization.display_dataset(
+                            self.words,
+                            cond_filter,
+                            "Discarded words for the filter on length",
+                            "words",
+                        )
+
+                    if "incorrect_substrings" in columns:
+                        cond_filter = np.invert(conds_words["incorrect_substrings"])
+                        Visualization.display_dataset(
+                            self.words,
+                            cond_filter,
+                            "Discarded words for the filter on incorrect substrings",
+                            "words",
+                        )
+
+                Visualization.display_dataset(
+                    self.words, all_conds_words, "Retained words", "words"
+                )

     def download_parameters(self):
         st.sidebar.subheader("Download parameters")

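
Note: the rewritten filtering_of_words mirrors the document pipeline: one keep-mask per word filter, collected in conds_words and combined with np.all, with an all-True fallback when the incorrect-substrings checkbox is off. A condensed sketch (toy arrays; np.ones(len(...), dtype=bool) would be an equivalent, slightly more idiomatic fallback than the list comprehension):

    import numpy as np

    len_word = np.array([3, 45, 7])
    incorrect_substrings_col = np.array([False, False, True])
    remove_incorrect = True  # state of the sidebar checkbox

    conds_words = {"len_word": len_word <= 30}
    if remove_incorrect:
        conds_words["incorrect_substrings"] = np.invert(incorrect_substrings_col)
    else:
        conds_words["incorrect_substrings"] = np.array(
            [True for _ in range(len(incorrect_substrings_col))]
        )

    all_conds_words = np.all(list(conds_words.values()), axis=0)
    print(all_conds_words)  # [ True False False]
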
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ffbb8afeba42822e4b10341112999321e0e14a19a5eeebc342dc68a9f65d3c7f
-size 237426014
+oid sha256:29417f05cc029ab24ba89cfc4358dac755411b01f1925c735c2205b68f975fcc
+size 240781004