HugoLaurencon committed
Commit bfbcd60
1 Parent(s): 649ea6a

button to download parameters

Files changed (2)
  1. app.py +114 -88
  2. explanation_filtering_pipeline.pdf +0 -0
app.py CHANGED
@@ -162,9 +162,7 @@ class Visualization:
             if "10" in val_repetitions_lengths
             else 0
         )
-        label_selectbox = (
-            "Length of the repetitions (that will determine the repetitions ratio)."
-        )
+        label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
         repetitions_length = st.sidebar.selectbox(
             label=label_selectbox,
             options=val_repetitions_lengths,
@@ -261,6 +259,7 @@ class Visualization:
             return keys, conds
 
         self.keys, conds = set_sliders()
+        self.parameters = self.keys * 1
 
         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
        all_conds = np.all(all_conds, axis=0)
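
Note on the added line: multiplying a list by 1 returns a new list object with the same elements, i.e. a shallow copy, so the word-level cutoffs appended later extend self.parameters without mutating self.keys. A minimal sketch of the idiom, with made-up cutoff values:

    # `keys * 1` builds a new list (shallow copy); appending to the copy
    # leaves the original untouched.
    keys = [("number_words", 100, False), ("perplexity_score", 1000, True)]
    parameters = keys * 1
    parameters.append(("len_word", 25, True))
    assert len(keys) == 2
    assert len(parameters) == 3
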
@@ -347,10 +346,14 @@ class Visualization:
         cutoff_def = "If the length of a word is higher than this number, the word is removed."
         max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
         cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
+        self.parameters.append(("len_word", cutoff_word, True))
+        st.sidebar.caption("---------")
 
         incorrect_substrings = st.sidebar.checkbox(
             "Remove words with incorrect substrings."
         )
+        self.parameters.append(("incorrect_substrings", incorrect_substrings))
+        st.sidebar.caption("---------")
 
         cond_words = self.words["len_word"] <= cutoff_word
         if incorrect_substrings:
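
Each appended entry follows the same convention as the slider keys collected by set_sliders: a parameter name, the chosen value, and, for numeric cutoffs, a flag marking it as a max cutoff (True here, since words above the length cutoff are removed). Serialized with json.dumps, tuples become JSON arrays; with illustrative values (the numbers below are made up), the exported file would look roughly like this:

    import json

    # Hypothetical entries mirroring the tuples appended in app.py.
    parameters = [
        ("len_word", 25, True),            # max cutoff on word length
        ("incorrect_substrings", True),    # checkbox, no cutoff direction
    ]
    print(json.dumps(parameters))
    # [["len_word", 25, true], ["incorrect_substrings", true]]
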
@@ -381,6 +384,13 @@ class Visualization:
         )
         st.dataframe(retained_words)
 
+    def download_parameters(self):
+        btn = st.sidebar.download_button(
+            label="Download current parameters as json",
+            data=json.dumps(self.parameters),
+            file_name=f"parameters_{self.lang_dataset_id}.json",
+        )
+
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")
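
The new download_parameters method wires the collected tuples to a sidebar download button. A self-contained sketch of the same pattern, assuming only streamlit and the standard-library json; the parameter values and the "en" in the file name are placeholders:

    import json

    import streamlit as st

    # Stand-in for self.parameters, which app.py assembles from the
    # document-level slider keys plus the word-level cutoffs.
    parameters = [("number_words", 100, False), ("len_word", 25, True)]

    st.sidebar.download_button(
        label="Download current parameters as json",
        data=json.dumps(parameters),     # serialized once, served as a file
        file_name="parameters_en.json",  # app.py uses f"parameters_{self.lang_dataset_id}.json"
    )

Run with `streamlit run sketch.py`; clicking the button serves the serialized parameters as a JSON file.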
 
@@ -437,94 +447,109 @@ class Visualization:
         is_discarded = False
 
         def is_doc_discarded(key, score):
-            if key[2]: # max cutoff
+            if key[2]:  # max cutoff
                 return score > key[1]
             else:
                 return score < key[1]
 
-        st.markdown("Statistics of the document:")
-
-        for key in self.keys:
-            if key[0] == "number_words":
-                words = ModifyingDocuments.get_words_from_document(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    lower_case=False,
-                    strip_characters=self.param["strip_characters"],
-                )
-                if key[2]:
-                    st.markdown(f"Number of words: {len(words)}")
-                if is_doc_discarded(key, len(words)):
-                    is_discarded = True
-
-            elif key[0] == "repetitions_ratio":
-                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
-                repetitions_ratio = round(repetitions_ratio, 3)
-                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
-                if is_doc_discarded(key, repetitions_ratio):
-                    is_discarded = True
-
-            elif key[0] == "special_characters_ratio":
-                special_characters_ratio = Filtering.compute_special_characters_ratio(
-                    personal_doc, self.param["special_characters"]
-                )
-                special_characters_ratio = round(special_characters_ratio, 3)
-                st.markdown(f"Special characters ratio: {special_characters_ratio}")
-                if is_doc_discarded(key, special_characters_ratio):
-                    is_discarded = True
-
-            elif key[0] == "stopwords_ratio":
-                stopwords_ratio = Filtering.compute_stopwords_ratio(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    self.param["strip_characters"],
-                    self.param["cond_words_augmentation"],
-                    self.param["words_augmentation_group_sizes"],
-                    self.param["words_augmentation_join_char"],
-                    self.stopwords,
-                )
-                stopwords_ratio = round(stopwords_ratio, 3)
-                st.markdown(f"Stop words ratio: {stopwords_ratio}")
-                if is_doc_discarded(key, stopwords_ratio):
-                    is_discarded = True
-
-            elif key[0] == "badwords_ratio":
-                badwords_ratio = Filtering.compute_badwords_ratio(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    self.param["strip_characters"],
-                    self.param["cond_words_augmentation"],
-                    self.param["words_augmentation_group_sizes"],
-                    self.param["words_augmentation_join_char"],
-                    self.badwords,
-                )
-                badwords_ratio = round(badwords_ratio, 3)
-                st.markdown(f"Flagged words ratio: {badwords_ratio}")
-                if is_doc_discarded(key, badwords_ratio):
-                    is_discarded = True
-
-            elif key[0] == "lang_id_score":
-                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
-                    personal_doc, self.model_lang_id
-                )
-                lang_id_score = round(lang_id_score, 3)
-                st.markdown(f"Language identification confidence score: {lang_id_score}")
-                if is_doc_discarded(key, badwords_ratio) or (self.lang_dataset_id != lang_pred_dataset_id):
-                    is_discarded = True
-
-            elif key[0] == "perplexity_score":
-                perplexity_score = Filtering.compute_perplexity_score(
-                    personal_doc,
-                    self.sentencepiece_model,
-                    self.kenlm_model,
-                )
-                perplexity_score = round(perplexity_score, 3)
-                st.markdown(f"Perplexity score: {perplexity_score}")
-                if is_doc_discarded(key, perplexity_score):
-                    is_discarded = True
-
-        is_discarded = "" if is_discarded else "not "
-        st.markdown(f"With the current filtering parameters, this document **is {is_discarded}discarded**.")
+        if personal_doc:
+
+            st.markdown("Statistics of the document:")
+
+            for key in self.keys:
+                if key[0] == "number_words":
+                    words = ModifyingDocuments.get_words_from_document(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        lower_case=False,
+                        strip_characters=self.param["strip_characters"],
+                    )
+                    if key[2]:
+                        st.markdown(f"Number of words: {len(words)}")
+                    if is_doc_discarded(key, len(words)):
+                        is_discarded = True
+
+                elif key[0] == "repetitions_ratio":
+                    repetitions_ratio = Filtering.compute_repetitions_ratio(
+                        personal_doc, int(key[3])
+                    )
+                    repetitions_ratio = round(repetitions_ratio, 3)
+                    st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                    if is_doc_discarded(key, repetitions_ratio):
+                        is_discarded = True
+
+                elif key[0] == "special_characters_ratio":
+                    special_characters_ratio = (
+                        Filtering.compute_special_characters_ratio(
+                            personal_doc, self.param["special_characters"]
+                        )
+                    )
+                    special_characters_ratio = round(special_characters_ratio, 3)
+                    st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                    if is_doc_discarded(key, special_characters_ratio):
+                        is_discarded = True
+
+                elif key[0] == "stopwords_ratio":
+                    stopwords_ratio = Filtering.compute_stopwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.stopwords,
+                    )
+                    stopwords_ratio = round(stopwords_ratio, 3)
+                    st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                    if is_doc_discarded(key, stopwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "badwords_ratio":
+                    badwords_ratio = Filtering.compute_badwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.badwords,
+                    )
+                    badwords_ratio = round(badwords_ratio, 3)
+                    st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                    if is_doc_discarded(key, badwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "lang_id_score":
+                    (
+                        lang_pred_dataset_id,
+                        lang_id_score,
+                    ) = Filtering.compute_lang_id_pred_score(
+                        personal_doc, self.model_lang_id
+                    )
+                    lang_id_score = round(lang_id_score, 3)
+                    st.markdown(
+                        f"Language identification confidence score: {lang_id_score}"
+                    )
+                    if is_doc_discarded(key, badwords_ratio) or (
+                        self.lang_dataset_id != lang_pred_dataset_id
+                    ):
+                        is_discarded = True
+
+                elif key[0] == "perplexity_score":
+                    perplexity_score = Filtering.compute_perplexity_score(
+                        personal_doc,
+                        self.sentencepiece_model,
+                        self.kenlm_model,
+                    )
+                    perplexity_score = round(perplexity_score, 3)
+                    st.markdown(f"Perplexity score: {perplexity_score}")
+                    if is_doc_discarded(key, perplexity_score):
+                        is_discarded = True
+
+            is_discarded = "" if is_discarded else "not "
+            st.markdown(
+                f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+            )
 
     def download_data(self):
         st.header("Download data")
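
The key[2] flag determines the comparison direction in is_doc_discarded: with a max cutoff the document is discarded when its score exceeds key[1], with a min cutoff when it falls below. (Note that in both the old and new versions, the lang_id_score branch passes badwords_ratio to is_doc_discarded, so the cutoff check there appears to reuse the flagged-words ratio rather than the language-identification score.) A standalone sketch with hypothetical cutoff values:

    def is_doc_discarded(key, score):
        # key = (name, cutoff, is_max_cutoff, ...)
        if key[2]:  # max cutoff: discard when the score exceeds it
            return score > key[1]
        else:  # min cutoff: discard when the score falls below it
            return score < key[1]

    # Perplexity uses a max cutoff, stop-words ratio a min cutoff (values made up).
    assert is_doc_discarded(("perplexity_score", 1000, True), 1500)
    assert not is_doc_discarded(("perplexity_score", 1000, True), 800)
    assert is_doc_discarded(("stopwords_ratio", 0.3, False), 0.1)
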
@@ -543,8 +568,9 @@ class Visualization:
         self.set_title()
         self.filtering_of_docs()
         self.filtering_of_words()
+        self.download_parameters()
         self.plot_distributions_filtering_parameters()
-        #self.plot_zipf_law()
+        # self.plot_zipf_law()
         self.analyse_personal_doc()
         self.download_data()
 
 
explanation_filtering_pipeline.pdf CHANGED
Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ