HugoLaurencon committed
Commit 6f25c5c
1 Parent(s): d463071

new tool to analyse our own doc

Files changed (3)
  1. .gitignore +2 -0
  2. app.py +132 -4
  3. parameters_filtering.py +2 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
+*cpython-39.pyc
+.DS_Store
app.py CHANGED
@@ -13,7 +13,7 @@ import numpy as np
 
 import matplotlib.pyplot as plt
 
-from filtering import Filtering
+from filtering import LoadParameters, ModifyingDocuments, Filtering
 
 
 class Visualization:
@@ -25,6 +25,10 @@ class Visualization:
         num_docs,
         num_docs_for_words,
         max_len_text_display,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
@@ -33,6 +37,23 @@ class Visualization:
         self.num_docs_for_words = num_docs_for_words
         self.max_len_text_display = max_len_text_display
 
+        self.lang_dataset_id = lang_dataset_id
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.model_lang_id = LoadParameters.load_model_lang_id(
+            lang_dataset_id, path_fasttext_model
+        )
+        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
+            lang_dataset_id, path_sentencepiece_model
+        )
+        self.sentencepiece_model_tok = (
+            self.sentencepiece_model if self.param["tokenization"] else None
+        )
+        self.kenlm_model = LoadParameters.load_kenlm_model(
+            lang_dataset_id, path_kenlm_model
+        )
+
     def preamble(self):
         st.markdown(
             "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
@@ -159,6 +180,7 @@ class Visualization:
                 "repetitions_ratio",
                 cutoff_repetitions_ratio,
                 True,
+                repetitions_length,
             )
             keys.append(new_key)
             cond = get_cond(new_key[0], new_key[1], new_key[2])
@@ -392,8 +414,104 @@ class Visualization:
         ax.set_ylabel("frequency in the documents")
         st.pyplot(fig)
 
-    def check_personal_doc(self):
-        pass
+    def analyse_personal_doc(self):
+        st.header("Analyse your own document")
+
+        personal_doc = st.text_area(
+            label="Paste here the document you want to analyse",
+            value="",
+            max_chars=10000,
+        )
+
+        is_discarded = False
+
+        def is_doc_discarded(key, score):
+            if key[2]:  # max cutoff
+                return score > key[1]
+            else:
+                return score < key[1]
+
+        for key in self.keys:
+            if key[0] == "number_words":
+                words = ModifyingDocuments.get_words_from_document(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    lower_case=False,
+                    strip_characters=self.param["strip_characters"],
+                )
+                if key[2]:
+                    st.markdown(f"Number of words: {len(words)}")
+                if is_doc_discarded(key, len(words)):
+                    is_discarded = True
+
+            elif key[0] == "repetitions_ratio":
+                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
+                repetitions_ratio = round(repetitions_ratio, 3)
+                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                if is_doc_discarded(key, repetitions_ratio):
+                    is_discarded = True
+
+            elif key[0] == "special_characters_ratio":
+                special_characters_ratio = Filtering.compute_special_characters_ratio(
+                    personal_doc, self.param["special_characters"]
+                )
+                special_characters_ratio = round(special_characters_ratio, 3)
+                st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                if is_doc_discarded(key, special_characters_ratio):
+                    is_discarded = True
+
+            elif key[0] == "stopwords_ratio":
+                stopwords_ratio = Filtering.compute_stopwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.stopwords,
+                )
+                stopwords_ratio = round(stopwords_ratio, 3)
+                st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                if is_doc_discarded(key, stopwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "badwords_ratio":
+                badwords_ratio = Filtering.compute_badwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.badwords,
+                )
+                badwords_ratio = round(badwords_ratio, 3)
+                st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                if is_doc_discarded(key, badwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "lang_id_score":
+                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
+                    personal_doc, self.model_lang_id
+                )
+                lang_id_score = round(lang_id_score, 3)
+                st.markdown(f"Language identification confidence score: {lang_id_score}")
+                if is_doc_discarded(key, lang_id_score) or (self.lang_dataset_id != lang_pred_dataset_id):
+                    is_discarded = True
+
+            elif key[0] == "perplexity_score":
+                perplexity_score = Filtering.compute_perplexity_score(
+                    personal_doc,
+                    self.sentencepiece_model,
+                    self.kenlm_model,
+                )
+                perplexity_score = round(perplexity_score, 3)
+                st.markdown(f"Perplexity score: {perplexity_score}")
+                if is_doc_discarded(key, perplexity_score):
+                    is_discarded = True
+
+        is_discarded = "" if is_discarded else "not "
+        st.markdown(f"With the current filtering parameters, this document is {is_discarded}discarded.")
 
     def download_data(self):
         st.header("Download data")
@@ -413,7 +531,7 @@ class Visualization:
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
-        self.check_personal_doc()
+        self.analyse_personal_doc()
         self.download_data()
 
 
@@ -424,6 +542,12 @@ num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000
 
+# Only useful for analyse_personal_doc
+lang_dataset_id = "en"
+path_fasttext_model = "./lid.176.bin"
+path_sentencepiece_model = "./en.sp.model"
+path_kenlm_model = "./en.arpa.bin"
+
 visualization = Visualization(
     path_instructions,
     path_data,
@@ -431,5 +555,9 @@ visualization = Visualization(
     num_docs,
     num_docs_for_words,
     max_len_text_display,
+    lang_dataset_id,
+    path_fasttext_model,
+    path_sentencepiece_model,
+    path_kenlm_model,
 )
 visualization.visualization()
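The discard decision in the new analyse_personal_doc method comes down to the small is_doc_discarded helper shown in the diff above: each key carries a metric name, a cutoff value and a flag saying whether the cutoff is a maximum or a minimum. A minimal standalone sketch of that check (the key layout is taken from the diff; the numeric values are made up for illustration):

# Sketch of the cutoff check used by analyse_personal_doc.
# key = (metric_name, cutoff, is_max_cutoff, ...): with a max cutoff the
# document is discarded when its score exceeds the cutoff, with a min
# cutoff when the score falls below it.
def is_doc_discarded(key, score):
    if key[2]:  # max cutoff
        return score > key[1]
    return score < key[1]

# Illustrative values only.
print(is_doc_discarded(("repetitions_ratio", 0.3, True), 0.4))   # True: 0.4 exceeds the max cutoff of 0.3
print(is_doc_discarded(("stopwords_ratio", 0.3, False), 0.35))   # False: 0.35 stays above the min cutoff of 0.3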
parameters_filtering.py CHANGED
@@ -7,8 +7,8 @@ other_special_characters = (
     "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
     "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
     "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
-    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬x?▷Г♫∟™ª₪®「—"
-    "❖」﴾》"
+    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+    "」﴾》"
 )
 emoji = list(emoji.UNICODE_EMOJI["en"].keys())
 
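The parameters_filtering.py change only touches the other_special_characters literal: the stray letter x is dropped from the set and the ❖ character moves up a line when the string is rewrapped. That concatenated string presumably ends up in the special_characters parameter that app.py hands to Filtering.compute_special_characters_ratio. A rough, self-contained illustration of such a ratio (my own toy helper under that assumption, not the repository's implementation):

def special_characters_ratio(document, special_characters):
    # Fraction of the document's characters that belong to the special set;
    # an empty document is treated as having ratio 0.
    if len(document) == 0:
        return 0.0
    return sum(1 for char in document if char in special_characters) / len(document)

print(special_characters_ratio("hello!!!", "!?#"))  # 0.375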