HugoLaurencon committed
Commit 061d2e4
1 Parent(s): 4809033

add register information

app.py CHANGED
@@ -120,8 +120,6 @@ class Visualization_for_lang:
         st.dataframe(displayed_examples)
 
     def filtering_of_docs(self):
-        st.sidebar.subheader("Parameters of the filtering on documents")
-
         def set_sliders():
             columns = list(self.docs)
             keys = []
@@ -377,12 +375,6 @@ class Visualization_for_lang:
 
             return keys, conds
 
-        self.keys, conds = set_sliders()
-        self.parameters = self.keys * 1
-
-        all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
-        all_conds = np.all(all_conds, axis=0)
-
         with st.expander(
             f"Filtering on documents, for {self.num_docs} {self.lang} documents"
         ):
@@ -390,101 +382,146 @@ class Visualization_for_lang:
                 f"Filtering on documents, for {self.num_docs} {self.lang} documents"
             )
 
-            Visualization_for_lang.display_dataset(
-                self.docs, np.invert(all_conds), "Discarded documents", "docs"
-            )
-
-            # st.subheader("Display discarded documents by filter")
-            display_discarded_documents_by_filter = st.checkbox(
-                "Display discarded documents by filter"
-            )
-
-            if display_discarded_documents_by_filter:
-                columns = list(self.docs)
-
-                if "number_words" in columns:
-                    cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the number of words",
-                        "docs",
-                    )
-
-                if "character_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["character_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the character repetition ratio",
-                        "docs",
-                    )
-
-                if "word_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["word_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the word repetition ratio",
-                        "docs",
-                    )
-
-                if "special_characters_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["special_characters_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the special characters ratio",
-                        "docs",
-                    )
-
-                if "stopwords_ratio" in columns:
-                    cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the stop words ratio",
-                        "docs",
-                    )
-
-                if "flagged_words_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["flagged_words_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the flagged words ratio",
-                        "docs",
-                    )
-
-                if "lang_id_score" in columns:
-                    cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the language identification confidence score",
-                        "docs",
-                    )
-
-                if "perplexity_score" in columns:
-                    cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the perplexity score",
-                        "docs",
-                    )
-
-            Visualization_for_lang.display_dataset(
-                self.docs, all_conds, "Retained documents", "docs"
-            )
+            if "labels" in list(self.docs):
+                chosen_label = st.selectbox(
+                    label="Consider only documents that include the following label",
+                    options=[
+                        "All",
+                        "NA: Narrative",
+                        "IN: Informational Description",
+                        "OP: Opinion",
+                        "ID: Interactive Discussion",
+                        "HI: How-to/Instruction",
+                        "IP: Informational Persuasion",
+                        "LY: Lyrical",
+                        "SP: Spoken",
+                    ],
+                )
+                chosen_label = chosen_label.split(":")[0]
+                if chosen_label != "All":
+                    cond_label = list(
+                        self.docs["labels"].apply(
+                            lambda x: True if chosen_label in x else False
+                        )
+                    )
+                    self.docs = self.docs[cond_label]
+
+            if self.docs.empty:
+                st.markdown(
+                    "No document to display, please try to select a different label."
+                )
+                self.keys = []
+                self.parameters = []
+
+            else:
+                st.sidebar.subheader("Parameters of the filtering on documents")
+                self.keys, conds = set_sliders()
+                self.parameters = self.keys * 1
+
+                all_conds = [
+                    subcond for cond in list(conds.values()) for subcond in cond
+                ]
+                all_conds = np.all(all_conds, axis=0)
+
+                Visualization_for_lang.display_dataset(
+                    self.docs, np.invert(all_conds), "Discarded documents", "docs"
+                )
+
+                # st.subheader("Display discarded documents by filter")
+                display_discarded_documents_by_filter = st.checkbox(
+                    "Display discarded documents by filter"
+                )
+
+                if display_discarded_documents_by_filter:
+                    columns = list(self.docs)
+
+                    if "number_words" in columns:
+                        cond_filter = np.invert(np.all(conds["number_words"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the number of words",
+                            "docs",
+                        )
+
+                    if "character_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["character_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the character repetition ratio",
+                            "docs",
+                        )
+
+                    if "word_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["word_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the word repetition ratio",
+                            "docs",
+                        )
+
+                    if "special_characters_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["special_characters_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the special characters ratio",
+                            "docs",
+                        )
+
+                    if "stopwords_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["stopwords_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the stop words ratio",
+                            "docs",
+                        )
+
+                    if "flagged_words_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["flagged_words_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the flagged words ratio",
+                            "docs",
+                        )
+
+                    if "lang_id_score" in columns:
+                        cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the language identification confidence score",
+                            "docs",
+                        )
+
+                    if "perplexity_score" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["perplexity_score"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the perplexity score",
+                            "docs",
+                        )
+
+                Visualization_for_lang.display_dataset(
+                    self.docs, all_conds, "Retained documents", "docs"
+                )
 
         st.header("Download data")
 
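The new logic in filtering_of_docs can be exercised outside Streamlit. Below is a minimal sketch, assuming a toy pandas DataFrame in place of the app's per-language statistics file and hand-picked thresholds in place of the sidebar sliders; only the `labels`, `conds`, `chosen_label`, and `all_conds` names come from the diff above, everything else is illustrative.

```python
# Minimal sketch of the register filtering this commit adds, without Streamlit.
# The toy DataFrame and the threshold values are assumptions for illustration;
# in app.py the conditions come from the sliders built by set_sliders().
import numpy as np
import pandas as pd

docs = pd.DataFrame(
    {
        "text": ["a news story", "a forum thread", "song lyrics"],
        "labels": [["NA"], ["ID", "OP"], ["LY"]],
        "number_words": [350, 120, 80],
        "perplexity_score": [210.0, 540.0, 980.0],
    }
)

# Keep only documents whose register labels include the chosen code,
# mirroring self.docs["labels"].apply(lambda x: chosen_label in x).
chosen_label = "ID: Interactive Discussion".split(":")[0]  # -> "ID"
if chosen_label != "All":
    docs = docs[docs["labels"].apply(lambda x: chosen_label in x)]

# Each filter contributes one or more boolean arrays over the documents;
# np.all over axis 0 ANDs them into a single "retained" mask.
conds = {
    "number_words": [docs["number_words"] >= 100],          # assumed slider value
    "perplexity_score": [docs["perplexity_score"] <= 600],  # assumed slider value
}
all_conds = [subcond for cond in conds.values() for subcond in cond]
all_conds = np.all(all_conds, axis=0)

print(docs[all_conds])             # retained documents
print(docs[np.invert(all_conds)])  # discarded documents
```

Because the masks are combined with np.all, a document is retained only if it passes every active filter; the label filter, by contrast, shrinks self.docs up front, which is why the empty-DataFrame guard was added before the sliders are built.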
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ac12d82e24642fd0b1d4f6c5b8fbe1edb42dc15a38185ccc8ec95ac0fe687bc2
-size 241407829
+oid sha256:cd798b2bc010480cf0777b41bac9dfde2ab1c0ba17e151400b9e1359aa1a114c
+size 276101032
zh_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85f70e561c971b468ba69963841b73e6a6da0a230f19f191234701e926688feb
-size 63554172
+oid sha256:e8b02e485e2736cc5e407a567adcb09d228ce0e2eb6ed7609749e77028446175
+size 74914733