Spaces:
Runtime error
Commit fa81556 · 1 parent: 0610f9d
visualization: small step for the slider on flagged words ratio
Browse files
app.py
CHANGED
|
@@ -282,9 +282,16 @@ class Visualization_for_lang:
|
|
| 282 |
new_flagged_words,
|
| 283 |
)
|
| 284 |
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
|
| 285 |
-
max_fwr =
|
|
|
|
|
|
|
| 286 |
cutoff_flagged_words_ratio = st.slider(
|
| 287 |
-
cutoff_def,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
)
|
| 289 |
new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
|
| 290 |
keys.append(new_key)
|
|
@@ -481,7 +488,9 @@ class Visualization_for_lang:
|
|
| 481 |
for i in range(len(self.words["incorrect_substrings"]))
|
| 482 |
]
|
| 483 |
)
|
| 484 |
-
Visualization_for_lang.print_discarded_by_cond(
|
|
|
|
|
|
|
| 485 |
conds_words["incorrect_substrings"] = cond_incorrect_substrings
|
| 486 |
|
| 487 |
all_conds_words = np.all(list(conds_words.values()), axis=0)
|
|
@@ -723,7 +732,10 @@ class Visualization:
|
|
| 723 |
)
|
| 724 |
|
| 725 |
def choose_lang(self):
|
| 726 |
-
options = [
|
|
|
|
|
|
|
|
|
|
| 727 |
index = options.index("English") if ("English" in options) else 0
|
| 728 |
lang_chosen = st.selectbox(
|
| 729 |
label="Select the language for visualization",
|
|
@@ -731,23 +743,37 @@ class Visualization:
|
|
| 731 |
index=index,
|
| 732 |
)
|
| 733 |
if lang_chosen != "None":
|
| 734 |
-
lang_chosen_dataset_id = langs_id.loc[
|
|
|
|
|
|
|
| 735 |
visualization_for_lang = Visualization_for_lang(
|
| 736 |
-
path_data
|
| 737 |
-
lang
|
| 738 |
-
num_docs
|
| 739 |
-
num_docs_for_words
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 745 |
)
|
| 746 |
visualization_for_lang.visualization_for_lang()
|
| 747 |
|
| 748 |
def visualization(self):
|
| 749 |
self.preamble()
|
| 750 |
-
self.warning_preamble()
|
| 751 |
self.choose_lang()
|
| 752 |
|
| 753 |
|
|
|
|
| 282 |
new_flagged_words,
|
| 283 |
)
|
| 284 |
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
|
| 285 |
+
max_fwr = np.max(self.docs["flagged_words_ratio"])
|
| 286 |
+
max_fwr = np.ceil(max_fwr * 1000) / 1000
|
| 287 |
+
max_fwr = float(max_fwr)
|
| 288 |
cutoff_flagged_words_ratio = st.slider(
|
| 289 |
+
cutoff_def,
|
| 290 |
+
0.000,
|
| 291 |
+
max_fwr,
|
| 292 |
+
max_fwr,
|
| 293 |
+
step=0.001,
|
| 294 |
+
format="%f",
|
| 295 |
)
|
| 296 |
new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
|
| 297 |
keys.append(new_key)
|
|
|
|
| 488 |
for i in range(len(self.words["incorrect_substrings"]))
|
| 489 |
]
|
| 490 |
)
|
| 491 |
+
Visualization_for_lang.print_discarded_by_cond(
|
| 492 |
+
cond_incorrect_substrings
|
| 493 |
+
)
|
| 494 |
conds_words["incorrect_substrings"] = cond_incorrect_substrings
|
| 495 |
|
| 496 |
all_conds_words = np.all(list(conds_words.values()), axis=0)
|
|
|
|
| 732 |
)
|
| 733 |
|
| 734 |
def choose_lang(self):
|
| 735 |
+
options = [
|
| 736 |
+
self.param_visu_langs[lang_dataset_id]["lang"]
|
| 737 |
+
for lang_dataset_id in self.param_visu_langs
|
| 738 |
+
]
|
| 739 |
index = options.index("English") if ("English" in options) else 0
|
| 740 |
lang_chosen = st.selectbox(
|
| 741 |
label="Select the language for visualization",
|
|
|
|
| 743 |
index=index,
|
| 744 |
)
|
| 745 |
if lang_chosen != "None":
|
| 746 |
+
lang_chosen_dataset_id = langs_id.loc[
|
| 747 |
+
langs_id["lang"] == lang_chosen, "dataset_id"
|
| 748 |
+
].iloc[0]
|
| 749 |
visualization_for_lang = Visualization_for_lang(
|
| 750 |
+
path_data=self.param_visu_langs[lang_chosen_dataset_id]["path_data"],
|
| 751 |
+
lang=self.param_visu_langs[lang_chosen_dataset_id]["lang"],
|
| 752 |
+
num_docs=self.param_visu_langs[lang_chosen_dataset_id]["num_docs"],
|
| 753 |
+
num_docs_for_words=self.param_visu_langs[lang_chosen_dataset_id][
|
| 754 |
+
"num_docs_for_words"
|
| 755 |
+
],
|
| 756 |
+
max_len_text_display=self.param_visu_langs[lang_chosen_dataset_id][
|
| 757 |
+
"max_len_text_display"
|
| 758 |
+
],
|
| 759 |
+
lang_dataset_id=self.param_visu_langs[lang_chosen_dataset_id][
|
| 760 |
+
"lang_dataset_id"
|
| 761 |
+
],
|
| 762 |
+
path_fasttext_model=self.param_visu_langs[lang_chosen_dataset_id][
|
| 763 |
+
"path_fasttext_model"
|
| 764 |
+
],
|
| 765 |
+
path_sentencepiece_model=self.param_visu_langs[lang_chosen_dataset_id][
|
| 766 |
+
"path_sentencepiece_model"
|
| 767 |
+
],
|
| 768 |
+
path_kenlm_model=self.param_visu_langs[lang_chosen_dataset_id][
|
| 769 |
+
"path_kenlm_model"
|
| 770 |
+
],
|
| 771 |
)
|
| 772 |
visualization_for_lang.visualization_for_lang()
|
| 773 |
|
| 774 |
def visualization(self):
|
| 775 |
self.preamble()
|
| 776 |
+
# self.warning_preamble()
|
| 777 |
self.choose_lang()
|
| 778 |
|
| 779 |
|