Spaces:
Runtime error
Runtime error
update app
Browse files
app.py
CHANGED
@@ -43,25 +43,26 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
43 |
)
|
44 |
cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
|
45 |
stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
|
46 |
-
st.sidebar.text(f"Kept text with >{stop_cutoff:.
|
47 |
keys.append(("stop_%", stop_cutoff, False))
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
65 |
|
66 |
if "bad_%" in columns:
|
67 |
bad_ratio = st.sidebar.slider(
|
@@ -69,7 +70,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
69 |
)
|
70 |
bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
|
71 |
bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
|
72 |
-
st.sidebar.text(f"Kept text with <{bad_cutoff:.
|
73 |
keys.append(("bad_%", bad_cutoff, True))
|
74 |
|
75 |
if "perplexity" in columns:
|
@@ -133,7 +134,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
133 |
)
|
134 |
|
135 |
|
136 |
-
path_data = "./
|
137 |
lang = "English"
|
138 |
num_docs = 5000
|
139 |
num_docs_for_words = 500
|
|
|
43 |
)
|
44 |
cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
|
45 |
stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
|
46 |
+
st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
|
47 |
keys.append(("stop_%", stop_cutoff, False))
|
48 |
|
49 |
+
@st.cache(suppress_st_warning=True)
def recalculate_bad_words(file):
    """Recompute the ``bad_%`` column of ``data`` from an uploaded word list.

    Args:
        file: Binary file-like object holding one bad word per line. Each
            line is decoded with ``bytes.decode()`` and stripped.

    Side effects:
        Overwrites ``data["bad_%"]`` (``data`` is taken from the enclosing
        scope) with, for every entry of ``data["text"]``, the percentage of
        its whitespace-separated words that appear in the uploaded list.
    """

    def bad_word_ratio(text: str, bad_words):
        """Return the fraction of words in *text* that are in *bad_words*."""
        words = text.split()
        # An empty or whitespace-only document has no words; report 0
        # instead of raising ZeroDivisionError.
        if not words:
            return 0.0
        # Tokens produced by split() carry no surrounding whitespace, so
        # lower-casing is the only normalisation needed before the lookup.
        return sum(word.lower() in bad_words for word in words) / len(words)

    # Build a set once so each membership test is O(1). Entries are
    # lower-cased because the words from the text are lower-cased before the
    # lookup (a capitalised entry could otherwise never match), and blank
    # lines are dropped.
    bad_words = {
        entry
        for entry in (line.decode().strip().lower() for line in file.readlines())
        if entry
    }

    data["bad_%"] = [bad_word_ratio(text, bad_words) * 100 for text in data["text"]]
|
59 |
+
|
60 |
+
# Optional user-supplied bad-word list; file_uploader yields None until a
# file has been uploaded.
bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")

# NOTE(review): this assignment runs on every rerun, so the value compared
# below is always None and the "previous upload" is never actually
# remembered. Guarding it with `"old_bad_word_file" not in st.session_state`
# would change when recalculate_bad_words runs - confirm the intended
# behaviour before changing it.
st.session_state.old_bad_word_file = None
# Compare against the stored value itself. Wrapping it in st.write() rendered
# a stray "None" in the app on every rerun and compared against the return
# value of st.write rather than the stored file.
if bad_word_file != st.session_state.old_bad_word_file:
    recalculate_bad_words(bad_word_file)
    st.session_state.old_bad_word_file = bad_word_file
|
66 |
|
67 |
if "bad_%" in columns:
|
68 |
bad_ratio = st.sidebar.slider(
|
|
|
70 |
)
|
71 |
bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
|
72 |
bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
|
73 |
+
st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words")
|
74 |
keys.append(("bad_%", bad_cutoff, True))
|
75 |
|
76 |
if "perplexity" in columns:
|
|
|
134 |
)
|
135 |
|
136 |
|
137 |
+
path_data = "./en_examples_with_stats_ldnoob.json"
|
138 |
lang = "English"
|
139 |
num_docs = 5000
|
140 |
num_docs_for_words = 500
|