teven committed on
Commit
a446a8b
1 Parent(s): 64ce142

update app

Browse files
Files changed (1) hide show
  1. app.py +20 -19
app.py CHANGED
@@ -43,25 +43,26 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
43
  )
44
  cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
45
  stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
46
- st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
47
  keys.append(("stop_%", stop_cutoff, False))
48
 
49
- # def recalculate_bad_words(file):
50
- #
51
- # def bad_word_ratio(text: str, bad_word_list):
52
- # return sum(
53
- # [text.count(bad_word.decode()) * len(bad_word.decode().split()) for bad_word in bad_word_list]) / len(
54
- # text.split())
55
- #
56
- # bad_word_list = file.readlines()
57
- #
58
- # bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
59
- # data["bad_%"] = bad_word_ratios
60
- #
61
- # bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
62
- #
63
- # if bad_word_file is not None:
64
- # recalculate_bad_words(bad_word_file)
 
65
 
66
  if "bad_%" in columns:
67
  bad_ratio = st.sidebar.slider(
@@ -69,7 +70,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
69
  )
70
  bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
71
  bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
72
- st.sidebar.text(f"Kept text with <{bad_cutoff:.1f}% bad words")
73
  keys.append(("bad_%", bad_cutoff, True))
74
 
75
  if "perplexity" in columns:
@@ -133,7 +134,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
133
  )
134
 
135
 
136
- path_data = "./en_examples_with_stats_no_small_docs.json"
137
  lang = "English"
138
  num_docs = 5000
139
  num_docs_for_words = 500
 
43
  )
44
  cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
45
  stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
46
+ st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
47
  keys.append(("stop_%", stop_cutoff, False))
48
 
49
+ @st.cache(suppress_st_warning=True)
50
+ def recalculate_bad_words(file):
51
+
52
+ def bad_word_ratio(text: str, bad_word_list):
53
+ return len([word for word in text.split() if word.lower().strip() in bad_word_list]) / len(text.split())
54
+
55
+ bad_word_list = [word.decode().strip() for word in file.readlines()]
56
+
57
+ bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
58
+ data["bad_%"] = bad_word_ratios
59
+
60
+ bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
61
+
62
+ st.session_state.old_bad_word_file = None
63
+ if bad_word_file != st.write(st.session_state.old_bad_word_file):
64
+ recalculate_bad_words(bad_word_file)
65
+ st.session_state.old_bad_word_file = bad_word_file
66
 
67
  if "bad_%" in columns:
68
  bad_ratio = st.sidebar.slider(
 
70
  )
71
  bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
72
  bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
73
+ st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words")
74
  keys.append(("bad_%", bad_cutoff, True))
75
 
76
  if "perplexity" in columns:
 
134
  )
135
 
136
 
137
+ path_data = "./en_examples_with_stats_ldnoob.json"
138
  lang = "English"
139
  num_docs = 5000
140
  num_docs_for_words = 500