teven committed on
Commit
64ce142
1 Parent(s): 1fed88b

better sliders

Browse files
Files changed (1) hide show
  1. app.py +28 -13
app.py CHANGED
@@ -7,7 +7,6 @@ import matplotlib.pyplot as plt
7
 
8
 
9
  def visualization(path_data, lang, num_docs, num_docs_for_words):
10
-
11
  with open(path_data) as json_file:
12
  data = json.load(json_file)
13
 
@@ -29,10 +28,9 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
29
 
30
  st.header("Filtering based on document content")
31
 
32
-
33
  if "special_%" in columns:
34
  special_ratio = st.sidebar.slider(
35
- "% filtered by special characters ratio", 0.0, 100.0, 0.0, step=1.0
36
  )
37
  cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
38
  special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
@@ -41,16 +39,33 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
41
 
42
  if "stop_%" in columns:
43
  stop_ratio = st.sidebar.slider(
44
- "% filtered by stop word ratio", 0.0, 100.0, 0.0, step=1.0
45
  )
46
  cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
47
  stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
48
  st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
49
  keys.append(("stop_%", stop_cutoff, False))
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if "bad_%" in columns:
52
  bad_ratio = st.sidebar.slider(
53
- "% filtered by badwords ratio", 0.0, 100.0, 0.0, step=1.0
54
  )
55
  bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
56
  bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
@@ -59,7 +74,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
59
 
60
  if "perplexity" in columns:
61
  ppl_ratio = st.sidebar.slider(
62
- "% filtered by perplexity", 0.0, 100.0, 0.0, step=1.0
63
  )
64
  ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
65
  ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
@@ -82,13 +97,13 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
82
  st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
83
  st.dataframe(data_keep)
84
 
85
- def plot_hist(dataframe, key, num_bins=50):
86
- st.subheader(" ".join(key.split("_")))
87
- hist_values = dataframe[key].values
88
- max_range = np.max(hist_values)
89
- hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
90
- st.bar_chart(hist_values)
91
- st.markdown(f"Each bin is of size: {max_range/num_bins}.")
92
 
93
  # for key, _, _ in keys:
94
  # plot_hist(data, key)
 
7
 
8
 
9
  def visualization(path_data, lang, num_docs, num_docs_for_words):
 
10
  with open(path_data) as json_file:
11
  data = json.load(json_file)
12
 
 
28
 
29
  st.header("Filtering based on document content")
30
 
 
31
  if "special_%" in columns:
32
  special_ratio = st.sidebar.slider(
33
+ "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1.0
34
  )
35
  cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
36
  special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
 
39
 
40
  if "stop_%" in columns:
41
  stop_ratio = st.sidebar.slider(
42
+ "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1.0
43
  )
44
  cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
45
  stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
46
  st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
47
  keys.append(("stop_%", stop_cutoff, False))
48
 
49
+ # def recalculate_bad_words(file):
50
+ #
51
+ # def bad_word_ratio(text: str, bad_word_list):
52
+ # return sum(
53
+ # [text.count(bad_word.decode()) * len(bad_word.decode().split()) for bad_word in bad_word_list]) / len(
54
+ # text.split())
55
+ #
56
+ # bad_word_list = file.readlines()
57
+ #
58
+ # bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
59
+ # data["bad_%"] = bad_word_ratios
60
+ #
61
+ # bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
62
+ #
63
+ # if bad_word_file is not None:
64
+ # recalculate_bad_words(bad_word_file)
65
+
66
  if "bad_%" in columns:
67
  bad_ratio = st.sidebar.slider(
68
+ "% filtered by badwords ratio", 0.0, 50.0, 0.0, step=0.1
69
  )
70
  bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
71
  bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
 
74
 
75
  if "perplexity" in columns:
76
  ppl_ratio = st.sidebar.slider(
77
+ "% filtered by perplexity", 0.0, 50.0, 0.0, step=1.0
78
  )
79
  ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
80
  ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
 
97
  st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
98
  st.dataframe(data_keep)
99
 
100
+ # def plot_hist(dataframe, key, num_bins=50):
101
+ # st.subheader(" ".join(key.split("_")))
102
+ # hist_values = dataframe[key].values
103
+ # max_range = np.max(hist_values)
104
+ # hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
105
+ # st.bar_chart(hist_values)
106
+ # st.markdown(f"Each bin is of size: {max_range/num_bins}.")
107
 
108
  # for key, _, _ in keys:
109
  # plot_hist(data, key)