HugoLaurencon HF staff committed on
Commit
5d56c36
1 Parent(s): 1bc0c1e

visualization: upload our own stop words and flagged words list

Browse files
Files changed (1) hide show
  1. app.py +55 -8
app.py CHANGED
@@ -4,6 +4,7 @@ import streamlit as st
4
 
5
  import os
6
 
 
7
  import base64
8
  import json
9
  import pandas as pd
@@ -65,10 +66,6 @@ class Visualization:
65
  )
66
 
67
  def preamble(self):
68
- st.markdown(
69
- "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
70
- )
71
-
72
  def get_binary_file_downloader_html(bin_file, file_label="File"):
73
  with open(bin_file, "rb") as f:
74
  data = f.read()
@@ -77,10 +74,12 @@ class Visualization:
77
  return href
78
 
79
  st.markdown(
80
- get_binary_file_downloader_html(
 
81
  self.path_instructions,
82
- "Download the explanation of the filtering pipeline as pdf",
83
- ),
 
84
  unsafe_allow_html=True,
85
  )
86
 
@@ -196,7 +195,9 @@ class Visualization:
196
  "few or no repetitions, simply because their length gives them more diversity, and we do "
197
  "not want to discard such documents."
198
  )
199
- self.docs = self.docs_checkpoint
 
 
200
  for i in range(len(self.docs["repetitions_ratio"])):
201
  self.docs["repetitions_ratio"].iloc[i] = self.docs[
202
  "repetitions_ratio"
@@ -237,6 +238,29 @@ class Visualization:
237
 
238
  if "stopwords_ratio" in columns:
239
  with st.sidebar.expander("Stop words ratio"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
241
  cutoff_stopwords_ratio = st.slider(
242
  cutoff_def, 0.0, 1.0, 0.0, step=0.01
@@ -250,6 +274,29 @@ class Visualization:
250
 
251
  if "flagged_words_ratio" in columns:
252
  with st.sidebar.expander("Flagged words ratio"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
254
  cutoff_flagged_words_ratio = st.slider(
255
  cutoff_def, 0.0, 1.0, 1.0, step=0.01
 
4
 
5
  import os
6
 
7
+ from io import StringIO
8
  import base64
9
  import json
10
  import pandas as pd
 
66
  )
67
 
68
  def preamble(self):
 
 
 
 
69
  def get_binary_file_downloader_html(bin_file, file_label="File"):
70
  with open(bin_file, "rb") as f:
71
  data = f.read()
 
74
  return href
75
 
76
  st.markdown(
77
+ "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this "
78
+ + get_binary_file_downloader_html(
79
  self.path_instructions,
80
+ "pdf",
81
+ )
82
+ + ".",
83
  unsafe_allow_html=True,
84
  )
85
 
 
195
  "few or no repetitions, simply because their length gives them more diversity, and we do "
196
  "not want to discard such documents."
197
  )
198
+ self.docs["repetitions_ratio"] = self.docs_checkpoint[
199
+ "repetitions_ratio"
200
+ ]
201
  for i in range(len(self.docs["repetitions_ratio"])):
202
  self.docs["repetitions_ratio"].iloc[i] = self.docs[
203
  "repetitions_ratio"
 
238
 
239
  if "stopwords_ratio" in columns:
240
  with st.sidebar.expander("Stop words ratio"):
241
+ stopwords_file = st.file_uploader(
242
+ "Upload your own list of stop words (one per line). If there is none, the default one is used."
243
+ )
244
+ if stopwords_file:
245
+ new_stopwords = StringIO(
246
+ stopwords_file.getvalue().decode("utf-8")
247
+ ).read()
248
+ new_stopwords = set(new_stopwords.split("\n"))
249
+ self.docs["stopwords_ratio"] = self.docs_checkpoint[
250
+ "stopwords_ratio"
251
+ ]
252
+ for i in range(len(self.docs["stopwords_ratio"])):
253
+ self.docs["stopwords_ratio"].iloc[
254
+ i
255
+ ] = Filtering.compute_stopwords_ratio(
256
+ self.docs["text"].iloc[i],
257
+ self.sentencepiece_model_tok,
258
+ self.param["strip_characters"],
259
+ self.param["cond_words_augmentation"],
260
+ self.param["words_augmentation_group_sizes"],
261
+ self.param["words_augmentation_join_char"],
262
+ new_stopwords,
263
+ )
264
  cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
265
  cutoff_stopwords_ratio = st.slider(
266
  cutoff_def, 0.0, 1.0, 0.0, step=0.01
 
274
 
275
  if "flagged_words_ratio" in columns:
276
  with st.sidebar.expander("Flagged words ratio"):
277
+ flagged_words_file = st.file_uploader(
278
+ "Upload your own list of flagged words (one per line). If there is none, the default one is used."
279
+ )
280
+ if flagged_words_file:
281
+ new_flagged_words = StringIO(
282
+ flagged_words_file.getvalue().decode("utf-8")
283
+ ).read()
284
+ new_flagged_words = set(new_flagged_words.split("\n"))
285
+ self.docs["flagged_words_ratio"] = self.docs_checkpoint[
286
+ "flagged_words_ratio"
287
+ ]
288
+ for i in range(len(self.docs["flagged_words_ratio"])):
289
+ self.docs["flagged_words_ratio"].iloc[
290
+ i
291
+ ] = Filtering.compute_flagged_words_ratio(
292
+ self.docs["text"].iloc[i],
293
+ self.sentencepiece_model_tok,
294
+ self.param["strip_characters"],
295
+ self.param["cond_words_augmentation"],
296
+ self.param["words_augmentation_group_sizes"],
297
+ self.param["words_augmentation_join_char"],
298
+ new_flagged_words,
299
+ )
300
  cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
301
  cutoff_flagged_words_ratio = st.slider(
302
  cutoff_def, 0.0, 1.0, 1.0, step=0.01