HugoLaurencon committed
Commit 14574d7
1 Parent(s): 6303415

visu with discarded documents by filter

Files changed (1): app.py (+56, -29)
app.py CHANGED
@@ -66,7 +66,7 @@ class Visualization:
         def set_sliders(docs):
             columns = list(docs)
             keys = []
-            conds = []
+            conds = {}
 
             def get_cond(key, cutoff, max_cutoff):
                 if max_cutoff:
@@ -87,9 +87,8 @@ class Visualization:
                 )
                 new_key = ("number_words", cutoff_min_number_words, False)
                 keys.append(new_key)
-                cond = get_cond(new_key[0], new_key[1], new_key[2])
-                conds.append(cond)
-                print_discared_by_cond(cond)
+                cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
+                print_discared_by_cond(cond_1)
 
                 cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
                 cutoff_max_number_words = st.sidebar.slider(
@@ -97,9 +96,10 @@ class Visualization:
                 )
                 new_key = ("number_words", cutoff_max_number_words, True)
                 keys.append(new_key)
-                cond = get_cond(new_key[0], new_key[1], new_key[2])
-                conds.append(cond)
-                print_discared_by_cond(cond)
+                cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
+                print_discared_by_cond(cond_2)
+
+                conds["number_words"] = [cond_1, cond_2]
 
             if "special_characters_ratio" in columns:
                 cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
@@ -113,8 +113,8 @@ class Visualization:
                 )
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                conds.append(cond)
                 print_discared_by_cond(cond)
+                conds["special_characters_ratio"] = [cond]
 
             if "stopwords_ratio" in columns:
                 cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
@@ -124,8 +124,8 @@ class Visualization:
                 new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                conds.append(cond)
                 print_discared_by_cond(cond)
+                conds["stopwords_ratio"] = [cond]
 
             if "badwords_ratio" in columns:
                 cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
@@ -135,8 +135,8 @@ class Visualization:
                 new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                conds.append(cond)
                 print_discared_by_cond(cond)
+                conds["badwords_ratio"] = [cond]
 
             if "lang_id_score" in columns:
                 cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
@@ -146,8 +146,8 @@ class Visualization:
                 new_key = ("lang_id_score", cutoff_lang_id_score, False)
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                conds.append(cond)
                 print_discared_by_cond(cond)
+                conds["lang_id_score"] = [cond]
 
             if "perplexity_score" in columns:
                 cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
@@ -158,34 +158,61 @@ class Visualization:
                 new_key = ("perplexity_score", cutoff_perplexity_score, True)
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                conds.append(cond)
                 print_discared_by_cond(cond)
+                conds["perplexity_score"] = [cond]
 
             return keys, conds
 
         self.keys, conds = set_sliders(self.docs)
 
-        conds = np.all(conds, axis=0)
+        all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
+        all_conds = np.all(all_conds, axis=0)
 
         st.header("Filtering on documents")
 
-        self.discarded_docs = self.docs.loc[np.invert(conds)]
-        st.subheader(
-            f"Discarded documents: {len(self.discarded_docs)} docs ({len(self.discarded_docs) / self.num_docs * 100:.2f}%)"
-        )
-        st.markdown(
-            "Click on a column to sort by it, place the cursor on the text to display it."
-        )
-        st.dataframe(self.discarded_docs)
+        def display_dataset(cond, description):
+            displayed_docs = self.docs.loc[cond]
+            st.subheader(
+                f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
+            )
+            st.markdown(
+                "Click on a column to sort by it, place the cursor on the text to display it."
+            )
+            st.dataframe(displayed_docs)
+
+        display_dataset(np.invert(all_conds), "Discarded documents")
 
-        self.retained_docs = self.docs.loc[conds]
-        st.subheader(
-            f"Retained documents: {len(self.retained_docs)} docs ({len(self.retained_docs) / self.num_docs * 100:.2f}%)"
-        )
-        st.markdown(
-            "Click on a column to sort by it, place the cursor on the text to display it."
-        )
-        st.dataframe(self.retained_docs)
+        #st.subheader("Display discarded documents by filter")
+        display_discarded_documents_by_filter = st.checkbox("Display discarded documents by filter")
+
+        if display_discarded_documents_by_filter:
+            columns = list(self.docs)
+
+            if "number_words" in columns:
+                cond_filter = np.invert(np.all(conds["number_words"], axis=0))
+                display_dataset(cond_filter, "Discarded documents for the filter on the number of words")
+
+            if "special_characters_ratio" in columns:
+                cond_filter = np.invert(np.all(conds["special_characters_ratio"], axis=0))
+                display_dataset(cond_filter, "Discarded documents for the filter on the special characters ratio")
+
+            if "stopwords_ratio" in columns:
+                cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
+                display_dataset(cond_filter, "Discarded documents for the filter on the stop words ratio")
+
+            if "badwords_ratio" in columns:
+                cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
+                display_dataset(cond_filter, "Discarded documents for the filter on the bad words ratio")
+
+            if "lang_id_score" in columns:
+                cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
+                display_dataset(cond_filter, "Discarded documents for the filter on the language identification confidence score")
+
+            if "perplexity_score" in columns:
+                cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
+                display_dataset(cond_filter, "Discarded documents for the filter on the perplexity score")
+
+        display_dataset(all_conds, "Retained documents")
 
     def filtering_of_words(self):
         st.sidebar.subheader("Parameter of the filtering on words")
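For context, the core of the change is that conds now keeps one list of boolean masks per filter instead of a single flat list, so the documents discarded by each individual filter can be displayed on demand. A minimal standalone sketch of that pattern follows; the toy DataFrame, column values, and cutoffs are illustrative assumptions, not the app's real data or defaults.

import numpy as np
import pandas as pd

# Hypothetical per-document statistics; the real app loads these from its dataset.
docs = pd.DataFrame(
    {
        "number_words": [5, 120, 40000, 300],
        "special_characters_ratio": [0.1, 0.4, 0.2, 0.9],
    }
)

# One list of boolean masks per filter, mirroring the new `conds` dict in app.py.
conds = {
    "number_words": [docs["number_words"] >= 10, docs["number_words"] <= 10000],
    "special_characters_ratio": [docs["special_characters_ratio"] <= 0.5],
}

# Logical AND over every mask of every filter: documents retained overall.
all_conds = np.all([mask for masks in conds.values() for mask in masks], axis=0)
retained = docs.loc[all_conds]
discarded = docs.loc[np.invert(all_conds)]

# Documents discarded specifically by one filter, as in the per-filter checkbox view.
discarded_by_number_words = docs.loc[np.invert(np.all(conds["number_words"], axis=0))]

print(len(retained), len(discarded), len(discarded_by_number_words))

In the Streamlit app itself, display_dataset wraps the st.subheader/st.markdown/st.dataframe calls so the same rendering code serves the overall discarded set, each per-filter discarded set, and the retained set.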