meg-huggingface committed on
Commit e1f2cc3
1 Parent(s): 6af9ef6

Removing any need for a dataframe in expander_general_stats; instead making sure to cache and load the small set of details needed for this widget. Note I also moved a couple of functions around -- same content, just moved -- so that it was easier for me to navigate through the code. I also pulled a couple of sub-functions out of larger functions, again to make the code easier to work with and understand, as well as helping to further modularize so we can limit what needs to be cached.

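At its core, the change applies the repo's usual cache-or-compute pattern to the handful of small artifacts this widget needs: load them from disk when they exist, otherwise compute them once and write them out. The sketch below is a minimal, self-contained illustration of that pattern, not the actual repo code: the cache file names mirror the ones added in this commit, while compute_general_stats() is a hypothetical stand-in for the real preparation step.

import json
from os.path import exists
from os.path import join as pjoin

import pandas as pd


def compute_general_stats():
    # Hypothetical placeholder for the real preparation step
    # (vocab counts, NaN count, duplicate counts).
    top_vocab = pd.DataFrame({"vocab": ["example"], "count": [1]})
    stats = {"total words": 1, "total open words": 1}
    return stats, top_vocab


def load_or_prepare_general_stats(cache_dir, use_cache=False, save=True):
    # Load the small general-stats artifacts from cache if present;
    # otherwise compute them once and (optionally) write them back out.
    stats_fid = pjoin(cache_dir, "general_stats_dict.json")
    top_vocab_fid = pjoin(cache_dir, "sorted_top_vocab.feather")
    if use_cache and exists(stats_fid) and exists(top_vocab_fid):
        with open(stats_fid, encoding="utf-8") as f:
            stats = json.load(f)
        top_vocab = pd.read_feather(top_vocab_fid)
    else:
        stats, top_vocab = compute_general_stats()
        if save:
            top_vocab.to_feather(top_vocab_fid)
            with open(stats_fid, "w", encoding="utf-8") as f:
                json.dump(stats, f)
    return stats, top_vocab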
app.py CHANGED
@@ -143,7 +143,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
     logs.info("showing header")
     st_utils.expander_header(dstats, ds_name_to_dict, column_id)
     logs.info("showing general stats")
-    st_utils.expander_general_stats(dstats, _SHOW_TOP_N_WORDS, column_id)
+    st_utils.expander_general_stats(dstats, column_id)
     st_utils.expander_label_distribution(dstats.label_df, dstats.fig_labels, column_id)
     st_utils.expander_text_lengths(
         dstats.tokenized_df,
@@ -154,7 +154,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
         LENGTH_FIELD,
         column_id,
     )
-    st_utils.expander_text_duplicates(dstats.text_dup_counts_df, column_id)
+    st_utils.expander_text_duplicates(dstats, column_id)
 
     # We do the loading of these after the others in order to have some time
     # to compute while the user works with the details above.
@@ -191,7 +191,6 @@ def main():
 
     # When not doing new development, use the cache.
     use_cache = True
-    # TODO: Better handling of this eg, st.sidebar.checkbox("Show clustering")=
     show_embeddings = st.sidebar.checkbox("Show embeddings")
     # List of datasets for which embeddings are hard to compute:
 
data_measurements/dataset_statistics.py CHANGED
@@ -33,6 +33,8 @@ from nltk.corpus import stopwords
 from sklearn.feature_extraction.text import CountVectorizer
 
 from .dataset_utils import (
+    TOT_WORDS,
+    TOT_OPEN_WORDS,
     CNT,
     DEDUP_TOT,
     EMBEDDING_FIELD,
@@ -143,13 +145,9 @@ _TREE_MIN_NODES = 250
 # as long as we're using sklearn - already pushing the resources
 _MAX_CLUSTER_EXAMPLES = 5000
 _NUM_VOCAB_BATCHES = 2000
-
-
+_TOP_N = 100
 _CVEC = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)
 
-num_rows = 200000
-
-
 class DatasetStatisticsCacheClass:
     def __init__(
         self,
@@ -193,7 +191,7 @@ class DatasetStatisticsCacheClass:
         self.label_dset = None
         ## Data frames
         # Tokenized text
-        self.tokenized_df = []
+        self.tokenized_df = None
         # save sentence length histogram in the class so it doesn't ge re-computed
         self.fig_tok_length = None
         # Data Frame version of self.label_dset
@@ -205,12 +203,14 @@ class DatasetStatisticsCacheClass:
         # Vocabulary filtered to remove stopwords
         self.vocab_counts_filtered_df = None
         ## General statistics and duplicates
+        self.total_words = 0
+        self.total_open_words = 0
         # Number of NaN values (NOT empty strings)
         self.text_nan_count = 0
         # Number of text items that appear more than once in the dataset
         self.dedup_total = 0
         # Duplicated text items along with their number of occurences ("count")
-        self.text_dup_counts_df = None
+        self.dup_counts_df = None
         self.avg_length = None
         self.std_length = None
         self.general_stats_dict = None
@@ -258,10 +258,12 @@ class DatasetStatisticsCacheClass:
         self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
         self.label_dset_fid = pjoin(self.cache_path, "label_dset")
         self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
-        self.general_stats_fid = pjoin(self.cache_path, "general_stats.json")
-        self.text_duplicate_counts_df_fid = pjoin(
-            self.cache_path, "text_dup_counts_df.feather"
+        self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
+        self.dup_counts_df_fid = pjoin(
+            self.cache_path, "dup_counts_df.feather"
         )
+        self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
+                                             "sorted_top_vocab.feather")
         self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
         self.fig_labels_fid = pjoin(self.cache_path, "fig_labels.json")
         self.node_list_fid = pjoin(self.cache_path, "node_list.th")
@@ -285,38 +287,47 @@ class DatasetStatisticsCacheClass:
         self.get_base_dataset()
         return self.dset[:100]
 
-    def load_or_prepare_general_stats(self, use_cache=False):
-        """Data structures used in calculating general statistics and duplicates"""
+    def load_or_prepare_general_stats(self, use_cache=False, save=True):
+        """
+        Content for expander_general_stats widget.
+        Provides statistics for total words, total open words,
+        the sorted top vocab, the NaN count, and the duplicate count.
+        Args:
+            use_cache:
 
-        # TODO: These probably don't need to be feather files, could be csv.
+        Returns:
+
+        """
         # General statistics
         if (
             use_cache
             and exists(self.general_stats_fid)
-            and exists(self.text_duplicate_counts_df_fid)
+            and exists(self.dup_counts_df_fid)
+            and exists(self.sorted_top_vocab_df_fid)
         ):
-            self.load_general_stats(
-                self.general_stats_fid, self.text_duplicate_counts_df_fid
-            )
+            print('Loading cached general stats')
+            self.load_general_stats()
         else:
-            (
-                self.text_nan_count,
-                self.dedup_total,
-                self.text_dup_counts_df,
-            ) = self.prepare_general_text_stats()
-            self.general_stats_dict = {
-                TEXT_NAN_CNT: self.text_nan_count,
-                DEDUP_TOT: self.dedup_total,
-            }
-            write_df(self.text_dup_counts_df, self.text_duplicate_counts_df_fid)
-            write_json(self.general_stats_dict, self.general_stats_fid)
+            print('Preparing general stats')
+            self.prepare_general_stats()
+            if save:
+                print(self.sorted_top_vocab_df)
+                print(self.sorted_top_vocab_df_fid)
+                write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
+                print(self.dup_counts_df)
+                print(self.dup_counts_df_fid)
+                write_df(self.dup_counts_df, self.dup_counts_df_fid)
+                print(self.general_stats_dict)
+                print(self.general_stats_fid)
+                write_json(self.general_stats_dict, self.general_stats_fid)
+
 
     def load_or_prepare_text_lengths(self, use_cache=False, save=True):
         # TODO: Everything here can be read from cache; it's in a transitory
         # state atm where just the fig is cached. Clean up.
         if use_cache and exists(self.fig_tok_length_fid):
             self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
-        if len(self.tokenized_df) == 0:
+        if self.tokenized_df is None:
             self.tokenized_df = self.do_tokenization()
         self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[TOKENIZED_FIELD].apply(len)
         self.avg_length = round(
@@ -385,56 +396,54 @@ class DatasetStatisticsCacheClass:
         logs.info("filtered vocab")
         logs.info(self.vocab_counts_filtered_df)
 
-    def load_or_prepare_npmi_terms(self, use_cache=False):
-        self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=use_cache)
-        self.npmi_stats.load_or_prepare_npmi_terms()
-
-    def load_or_prepare_zipf(self, use_cache=False, save=True):
-        # TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
-        # when only reading from cache. Either the UI should use it, or it should
-        # be removed when reading in cache
-        if use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
-            with open(self.zipf_fid, "r") as f:
-                zipf_dict = json.load(f)
-            self.z = Zipf()
-            self.z.load(zipf_dict)
-            self.zipf_fig = read_plotly(self.zipf_fig_fid)
-        elif use_cache and exists(self.zipf_fid):
-            # TODO: Read zipf data so that the vocab is there.
-            with open(self.zipf_fid, "r") as f:
-                zipf_dict = json.load(f)
-            self.z = Zipf()
-            self.z.load(zipf_dict)
-            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
-            if save:
-                write_plotly(self.zipf_fig, self.zipf_fig_fid)
-        else:
-            self.z = Zipf(self.vocab_counts_df)
-            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
-            if save:
-                write_zipf_data(self.z, self.zipf_fid)
-                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+    def load_vocab(self):
+        with open(self.vocab_counts_df_fid, "rb") as f:
+            self.vocab_counts_df = feather.read_feather(f)
+        # Handling for changes in how the index is saved.
+        self.vocab_counts_df = self._set_idx_col_names(self.vocab_counts_df)
 
-    def prepare_general_text_stats(self):
-        text_nan_count = int(self.tokenized_df.isnull().sum().sum())
-        dup_df = self.tokenized_df[self.tokenized_df.duplicated([self.our_text_field])]
-        dedup_df = pd.DataFrame(
+    def load_general_stats(self):
+        self.general_stats_dict = json.load(open(self.general_stats_fid, encoding="utf-8"))
+        with open(self.dup_counts_df_fid, "rb") as f:
+            self.dup_counts_df = feather.read_feather(f)
+        with open(self.sorted_top_vocab_df_fid, "rb") as f:
+            self.sorted_top_vocab_df = feather.read_feather(f)
+        self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
+        self.dedup_total = self.general_stats_dict[DEDUP_TOT]
+        self.total_words = self.general_stats_dict[TOT_WORDS]
+        self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
+
+    def prepare_general_stats(self):
+        if self.tokenized_df is None:
+            logs.warning("Tokenized dataset not yet loaded; doing so.")
+            self.load_or_prepare_dataset()
+        if self.vocab_counts_df is None:
+            logs.warning("Vocab not yet loaded; doing so.")
+            self.load_or_prepare_vocab()
+        self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
+            "count", ascending=False
+        ).head(_TOP_N)
+        print('basics')
+        self.total_words = len(self.vocab_counts_df)
+        self.total_open_words = len(self.vocab_counts_filtered_df)
+        self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
+        dup_df = self.tokenized_df[self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
+        print('dup df')
+        self.dup_counts_df = pd.DataFrame(
             dup_df.pivot_table(
-                columns=[self.our_text_field], aggfunc="size"
+                columns=[OUR_TEXT_FIELD], aggfunc="size"
             ).sort_values(ascending=False),
             columns=[CNT],
         )
-        dedup_df.index = dedup_df.index.map(str)
-        dedup_df[OUR_TEXT_FIELD] = dedup_df.index
-        dedup_total = sum(dedup_df[CNT])
-        return text_nan_count, dedup_total, dedup_df
-
-    def load_general_stats(self, general_stats_fid, text_duplicate_counts_df_fid):
-        general_stats = json.load(open(general_stats_fid, encoding="utf-8"))
-        self.text_nan_count = general_stats[TEXT_NAN_CNT]
-        self.dedup_total = general_stats[DEDUP_TOT]
-        with open(text_duplicate_counts_df_fid, "rb") as f:
-            self.text_dup_counts_df = feather.read_feather(f)
+        print('deddup df')
+        self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
+        self.dedup_total = sum(self.dup_counts_df[CNT])
+        self.general_stats_dict = {
+            TOT_WORDS: self.total_words,
+            TOT_OPEN_WORDS: self.total_open_words,
+            TEXT_NAN_CNT: self.text_nan_count,
+            DEDUP_TOT: self.dedup_total,
+        }
 
     def load_or_prepare_dataset(self, use_cache=True, save=True):
         """
@@ -449,20 +458,24 @@ class DatasetStatisticsCacheClass:
         Returns:
 
         """
-        self.load_or_prepare_text_dset(save, use_cache)
-        self.load_or_prepare_tokenized_df(save, use_cache)
+        logs.info("Doing text dset.")
+        self.load_or_prepare_text_dset(use_cache, save)
+        logs.info("Doing tokenized dataframe")
+        self.load_or_prepare_tokenized_df(use_cache, save)
 
-    def load_or_prepare_tokenized_df(self, save, use_cache):
+
+    def load_or_prepare_tokenized_df(self, use_cache, save):
         if (use_cache and exists(self.tokenized_df_fid)):
             self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
         else:
             # tokenize all text instances
            self.tokenized_df = self.do_tokenization()
            if save:
+                logs.warning("Saving tokenized dataset to disk")
                # save tokenized text
                write_df(self.tokenized_df, self.tokenized_df_fid)
 
-    def load_or_prepare_text_dset(self, save, use_cache):
+    def load_or_prepare_text_dset(self, use_cache, save):
         if (use_cache and exists(self.text_dset_fid)):
             # load extracted text
             self.text_dset = load_from_disk(self.text_dset_fid)
@@ -557,11 +570,35 @@ class DatasetStatisticsCacheClass:
             self.label_dset.save_to_disk(self.label_dset_fid)
             write_plotly(self.fig_labels, self.fig_labels_fid)
 
-    def load_vocab(self):
-        with open(self.vocab_counts_df_fid, "rb") as f:
-            self.vocab_counts_df = feather.read_feather(f)
-        # Handling for changes in how the index is saved.
-        self.vocab_counts_df = self._set_idx_col_names(self.vocab_counts_df)
+    def load_or_prepare_npmi_terms(self, use_cache=False):
+        self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=use_cache)
+        self.npmi_stats.load_or_prepare_npmi_terms()
+
+    def load_or_prepare_zipf(self, use_cache=False, save=True):
+        # TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
+        # when only reading from cache. Either the UI should use it, or it should
+        # be removed when reading in cache
+        if use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            self.zipf_fig = read_plotly(self.zipf_fig_fid)
+        elif use_cache and exists(self.zipf_fid):
+            # TODO: Read zipf data so that the vocab is there.
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+        else:
+            self.z = Zipf(self.vocab_counts_df)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_zipf_data(self.z, self.zipf_fid)
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)
 
     def _set_idx_col_names(self, input_vocab_df):
         if input_vocab_df.index.name != VOCAB and VOCAB in input_vocab_df.columns:
data_measurements/dataset_utils.py CHANGED
@@ -43,6 +43,8 @@ PROP = "proportion"
 TEXT_NAN_CNT = "text_nan_count"
 TXT_LEN = "text lengths"
 DEDUP_TOT = "dedup_total"
+TOT_WORDS = "total words"
+TOT_OPEN_WORDS = "total open words"
 
 _DATASET_LIST = [
     "c4",
data_measurements/streamlit_utils.py CHANGED
@@ -102,32 +102,34 @@ def expander_header(dstats, ds_name_to_dict, column_id):
         st.dataframe(dstats.get_dataset_peek())
 
 
-def expander_general_stats(dstats, top_n, column_id):
+def expander_general_stats(dstats, column_id):
     with st.expander(f"General Text Statistics{column_id}"):
         st.caption(
-            "Use this widget to check whether the terms you see most represented in the dataset make sense for the goals of the dataset."
+            "Use this widget to check whether the terms you see most represented"
+            " in the dataset make sense for the goals of the dataset."
         )
         st.markdown(
-            "There are {0} total words".format(str(len(dstats.vocab_counts_df)))
+            "There are {0} total words".format(str(dstats.total_words))
         )
         st.markdown(
             "There are {0} words after removing closed "
-            "class words".format(str(len(dstats.vocab_counts_filtered_df)))
+            "class words".format(str(dstats.total_open_words))
         )
-        sorted_top_vocab_df = dstats.vocab_counts_filtered_df.sort_values(
-            "count", ascending=False
-        ).head(top_n)
         st.markdown(
-            "The most common [open class words](https://dictionary.apa.org/open-class-words) and their counts are: "
+            "The most common "
+            "[open class words](https://dictionary.apa.org/open-class-words) "
+            "and their counts are: "
         )
-        st.dataframe(sorted_top_vocab_df)
+        st.dataframe(dstats.sorted_top_vocab_df)
         st.markdown(
             "There are {0} missing values in the dataset.".format(
                 str(dstats.text_nan_count)
             )
         )
         st.markdown(
-            "There are {0} duplicate items in the dataset. For more information about the duplicates, click the 'Duplicates' tab below.".format(
+            "There are {0} duplicate items in the dataset. "
+            "For more information about the duplicates, "
+            "click the 'Duplicates' tab below.".format(
                str(dstats.dedup_total)
            )
        )
@@ -269,7 +271,8 @@ def expander_text_embeddings(
 
 
 ### Then, show duplicates
-def expander_text_duplicates(dedup_df, column_id):
+def expander_text_duplicates(dstats, column_id):
+    # TODO: Saving/loading figure
     with st.expander(f"Text Duplicates{column_id}", expanded=False):
         st.caption(
             "Use this widget to identify text strings that appear more than once."
@@ -277,16 +280,15 @@ def expander_text_duplicates(dedup_df, column_id):
         st.markdown(
             "A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
         )
-        dedup_df["count"] = dedup_df["count"] + 1
         st.markdown("------")
         st.write(
             "### Here is the list of all the duplicated items and their counts in your dataset:"
         )
         # Eh...adding 1 because otherwise it looks too weird for duplicate counts when the value is just 1.
-        if len(dedup_df) == 0:
+        if len(dstats.dup_counts_df) == 0:
             st.write("There are no duplicates in this dataset! 🥳")
         else:
-            gb = GridOptionsBuilder.from_dataframe(dedup_df)
+            gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
             gb.configure_column(
                 f"text{column_id}",
                 wrapText=True,
@@ -296,7 +298,7 @@ def expander_text_duplicates(dedup_df, column_id):
                 use_container_width=True,
             )
             go = gb.build()
-            AgGrid(dedup_df, gridOptions=go)
+            AgGrid(dstats.dup_counts_df, gridOptions=go)
 
 
 def expander_npmi_description(min_vocab):