meg-huggingface committed on
Commit
66693d5
·
1 Parent(s): e1f2cc3

Removing need to keep around base dset for the header widget; now just saving what is shown -- the first n lines of the base dataset -- as a json, and loading if it's cached.

Browse files
data_measurements/dataset_statistics.py CHANGED
@@ -185,6 +185,7 @@ class DatasetStatisticsCacheClass:
185
  self.dset = None # original dataset
186
  # HF dataset with all of the self.text_field instances in self.dset
187
  self.text_dset = None
 
188
  # HF dataset with text embeddings in the same order as self.text_dset
189
  self.embeddings_dset = None
190
  # HF dataset with all of the self.label_field instances in self.dset
@@ -254,6 +255,7 @@ class DatasetStatisticsCacheClass:
254
  logs.warning("Creating cache directory %s." % self.cache_path)
255
  mkdir(self.cache_path)
256
  self.dset_fid = pjoin(self.cache_path, "base_dset")
 
257
  self.text_dset_fid = pjoin(self.cache_path, "text_dset")
258
  self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
259
  self.label_dset_fid = pjoin(self.cache_path, "label_dset")
@@ -283,10 +285,6 @@ class DatasetStatisticsCacheClass:
283
  use_streaming=True,
284
  )
285
 
286
- def get_dataset_peek(self):
287
- self.get_base_dataset()
288
- return self.dset[:100]
289
-
290
  def load_or_prepare_general_stats(self, use_cache=False, save=True):
291
  """
292
  Content for expander_general_stats widget.
@@ -462,7 +460,19 @@ class DatasetStatisticsCacheClass:
462
  self.load_or_prepare_text_dset(use_cache, save)
463
  logs.info("Doing tokenized dataframe")
464
  self.load_or_prepare_tokenized_df(use_cache, save)
 
 
465
 
 
 
 
 
 
 
 
 
 
 
466
 
467
  def load_or_prepare_tokenized_df(self, use_cache, save):
468
  if (use_cache and exists(self.tokenized_df_fid)):
@@ -483,20 +493,23 @@ class DatasetStatisticsCacheClass:
483
  logs.info(self.text_dset)
484
  # ...Or load it from the server and store it anew
485
  else:
486
- self.get_base_dataset()
487
- # extract all text instances
488
- self.text_dset = self.dset.map(
489
- lambda examples: extract_field(
490
- examples, self.text_field, OUR_TEXT_FIELD
491
- ),
492
- batched=True,
493
- remove_columns=list(self.dset.features),
494
- )
495
  if save:
496
  # save extracted text instances
497
  logs.warning("Saving dataset to disk")
498
  self.text_dset.save_to_disk(self.text_dset_fid)
499
 
 
 
 
 
 
 
 
 
 
 
 
500
  def do_tokenization(self):
501
  """
502
  Tokenizes the dataset
 
185
  self.dset = None # original dataset
186
  # HF dataset with all of the self.text_field instances in self.dset
187
  self.text_dset = None
188
+ self.dset_peek = None
189
  # HF dataset with text embeddings in the same order as self.text_dset
190
  self.embeddings_dset = None
191
  # HF dataset with all of the self.label_field instances in self.dset
 
255
  logs.warning("Creating cache directory %s." % self.cache_path)
256
  mkdir(self.cache_path)
257
  self.dset_fid = pjoin(self.cache_path, "base_dset")
258
+ self.dset_peek_fid = pjoin(self.cache_path, "dset_peek.json")
259
  self.text_dset_fid = pjoin(self.cache_path, "text_dset")
260
  self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
261
  self.label_dset_fid = pjoin(self.cache_path, "label_dset")
 
285
  use_streaming=True,
286
  )
287
 
 
 
 
 
288
  def load_or_prepare_general_stats(self, use_cache=False, save=True):
289
  """
290
  Content for expander_general_stats widget.
 
460
  self.load_or_prepare_text_dset(use_cache, save)
461
  logs.info("Doing tokenized dataframe")
462
  self.load_or_prepare_tokenized_df(use_cache, save)
463
+ logs.info("Doing dataset peek")
464
+ self.load_or_prepare_dset_peek(save, use_cache)
465
 
466
def load_or_prepare_dset_peek(self, save, use_cache):
    """
    Load or compute the preview of the base dataset shown in the header widget.

    The preview is the slice ``self.dset[:100]``. It is stored in
    ``self.dset_peek`` and cached as JSON at ``self.dset_peek_fid`` under
    the "dset_peek" key.

    Note that the parameter order here is (save, use_cache), the reverse
    of the sibling ``load_or_prepare_*`` methods.

    Args:
        save: If true, write a freshly computed preview to the cache file.
        use_cache: If true and the cache file exists, read the preview
            from it instead of touching the base dataset.
    """
    if use_cache and exists(self.dset_peek_fid):
        with open(self.dset_peek_fid, "r") as f:
            # Must be the same key that write_json stores below. The
            # previous "dset peek" (with a space) raised KeyError on
            # every cache hit.
            self.dset_peek = json.load(f)["dset_peek"]
    else:
        # Only fetch the base dataset when it has not been loaded yet.
        if self.dset is None:
            self.get_base_dataset()
        self.dset_peek = self.dset[:100]
        if save:
            write_json({"dset_peek": self.dset_peek}, self.dset_peek_fid)
476
 
477
  def load_or_prepare_tokenized_df(self, use_cache, save):
478
  if (use_cache and exists(self.tokenized_df_fid)):
 
493
  logs.info(self.text_dset)
494
  # ...Or load it from the server and store it anew
495
  else:
496
+ self.prepare_text_dset()
 
 
 
 
 
 
 
 
497
  if save:
498
  # save extracted text instances
499
  logs.warning("Saving dataset to disk")
500
  self.text_dset.save_to_disk(self.text_dset_fid)
501
 
502
def prepare_text_dset(self):
    """
    Build ``self.text_dset`` from the base dataset.

    Calls ``self.get_base_dataset()`` and then maps ``extract_field`` over
    ``self.dset`` in batched mode, passing ``self.text_field`` and
    ``OUR_TEXT_FIELD``. Every column listed in ``self.dset.features`` is
    removed from the mapped result.
    """
    self.get_base_dataset()
    source_columns = list(self.dset.features)

    def _pull_text(examples):
        # Extract all text instances from one batch of examples.
        return extract_field(examples, self.text_field, OUR_TEXT_FIELD)

    self.text_dset = self.dset.map(
        _pull_text,
        batched=True,
        remove_columns=source_columns,
    )
512
+
513
  def do_tokenization(self):
514
  """
515
  Tokenizes the dataset
data_measurements/streamlit_utils.py CHANGED
@@ -99,7 +99,7 @@ def expander_header(dstats, ds_name_to_dict, column_id):
99
  st.markdown(
100
  ds_name_to_dict[dstats.dset_name][dstats.dset_config][HF_DESC_FIELD]
101
  )
102
- st.dataframe(dstats.get_dataset_peek())
103
 
104
 
105
  def expander_general_stats(dstats, column_id):
 
99
  st.markdown(
100
  ds_name_to_dict[dstats.dset_name][dstats.dset_config][HF_DESC_FIELD]
101
  )
102
+ st.dataframe(dstats.dset_peek)
103
 
104
 
105
  def expander_general_stats(dstats, column_id):