Michael-Geis committed on
Commit
b777cd0
1 Parent(s): c77644c

added save_as_feather and renamed load_from_file to load_from_feather

Browse files
Files changed (1) hide show
  1. data_storage.py +11 -2
data_storage.py CHANGED
@@ -13,12 +13,12 @@ class ArXivData:
13
  self.arxiv_subjects = None
14
  self._returned_metadata = None
15
 
16
- def load_from_file(self, dataset_file_name, path_to_data_dir):
17
  path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
18
  self._returned_metadata = pd.read_feather(path_to_dataset)
19
 
20
- self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
21
  self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
 
22
 
23
  def load_from_query(self, query, max_results, offset=0, raw=False):
24
  if raw:
@@ -34,6 +34,15 @@ class ArXivData:
34
  self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
35
  self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
36
 
 
 
 
 
 
 
 
 
 
37
  def get_OHE_arxiv_subjects(self, returned_metadata):
38
  mlb = MultiLabelBinarizer()
39
 
 
13
  self.arxiv_subjects = None
14
  self._returned_metadata = None
15
 
16
+ def load_from_feather(self, dataset_file_name, path_to_data_dir):
17
  path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
18
  self._returned_metadata = pd.read_feather(path_to_dataset)
19
 
 
20
  self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
21
+ self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
22
 
23
  def load_from_query(self, query, max_results, offset=0, raw=False):
24
  if raw:
 
34
  self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
35
  self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
36
 
37
+ def save_as_feather(self, dataset_file_name, path_to_data_dir):
38
+ if self._returned_metadata is None:
39
+ raise Exception(
40
+ "No data stored. Run load_from_query or load_from_feather to retrieve data."
41
+ )
42
+
43
+ path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
44
+ self._returned_metadata.to_feather(path_to_dataset)
45
+
46
  def get_OHE_arxiv_subjects(self, returned_metadata):
47
  mlb = MultiLabelBinarizer()
48