Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
b777cd0
1
Parent(s):
c77644c
added save_as_feather and load_from_feather
Browse files- data_storage.py +11 -2
data_storage.py
CHANGED
@@ -13,12 +13,12 @@ class ArXivData:
|
|
13 |
self.arxiv_subjects = None
|
14 |
self._returned_metadata = None
|
15 |
|
16 |
-
def
|
17 |
path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
|
18 |
self._returned_metadata = pd.read_feather(path_to_dataset)
|
19 |
|
20 |
-
self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
|
21 |
self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
|
|
|
22 |
|
23 |
def load_from_query(self, query, max_results, offset=0, raw=False):
|
24 |
if raw:
|
@@ -34,6 +34,15 @@ class ArXivData:
|
|
34 |
self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
|
35 |
self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
def get_OHE_arxiv_subjects(self, returned_metadata):
|
38 |
mlb = MultiLabelBinarizer()
|
39 |
|
|
|
13 |
self.arxiv_subjects = None
|
14 |
self._returned_metadata = None
|
15 |
|
16 |
+
def load_from_feather(self, dataset_file_name, path_to_data_dir):
|
17 |
path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
|
18 |
self._returned_metadata = pd.read_feather(path_to_dataset)
|
19 |
|
|
|
20 |
self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
|
21 |
+
self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
|
22 |
|
23 |
def load_from_query(self, query, max_results, offset=0, raw=False):
|
24 |
if raw:
|
|
|
34 |
self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
|
35 |
self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
|
36 |
|
37 |
+
def save_as_feather(self, dataset_file_name, path_to_data_dir):
|
38 |
+
if self._returned_metadata is None:
|
39 |
+
raise Exception(
|
40 |
+
"No data stored. Run load_from_query or load_from_feather to retrieve data."
|
41 |
+
)
|
42 |
+
|
43 |
+
path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
|
44 |
+
self._returned_metadata.to_feather(path_to_dataset)
|
45 |
+
|
46 |
def get_OHE_arxiv_subjects(self, returned_metadata):
|
47 |
mlb = MultiLabelBinarizer()
|
48 |
|