Spaces:
Build error
Build error
fixes
Browse files- src/Surveyor.py +20 -19
src/Surveyor.py
CHANGED
@@ -18,7 +18,7 @@ except:
|
|
18 |
|
19 |
from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU
|
20 |
|
21 |
-
|
22 |
|
23 |
class Surveyor:
|
24 |
'''
|
@@ -79,15 +79,16 @@ class Surveyor:
|
|
79 |
spacy.require_gpu()
|
80 |
|
81 |
self.high_gpu = high_gpu
|
|
|
82 |
if self.high_gpu:
|
83 |
-
DEFAULTS = DEFAULTS_HIGH_GPU
|
84 |
|
85 |
if not kw_model_name:
|
86 |
-
kw_model_name = DEFAULTS["kw_model_name"]
|
87 |
-
self.num_papers = DEFAULTS['num_papers']
|
88 |
-
self.max_search = DEFAULTS['max_search']
|
89 |
if not models_dir:
|
90 |
-
models_dir = DEFAULTS['models_dir']
|
91 |
|
92 |
models_found = False
|
93 |
if os.path.exists(models_dir) and not no_save_models:
|
@@ -95,17 +96,17 @@ class Surveyor:
|
|
95 |
models_found = True
|
96 |
|
97 |
if not title_model_name:
|
98 |
-
title_model_name = DEFAULTS["title_model_name"]
|
99 |
if not ex_summ_model_name:
|
100 |
-
ex_summ_model_name = DEFAULTS["ex_summ_model_name"]
|
101 |
if not ledmodel_name:
|
102 |
-
ledmodel_name = DEFAULTS["ledmodel_name"]
|
103 |
if not embedder_name:
|
104 |
-
embedder_name = DEFAULTS["embedder_name"]
|
105 |
if not nlp_name:
|
106 |
-
nlp_name = DEFAULTS["nlp_name"]
|
107 |
if not similarity_nlp_name:
|
108 |
-
similarity_nlp_name = DEFAULTS["similarity_nlp_name"]
|
109 |
|
110 |
if refresh_models or not models_found:
|
111 |
print(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
|
@@ -183,27 +184,27 @@ class Surveyor:
|
|
183 |
if pdf_dir:
|
184 |
self.pdf_dir = pdf_dir
|
185 |
else:
|
186 |
-
self.pdf_dir = DEFAULTS["pdf_dir"]
|
187 |
|
188 |
if txt_dir:
|
189 |
self.txt_dir = txt_dir
|
190 |
else:
|
191 |
-
self.txt_dir = DEFAULTS["txt_dir"]
|
192 |
|
193 |
if img_dir:
|
194 |
self.img_dir = img_dir
|
195 |
else:
|
196 |
-
self.img_dir = DEFAULTS["img_dir"]
|
197 |
|
198 |
if tab_dir:
|
199 |
self.tab_dir = tab_dir
|
200 |
else:
|
201 |
-
self.tab_dir = DEFAULTS["tab_dir"]
|
202 |
|
203 |
if dump_dir:
|
204 |
self.dump_dir = dump_dir
|
205 |
else:
|
206 |
-
self.dump_dir = DEFAULTS["dump_dir"]
|
207 |
|
208 |
dirs = [self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir]
|
209 |
if sum([True for dir in dirs if 'arxiv_data/' in dir]):
|
@@ -1337,9 +1338,9 @@ class Surveyor:
|
|
1337 |
import joblib
|
1338 |
import os, shutil
|
1339 |
if not max_search:
|
1340 |
-
max_search = DEFAULTS['max_search']
|
1341 |
if not num_papers:
|
1342 |
-
num_papers = DEFAULTS['num_papers']
|
1343 |
# arxiv api relevance search and data preparation
|
1344 |
print("\nsearching arXiv for top 100 papers.. ")
|
1345 |
results, searched_papers = self.search(query, max_search=max_search)
|
|
|
18 |
|
19 |
from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU
|
20 |
|
21 |
+
|
22 |
|
23 |
class Surveyor:
|
24 |
'''
|
|
|
79 |
spacy.require_gpu()
|
80 |
|
81 |
self.high_gpu = high_gpu
|
82 |
+
self.DEFAULTS = DEFAULTS_CPU_COMPAT
|
83 |
if self.high_gpu:
|
84 |
+
self.DEFAULTS = DEFAULTS_HIGH_GPU
|
85 |
|
86 |
if not kw_model_name:
|
87 |
+
kw_model_name = self.DEFAULTS["kw_model_name"]
|
88 |
+
self.num_papers = self.DEFAULTS['num_papers']
|
89 |
+
self.max_search = self.DEFAULTS['max_search']
|
90 |
if not models_dir:
|
91 |
+
models_dir = self.DEFAULTS['models_dir']
|
92 |
|
93 |
models_found = False
|
94 |
if os.path.exists(models_dir) and not no_save_models:
|
|
|
96 |
models_found = True
|
97 |
|
98 |
if not title_model_name:
|
99 |
+
title_model_name = self.DEFAULTS["title_model_name"]
|
100 |
if not ex_summ_model_name:
|
101 |
+
ex_summ_model_name = self.DEFAULTS["ex_summ_model_name"]
|
102 |
if not ledmodel_name:
|
103 |
+
ledmodel_name = self.DEFAULTS["ledmodel_name"]
|
104 |
if not embedder_name:
|
105 |
+
embedder_name = self.DEFAULTS["embedder_name"]
|
106 |
if not nlp_name:
|
107 |
+
nlp_name = self.DEFAULTS["nlp_name"]
|
108 |
if not similarity_nlp_name:
|
109 |
+
similarity_nlp_name = self.DEFAULTS["similarity_nlp_name"]
|
110 |
|
111 |
if refresh_models or not models_found:
|
112 |
print(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
|
|
|
184 |
if pdf_dir:
|
185 |
self.pdf_dir = pdf_dir
|
186 |
else:
|
187 |
+
self.pdf_dir = self.DEFAULTS["pdf_dir"]
|
188 |
|
189 |
if txt_dir:
|
190 |
self.txt_dir = txt_dir
|
191 |
else:
|
192 |
+
self.txt_dir = self.DEFAULTS["txt_dir"]
|
193 |
|
194 |
if img_dir:
|
195 |
self.img_dir = img_dir
|
196 |
else:
|
197 |
+
self.img_dir = self.DEFAULTS["img_dir"]
|
198 |
|
199 |
if tab_dir:
|
200 |
self.tab_dir = tab_dir
|
201 |
else:
|
202 |
+
self.tab_dir = self.DEFAULTS["tab_dir"]
|
203 |
|
204 |
if dump_dir:
|
205 |
self.dump_dir = dump_dir
|
206 |
else:
|
207 |
+
self.dump_dir = self.DEFAULTS["dump_dir"]
|
208 |
|
209 |
dirs = [self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir]
|
210 |
if sum([True for dir in dirs if 'arxiv_data/' in dir]):
|
|
|
1338 |
import joblib
|
1339 |
import os, shutil
|
1340 |
if not max_search:
|
1341 |
+
max_search = self.DEFAULTS['max_search']
|
1342 |
if not num_papers:
|
1343 |
+
num_papers = self.DEFAULTS['num_papers']
|
1344 |
# arxiv api relevance search and data preparation
|
1345 |
print("\nsearching arXiv for top 100 papers.. ")
|
1346 |
results, searched_papers = self.search(query, max_search=max_search)
|