Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
3d2ca49
1
Parent(s):
b2af341
reorganized and created a pipeline for CBF
Browse files- .gitignore +2 -1
- config/main.yaml +13 -0
- requirements.txt → config/requirements.txt +2 -1
- prediction-flow-sample.ipynb +207 -5
- __init__.py → src/__init__.py +0 -0
- cleaning.py → src/cleaning.py +12 -0
- embedding.py → src/embedding.py +18 -1
- src/model.py +33 -0
- src/search.py +30 -0
- storage.py → src/storage.py +15 -7
.gitignore
CHANGED
@@ -7,4 +7,5 @@ env
|
|
7 |
settings.json
|
8 |
collection.ipynb
|
9 |
testing.ipynb
|
10 |
-
testnb.ipynb
|
|
|
|
7 |
settings.json
|
8 |
collection.ipynb
|
9 |
testing.ipynb
|
10 |
+
testnb.ipynb
|
11 |
+
output
|
config/main.yaml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
input_id:
|
4 |
+
id: 1602.00730
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
paths:
|
11 |
+
path_to_library: ./data/libraries/APSP_50_allenai-specter
|
12 |
+
path_to_save_recs: ./output/first_recs.feather
|
13 |
+
|
requirements.txt → config/requirements.txt
RENAMED
@@ -4,4 +4,5 @@ numpy
|
|
4 |
arxiv
|
5 |
sentence_transformers
|
6 |
regex
|
7 |
-
sklearn
|
|
|
|
4 |
arxiv
|
5 |
sentence_transformers
|
6 |
regex
|
7 |
+
sklearn
|
8 |
+
hydra
|
prediction-flow-sample.ipynb
CHANGED
@@ -37,11 +37,11 @@
|
|
37 |
}
|
38 |
],
|
39 |
"source": [
|
40 |
-
"import embedding\n",
|
41 |
-
"import storage\n",
|
42 |
-
"from storage import ArXivData\n",
|
43 |
-
"from cleaning import TextCleaner\n",
|
44 |
-
"from embedding import Embedder\n",
|
45 |
"from sentence_transformers import util\n",
|
46 |
"\n",
|
47 |
"importlib.reload(embedding)\n",
|
@@ -301,6 +301,208 @@
|
|
301 |
"\n",
|
302 |
"prepped_library._returned_metadata.iloc[indices]"
|
303 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
304 |
}
|
305 |
],
|
306 |
"metadata": {
|
|
|
37 |
}
|
38 |
],
|
39 |
"source": [
|
40 |
+
"import src.embedding as embedding\n",
|
41 |
+
"import src.storage as storage\n",
|
42 |
+
"from src.storage import ArXivData\n",
|
43 |
+
"from src.cleaning import TextCleaner\n",
|
44 |
+
"from src.embedding import Embedder\n",
|
45 |
"from sentence_transformers import util\n",
|
46 |
"\n",
|
47 |
"importlib.reload(embedding)\n",
|
|
|
301 |
"\n",
|
302 |
"prepped_library._returned_metadata.iloc[indices]"
|
303 |
]
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"cell_type": "code",
|
307 |
+
"execution_count": 20,
|
308 |
+
"metadata": {},
|
309 |
+
"outputs": [],
|
310 |
+
"source": [
|
311 |
+
"id_list = [\"1602.00730\"]"
|
312 |
+
]
|
313 |
+
},
|
314 |
+
{
|
315 |
+
"cell_type": "code",
|
316 |
+
"execution_count": 46,
|
317 |
+
"metadata": {},
|
318 |
+
"outputs": [
|
319 |
+
{
|
320 |
+
"data": {
|
321 |
+
"text/plain": [
|
322 |
+
"<module 'search' from 'c:\\\\Users\\\\Leems\\\\Desktop\\\\Coding\\\\Projects\\\\Fritz\\\\search.py'>"
|
323 |
+
]
|
324 |
+
},
|
325 |
+
"execution_count": 46,
|
326 |
+
"metadata": {},
|
327 |
+
"output_type": "execute_result"
|
328 |
+
}
|
329 |
+
],
|
330 |
+
"source": [
|
331 |
+
"import src.embedding as embedding\n",
|
332 |
+
"import src.search as search\n",
|
333 |
+
"import importlib\n",
|
334 |
+
"from src.storage import Fetch\n",
|
335 |
+
"from src.cleaning import TextCleaner\n",
|
336 |
+
"from src.embedding import Embedder\n",
|
337 |
+
"from src.search import Search\n",
|
338 |
+
"\n",
|
339 |
+
"importlib.reload(embedding)\n",
|
340 |
+
"importlib.reload(search)"
|
341 |
+
]
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"cell_type": "code",
|
345 |
+
"execution_count": 47,
|
346 |
+
"metadata": {},
|
347 |
+
"outputs": [],
|
348 |
+
"source": [
|
349 |
+
"## Fetch metadata of input\n",
|
350 |
+
"getter = Fetch()\n",
|
351 |
+
"into_cleaner = getter.transform(X=id_list)"
|
352 |
+
]
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"cell_type": "code",
|
356 |
+
"execution_count": 48,
|
357 |
+
"metadata": {},
|
358 |
+
"outputs": [],
|
359 |
+
"source": [
|
360 |
+
"cleaner = TextCleaner()\n",
|
361 |
+
"\n",
|
362 |
+
"into_embedder = cleaner.transform(into_cleaner)"
|
363 |
+
]
|
364 |
+
},
|
365 |
+
{
|
366 |
+
"cell_type": "code",
|
367 |
+
"execution_count": 49,
|
368 |
+
"metadata": {},
|
369 |
+
"outputs": [],
|
370 |
+
"source": [
|
371 |
+
"embedder = Embedder(model_name=\"allenai-specter\")\n",
|
372 |
+
"into_search = embedder.transform(into_embedder)"
|
373 |
+
]
|
374 |
+
},
|
375 |
+
{
|
376 |
+
"cell_type": "code",
|
377 |
+
"execution_count": 52,
|
378 |
+
"metadata": {},
|
379 |
+
"outputs": [
|
380 |
+
{
|
381 |
+
"data": {
|
382 |
+
"text/plain": [
|
383 |
+
"['1602.00730v1', '1411.0658v3', '1905.05136v3', '1103.1276v4', '2003.04597v2']"
|
384 |
+
]
|
385 |
+
},
|
386 |
+
"execution_count": 52,
|
387 |
+
"metadata": {},
|
388 |
+
"output_type": "execute_result"
|
389 |
+
}
|
390 |
+
],
|
391 |
+
"source": [
|
392 |
+
"search = Search(path_to_library=\"./data/libraries/APSP_50_allenai-specter/\")\n",
|
393 |
+
"\n",
|
394 |
+
"search.transform(X=into_search).id.to_list()"
|
395 |
+
]
|
396 |
+
},
|
397 |
+
{
|
398 |
+
"cell_type": "code",
|
399 |
+
"execution_count": 54,
|
400 |
+
"metadata": {},
|
401 |
+
"outputs": [
|
402 |
+
{
|
403 |
+
"data": {
|
404 |
+
"text/html": [
|
405 |
+
"<div>\n",
|
406 |
+
"<style scoped>\n",
|
407 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
408 |
+
" vertical-align: middle;\n",
|
409 |
+
" }\n",
|
410 |
+
"\n",
|
411 |
+
" .dataframe tbody tr th {\n",
|
412 |
+
" vertical-align: top;\n",
|
413 |
+
" }\n",
|
414 |
+
"\n",
|
415 |
+
" .dataframe thead th {\n",
|
416 |
+
" text-align: right;\n",
|
417 |
+
" }\n",
|
418 |
+
"</style>\n",
|
419 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
420 |
+
" <thead>\n",
|
421 |
+
" <tr style=\"text-align: right;\">\n",
|
422 |
+
" <th></th>\n",
|
423 |
+
" <th>title</th>\n",
|
424 |
+
" <th>abstract</th>\n",
|
425 |
+
" <th>id</th>\n",
|
426 |
+
" <th>arxiv_subjects</th>\n",
|
427 |
+
" <th>msc_tags</th>\n",
|
428 |
+
" </tr>\n",
|
429 |
+
" </thead>\n",
|
430 |
+
" <tbody>\n",
|
431 |
+
" <tr>\n",
|
432 |
+
" <th>38787</th>\n",
|
433 |
+
" <td>C-infinity Scaling Asymptotics for the Spectra...</td>\n",
|
434 |
+
" <td>This article concerns new off-diagonal estimat...</td>\n",
|
435 |
+
" <td>1602.00730v1</td>\n",
|
436 |
+
" <td>[math.AP, math-ph, math.DG, math.FA, math.MP, ...</td>\n",
|
437 |
+
" <td>None</td>\n",
|
438 |
+
" </tr>\n",
|
439 |
+
" <tr>\n",
|
440 |
+
" <th>39127</th>\n",
|
441 |
+
" <td>Scaling Limit for the Kernel of the Spectral P...</td>\n",
|
442 |
+
" <td>Let (M, g) be a compact smooth Riemannian mani...</td>\n",
|
443 |
+
" <td>1411.0658v3</td>\n",
|
444 |
+
" <td>[math.SP, math.AP, math.DG]</td>\n",
|
445 |
+
" <td>None</td>\n",
|
446 |
+
" </tr>\n",
|
447 |
+
" <tr>\n",
|
448 |
+
" <th>9786</th>\n",
|
449 |
+
" <td>A logarithmic improvement in the two-point Wey...</td>\n",
|
450 |
+
" <td>In this paper, we study the two-point Weyl Law...</td>\n",
|
451 |
+
" <td>1905.05136v3</td>\n",
|
452 |
+
" <td>[math.AP, math.SP]</td>\n",
|
453 |
+
" <td>[35P20]</td>\n",
|
454 |
+
" </tr>\n",
|
455 |
+
" <tr>\n",
|
456 |
+
" <th>49609</th>\n",
|
457 |
+
" <td>The blowup along the diagonal of the spectral ...</td>\n",
|
458 |
+
" <td>We formulate a precise conjecture about the un...</td>\n",
|
459 |
+
" <td>1103.1276v4</td>\n",
|
460 |
+
" <td>[math.DG, math-ph, math.AP, math.MP]</td>\n",
|
461 |
+
" <td>[58J50, 35J15, 33C45, 32C05]</td>\n",
|
462 |
+
" </tr>\n",
|
463 |
+
" <tr>\n",
|
464 |
+
" <th>14857</th>\n",
|
465 |
+
" <td>Growth of high $L^p$ norms for eigenfunctions:...</td>\n",
|
466 |
+
" <td>This work concerns $L^p$ norms of high energy ...</td>\n",
|
467 |
+
" <td>2003.04597v2</td>\n",
|
468 |
+
" <td>[math.AP, math.SP]</td>\n",
|
469 |
+
" <td>None</td>\n",
|
470 |
+
" </tr>\n",
|
471 |
+
" </tbody>\n",
|
472 |
+
"</table>\n",
|
473 |
+
"</div>"
|
474 |
+
],
|
475 |
+
"text/plain": [
|
476 |
+
" title ... msc_tags\n",
|
477 |
+
"38787 C-infinity Scaling Asymptotics for the Spectra... ... None\n",
|
478 |
+
"39127 Scaling Limit for the Kernel of the Spectral P... ... None\n",
|
479 |
+
"9786 A logarithmic improvement in the two-point Wey... ... [35P20]\n",
|
480 |
+
"49609 The blowup along the diagonal of the spectral ... ... [58J50, 35J15, 33C45, 32C05]\n",
|
481 |
+
"14857 Growth of high $L^p$ norms for eigenfunctions:... ... None\n",
|
482 |
+
"\n",
|
483 |
+
"[5 rows x 5 columns]"
|
484 |
+
]
|
485 |
+
},
|
486 |
+
"execution_count": 54,
|
487 |
+
"metadata": {},
|
488 |
+
"output_type": "execute_result"
|
489 |
+
}
|
490 |
+
],
|
491 |
+
"source": [
|
492 |
+
"from sklearn.pipeline import Pipeline\n",
|
493 |
+
"\n",
|
494 |
+
"pipe = Pipeline(\n",
|
495 |
+
" [\n",
|
496 |
+
" (\"fetch\", Fetch()),\n",
|
497 |
+
" (\"clean\", TextCleaner()),\n",
|
498 |
+
" (\"embed\", Embedder(model_name=\"allenai-specter\")),\n",
|
499 |
+
" (\"search\", Search(path_to_library=\"./data/libraries/APSP_50_allenai-specter/\")),\n",
|
500 |
+
" ]\n",
|
501 |
+
")\n",
|
502 |
+
"\n",
|
503 |
+
"\n",
|
504 |
+
"pipe.transform(X=id_list)"
|
505 |
+
]
|
506 |
}
|
507 |
],
|
508 |
"metadata": {
|
__init__.py → src/__init__.py
RENAMED
File without changes
|
cleaning.py → src/cleaning.py
RENAMED
@@ -9,6 +9,18 @@ from sklearn.base import BaseEstimator, TransformerMixin
|
|
9 |
|
10 |
|
11 |
class TextCleaner(BaseEstimator, TransformerMixin):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
"""Return ArXivData class object with its metadata attribute modified so that
|
13 |
1. The 'title' and 'abstract' columns have been scrubbed of latex and accented characters
|
14 |
2. The msc tag list has been translated to english.
|
|
|
9 |
|
10 |
|
11 |
class TextCleaner(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a metadata frame into one string per row.

    Each output string is the cleansed title, a single space, and the
    cleansed abstract of the corresponding row. The cleaning itself is
    delegated to ``cleanse``, which is defined elsewhere in this module.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list of "<title> <abstract>" strings built from ``X``.

        ``X`` is accessed through its ``title`` and ``abstract`` attributes,
        each of which must support ``.apply``.
        """
        clean_titles = X.title.apply(cleanse)
        clean_abstracts = X.abstract.apply(cleanse)

        combined = clean_titles + " " + clean_abstracts
        return combined.to_list()
|
21 |
+
|
22 |
+
|
23 |
+
class FullTextCleaner(BaseEstimator, TransformerMixin):
|
24 |
"""Return ArXivData class object with its metadata attribute modified so that
|
25 |
1. The 'title' and 'abstract' columns have been scrubbed of latex and accented characters
|
26 |
2. The msc tag list has been translated to english.
|
embedding.py → src/embedding.py
RENAMED
@@ -1,4 +1,4 @@
|
|
1 |
-
import cleaning as clean
|
2 |
from sentence_transformers import SentenceTransformer, util
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
@@ -8,6 +8,23 @@ import os
|
|
8 |
|
9 |
|
10 |
class Embedder(BaseEstimator, TransformerMixin):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"""A class to handle creating sentence transformer embeddings from a clean arxiv dataset."""
|
12 |
|
13 |
def fit(self, X, y=None):
|
|
|
1 |
+
import src.cleaning as clean
|
2 |
from sentence_transformers import SentenceTransformer, util
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
|
|
8 |
|
9 |
|
10 |
class Embedder(BaseEstimator, TransformerMixin):
    """Pipeline step that encodes clean strings with a sentence-transformers model.

    Args:
        model_name: Name handed to ``SentenceTransformer`` to select the model.
    """

    def __init__(self, model_name) -> None:
        super().__init__()
        self.model_name = model_name

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return the embeddings of the strings in ``X``.

        The model is instantiated from ``self.model_name`` on every call.
        """
        return SentenceTransformer(self.model_name).encode(sentences=X)
|
25 |
+
|
26 |
+
|
27 |
+
class FullEmbedder(BaseEstimator, TransformerMixin):
|
28 |
"""A class to handle creating sentence transformer embeddings from a clean arxiv dataset."""
|
29 |
|
30 |
def fit(self, X, y=None):
|
src/model.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.pipeline import Pipeline
|
3 |
+
from src.storage import Fetch
|
4 |
+
from src.cleaning import TextCleaner
|
5 |
+
from src.embedding import Embedder
|
6 |
+
from src.search import Search
|
7 |
+
|
8 |
+
|
9 |
+
def main(
    id_list=None,
    path_to_library="./data/libraries/APSP_50_allenai-specter",
    save_recs=False,
    path_to_save_recs="./output/first_recs.feather",
):
    """Run the fetch -> clean -> embed -> search pipeline for the given arXiv ids.

    The defaults mirror the values in ``config/main.yaml`` so that calling
    ``main()`` with no arguments runs the sample recommendation.

    Args:
        id_list: arXiv id string or list of id strings to get recommendations
            for. ``None`` falls back to the sample id ``"1602.00730"``.
        path_to_library: Directory of the embedded library handed to ``Search``.
        save_recs: When true, write the recommendations to ``path_to_save_recs``.
        path_to_save_recs: Feather file the recommendations are written to.

    Returns:
        The object returned by the pipeline's final ``Search`` step.

    Raises:
        ValueError: If ``id_list`` is empty.
    """
    if id_list is None:
        id_list = ["1602.00730"]
    elif isinstance(id_list, str):
        # A bare id is accepted for convenience; the pipeline expects a list.
        id_list = [id_list]

    if not id_list:
        raise ValueError("id_list must contain at least one arXiv id")

    ## Create pipeline

    model = Pipeline(
        [
            ("fetch", Fetch()),
            ("clean", TextCleaner()),
            ("embed", Embedder(model_name="allenai-specter")),
            ("search", Search(path_to_library=path_to_library)),
        ]
    )

    recommendation_df = model.transform(id_list)

    if save_recs:
        recommendation_df.to_feather(path_to_save_recs)

    return recommendation_df


# The module name is "__main__" (with underscores) when run as a script.
if __name__ == "__main__":
    main()
|
src/search.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import util
|
2 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
3 |
+
import os
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
|
7 |
+
class Search(BaseEstimator, TransformerMixin):
    """Pipeline step that returns the library entries closest to a query embedding.

    The library is a directory containing ``metadata.feather`` and
    ``embeddings.feather``; both files are read on every ``transform`` call.

    Args:
        path_to_library: Directory holding the two feather files.
        top_k: Number of nearest library entries to return. Defaults to 5.
    """

    def __init__(self, path_to_library, top_k=5) -> None:
        super().__init__()

        self.path_to_library = path_to_library
        self.top_k = top_k

    def fit(self, X=None, y=None):
        """Nothing to learn; return the transformer unchanged.

        ``X`` and ``y`` are accepted (and ignored) so that ``fit_transform``
        and ``Pipeline.fit`` can call this method with data arguments.
        """
        return self

    def transform(self, X, y=None):
        """Return the metadata rows of the ``top_k`` best matches for ``X``.

        Only the hits for the first query embedding in ``X`` are used.
        NOTE(review): assumes row i of ``embeddings.feather`` describes row i
        of ``metadata.feather`` -- confirm against the code that writes them.
        """
        library_metadata = pd.read_feather(
            os.path.join(self.path_to_library, "metadata.feather")
        )
        library_embeddings = pd.read_feather(
            os.path.join(self.path_to_library, "embeddings.feather")
        ).values

        matches = util.semantic_search(
            query_embeddings=X,
            corpus_embeddings=library_embeddings,
            top_k=self.top_k,
        )

        # matches holds one hit list per query; each hit carries a "corpus_id".
        recommended_indices = [hit["corpus_id"] for hit in matches[0]]

        return library_metadata.iloc[recommended_indices]
|
storage.py → src/storage.py
RENAMED
@@ -1,8 +1,16 @@
|
|
1 |
import arxiv
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
-
import cleaning as clean
|
5 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
class ArXivData:
|
@@ -62,7 +70,7 @@ class ArXivData:
|
|
62 |
self.metadata.to_feather(path_to_dataset)
|
63 |
|
64 |
|
65 |
-
def query_to_df(query=None, id_list=None, max_results=
|
66 |
"""Returns the results of an arxiv API query in a pandas dataframe.
|
67 |
|
68 |
Args:
|
@@ -84,6 +92,7 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
|
|
84 |
client = arxiv.Client(page_size=2000, num_retries=10)
|
85 |
|
86 |
if id_list:
|
|
|
87 |
search = arxiv.Search(
|
88 |
id_list=id_list,
|
89 |
max_results=max_results,
|
@@ -102,8 +111,7 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
|
|
102 |
sort_by=arxiv.SortCriterion.LastUpdatedDate,
|
103 |
)
|
104 |
|
105 |
-
columns = ["title", "
|
106 |
-
index = range(offset, max_results)
|
107 |
|
108 |
results = client.results(search, offset=offset)
|
109 |
|
@@ -111,14 +119,14 @@ def query_to_df(query=None, id_list=None, max_results=None, offset=0):
|
|
111 |
(
|
112 |
result.title,
|
113 |
result.summary,
|
|
|
114 |
result.categories,
|
115 |
result.entry_id.split("/")[-1],
|
116 |
)
|
117 |
for result in results
|
118 |
)
|
119 |
|
120 |
-
returned_metadata = pd.DataFrame(metadata_generator, columns=columns
|
121 |
-
returned_metadata = returned_metadata.rename(columns={"summary": "abstract"})
|
122 |
return returned_metadata
|
123 |
|
124 |
|
|
|
1 |
import arxiv
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
+
import src.cleaning as clean
|
5 |
+
from sklearn.base import TransformerMixin, BaseEstimator
|
6 |
+
|
7 |
+
|
8 |
+
class Fetch(BaseEstimator, TransformerMixin):
    """Pipeline step that looks up arXiv metadata for a list of ids."""

    def fit(self, X=None, y=None):
        """Nothing to learn; return the transformer unchanged.

        ``X`` and ``y`` are accepted (and ignored) so that ``fit_transform``
        and ``Pipeline.fit`` can call this method with data arguments.
        """
        return self

    def transform(self, X, y=None):
        """Return the result of ``query_to_df`` for the id list ``X``."""
        return query_to_df(id_list=X)
|
14 |
|
15 |
|
16 |
class ArXivData:
|
|
|
70 |
self.metadata.to_feather(path_to_dataset)
|
71 |
|
72 |
|
73 |
+
def query_to_df(query=None, id_list=None, max_results=10, offset=0):
|
74 |
"""Returns the results of an arxiv API query in a pandas dataframe.
|
75 |
|
76 |
Args:
|
|
|
92 |
client = arxiv.Client(page_size=2000, num_retries=10)
|
93 |
|
94 |
if id_list:
|
95 |
+
max_results = len(id_list)
|
96 |
search = arxiv.Search(
|
97 |
id_list=id_list,
|
98 |
max_results=max_results,
|
|
|
111 |
sort_by=arxiv.SortCriterion.LastUpdatedDate,
|
112 |
)
|
113 |
|
114 |
+
columns = ["title", "abstract", "authors", "categories", "id"]
|
|
|
115 |
|
116 |
results = client.results(search, offset=offset)
|
117 |
|
|
|
119 |
(
|
120 |
result.title,
|
121 |
result.summary,
|
122 |
+
[author.name for author in result.authors],
|
123 |
result.categories,
|
124 |
result.entry_id.split("/")[-1],
|
125 |
)
|
126 |
for result in results
|
127 |
)
|
128 |
|
129 |
+
returned_metadata = pd.DataFrame(metadata_generator, columns=columns)
|
|
|
130 |
return returned_metadata
|
131 |
|
132 |
|