Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
aad19c5
1
Parent(s):
62ba9f3
added arxiv query functions
Browse files
util.py
CHANGED
@@ -2,6 +2,7 @@ import os
|
|
2 |
import glob
|
3 |
import pandas as pd
|
4 |
import regex
|
|
|
5 |
|
6 |
def category_map():
|
7 |
"""Maps arXiv subject categories to their full english names.
|
@@ -222,3 +223,61 @@ def find_hyph(text):
|
|
222 |
return None
|
223 |
else:
|
224 |
return list(set(keywords))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import glob
|
3 |
import pandas as pd
|
4 |
import regex
|
5 |
+
import arxiv
|
6 |
|
7 |
def category_map():
|
8 |
"""Maps arXiv subject categories to their full english names.
|
|
|
223 |
return None
|
224 |
else:
|
225 |
return list(set(keywords))
|
226 |
+
|
227 |
+
def format_query(author='', title='', cat='', abstract=''):
    """Return an arXiv query string that ANDs together the specified fields.

    Handles simple queries with at most one instance of each field. To leave
    a field unspecified, leave the corresponding argument blank.

    e.g. format_query(cat='math.AP') returns 'cat:math.AP', the string used
    to pull all articles with the subject tag 'PDEs'.

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arxiv subject tag. See the full list of these at:
            https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        Query string of the form 'au:... AND ti:... AND cat:... AND abs:...'
        containing only the fields that were given a non-empty value, in that
        order. An empty string is returned when no field is specified.
    """
    fields = (('au', author), ('ti', title), ('cat', cat), ('abs', abstract))
    # Decide on the value itself rather than on the formatted tag: testing
    # whether the tag ends with ':' would also discard legitimate values that
    # happen to end with a colon (e.g. title='Intro:').
    return ' AND '.join(
        f'{prefix}:{value}' for prefix, value in fields if value
    )
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
def query_to_df(query, max_results):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Args:
        query: string defining an arxiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        max_results: positive integer specifying the maximum number of results returned.

    Returns:
        pandas dataframe with one column for each individual piece of metadata of a returned result,
        one row per result, sorted by last-updated date as requested from the API.
        To see a list of these columns and their descriptions, see the documentation for the Results class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The 'links' and '_raw' columns are dropped and the authors column is a list of each author's name as a string.
        The categories column is also a list of all tags appearing.
        An empty dataframe is returned when the query yields no results.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )
    # NOTE(review): newer releases of the arxiv package appear to deprecate
    # Search.results() in favour of Client().results(search) - confirm against
    # the version this project pins before changing the call.
    results = search.results()

    drop_cols = {'authors', 'links', '_raw'}

    # Collect plain dicts and build the frame once; concatenating inside the
    # loop re-copies every accumulated row on each iteration.
    rows = []
    for result in results:
        row_dict = {k: v for k, v in vars(result).items() if k not in drop_cols}
        # 'authors' is re-added last, as a list of plain name strings.
        row_dict['authors'] = [author.name for author in result.authors]
        rows.append(row_dict)

    # dtype=object keeps every column as generic Python objects, matching the
    # frame previously produced by stacking transposed Series.
    return pd.DataFrame(rows, dtype=object)
|