Michael-Geis committed on
Commit
aad19c5
1 Parent(s): 62ba9f3

added arxiv query functions

Browse files
Files changed (1) hide show
  1. util.py +59 -0
util.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import glob
3
  import pandas as pd
4
  import regex
 
5
 
6
  def category_map():
7
  """Maps arXiv subject categories to their full english names.
@@ -222,3 +223,61 @@ def find_hyph(text):
222
  return None
223
  else:
224
  return list(set(keywords))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import glob
3
  import pandas as pd
4
  import regex
5
+ import arxiv
6
 
7
  def category_map():
8
  """Maps arXiv subject categories to their full english names.
 
223
  return None
224
  else:
225
  return list(set(keywords))
226
+
227
def format_query(author='', title='', cat='', abstract=''):
    """Build an arXiv API query string from at most one value per field.

    Every specified field becomes a ``prefix:value`` term and the terms are
    joined with `` AND ``, so the query matches only results satisfying all
    of them simultaneously. To leave a field unspecified, leave the
    corresponding argument blank.

    e.g. format_query(cat='math.AP') returns 'cat:math.AP', the string used
    to pull all articles with the subject tag 'PDEs'.

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arxiv subject tag. See the full list of these at:
            https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        Query string combining every specified field with AND. An empty
        string is returned when no field is specified.
    """
    fields = (
        ('au', author),
        ('ti', title),
        ('cat', cat),
        ('abs', abstract),
    )
    # Decide whether a field is blank by inspecting its value, not the
    # formatted tag: a value that itself ends in ':' (e.g. a title such as
    # 'Gravity:') must be kept, while None or whitespace-only values must
    # not turn into junk terms like 'au:None'.
    return ' AND '.join(
        f'{prefix}:{value}'
        for prefix, value in fields
        if value is not None and str(value).strip()
    )
247
+
248
+
249
+
250
def query_to_df(query, max_results):
    """Returns the results of an arxiv API query in a pandas dataframe.

    The search is requested sorted by last-updated date.

    Args:
        query: string defining an arxiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        max_results: positive integer specifying the maximum number of
            results returned.

    Returns:
        pandas dataframe with one row per result and one column for each
        individual piece of metadata of a returned result. To see a list of
        these columns and their descriptions, see the documentation for the
        Result class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The 'links' and '_raw' attributes are dropped and the authors column
        is a list of each author's name as a string. The categories column
        is left as provided by the arxiv package (presumably a list of all
        tags appearing; confirm against the Result documentation).
        An empty dataframe is returned when the query yields no results.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )

    # 'authors' is excluded here because it is re-added below as plain name
    # strings instead of the package's author objects.
    drop_cols = {'authors', 'links', '_raw'}

    # Collect plain dicts and build the frame once at the end; calling
    # pd.concat inside the loop re-copies every accumulated row on each
    # iteration, which is quadratic in the number of results.
    rows = []
    for result in search.results():
        row = {k: v for k, v in vars(result).items() if k not in drop_cols}
        row['authors'] = [author.name for author in result.authors]
        rows.append(row)

    # dtype=object keeps every column as generic Python objects, matching
    # the frame the previous row-by-row construction produced.
    return pd.DataFrame(rows, dtype=object)