Michael-Geis committed on
Commit
283e21a
1 Parent(s): aad19c5

created lib class, msc dict

Browse files
Files changed (3) hide show
  1. collection.ipynb +0 -0
  2. library_class.py +25 -0
  3. util.py +38 -1
collection.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
library_class.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import util
2
+ import pandas as pd
3
+ import os
4
+
5
class Library(object):
    """Holds a table of article metadata and a cleaned copy of it.

    Attributes:
        raw_lib: The DataFrame produced by ``load_from_file`` or
            ``load_from_query``; ``None`` until one of them has been called.
        clean_lib: The DataFrame produced by ``clean_library``; ``None``
            until it has been called.
    """

    def __init__(self):
        # Define both attributes up front so an unloaded library is
        # distinguishable from a mistyped attribute name.
        self.raw_lib = None
        self.clean_lib = None

    def load_from_file(self, library_name):
        """Read the parquet file ``./data/<library_name>`` into ``raw_lib``.

        Args:
            library_name: File name (relative to ``./data``) of the parquet
                file to read.
        """
        self.raw_lib = pd.read_parquet(os.path.join('./data', library_name))

    def load_from_query(self, query_string, max_results):
        """Populate ``raw_lib`` from ``util.query_to_df``.

        Args:
            query_string: Query passed through unchanged to
                ``util.query_to_df``.
            max_results: Result cap passed through unchanged to
                ``util.query_to_df``.
        """
        self.raw_lib = util.query_to_df(query_string, max_results)

    def clean_library(self):
        """Build ``clean_lib`` from ``raw_lib``.

        Keeps only the title, summary, authors, primary_category and
        categories columns, applies ``util.cleanse`` to the title and summary,
        and adds three derived columns: ``hyph_in_summary``,
        ``hyph_in_title`` (both via ``util.find_hyph``) and ``msc_tags``
        (``util.find_msc`` followed by ``util.msc_to_eng`` on the categories).
        ``raw_lib`` itself is not modified; the work is done on a copy.

        Raises:
            AttributeError: If no library has been loaded yet.
        """
        # getattr keeps this safe for subclasses that skip __init__.
        if getattr(self, 'raw_lib', None) is None:
            raise AttributeError(
                "raw_lib is not set; call load_from_file or load_from_query "
                "before clean_library"
            )

        ## keep only the columns that the cleaned library is built from
        cols = ['title', 'summary', 'authors', 'primary_category', 'categories']
        input_lib = self.raw_lib[cols].copy()

        # Clean the free-text columns first: the hyphen search below runs on
        # the cleansed text, not on the raw text.
        input_lib['title'] = input_lib['title'].apply(util.cleanse)
        input_lib['summary'] = input_lib['summary'].apply(util.cleanse)
        input_lib['hyph_in_summary'] = input_lib['summary'].apply(util.find_hyph)
        input_lib['hyph_in_title'] = input_lib['title'].apply(util.find_hyph)
        input_lib['msc_tags'] = input_lib.categories.apply(util.find_msc).apply(util.msc_to_eng)

        self.clean_lib = input_lib
util.py CHANGED
@@ -3,6 +3,8 @@ import glob
3
  import pandas as pd
4
  import regex
5
  import arxiv
 
 
6
 
7
  def category_map():
8
  """Maps arXiv subject categories to their full english names.
@@ -175,6 +177,26 @@ def category_map():
175
  'stat.OT': 'Other Statistics',
176
  'stat.TH': 'Statistics Theory'}
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  ## 1. Latin-ize latex accents enclosed in brackets
179
  def remove_latex_accents(string):
180
  accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
@@ -224,6 +246,19 @@ def find_hyph(text):
224
  else:
225
  return list(set(keywords))
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  def format_query(author='',title='',cat='',abstract=''):
228
  """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
229
  leave the corresponding argument blank.
@@ -264,12 +299,13 @@ def query_to_df(query,max_results):
264
  The 'links' column is dropped and the authors column is a list of each author's name as a string.
265
  The categories column is also a list of all tags appearing.
266
  """
 
267
  search = arxiv.Search(
268
  query = query,
269
  max_results=max_results,
270
  sort_by=arxiv.SortCriterion.LastUpdatedDate
271
  )
272
- results = search.results()
273
 
274
  drop_cols = ['authors','links','_raw']
275
  df = pd.DataFrame()
@@ -277,6 +313,7 @@ def query_to_df(query,max_results):
277
  for result in results:
278
  row_dict = {k : v for (k,v) in vars(result).items() if k not in drop_cols}
279
  row_dict['authors'] = [author.name for author in result.authors]
 
280
  row = pd.Series(row_dict)
281
  df = pd.concat([df , row.to_frame().transpose()], axis = 0)
282
 
 
3
  import pandas as pd
4
  import regex
5
  import arxiv
6
+ import json
7
+ import util
8
 
9
  def category_map():
10
  """Maps arXiv subject categories to their full english names.
 
177
  'stat.OT': 'Other Statistics',
178
  'stat.TH': 'Statistics Theory'}
179
 
180
+
181
def msc_tags():
    """Load and return the parsed contents of ``./data/msc.json``.

    The path is relative to the current working directory. The file is read
    again on every call, so callers that need several lookups should call
    this once and reuse the result.

    Returns:
        The decoded JSON document (presumably a dict mapping MSC codes to
        their English names -- verify against the data file).

    Raises:
        FileNotFoundError: If ``./data/msc.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8; do not depend on the platform's default text encoding.
    with open('./data/msc.json', 'r', encoding='utf-8') as file:
        return json.load(file)
185
+
186
def msc_to_eng(msc_list):
    """Translate a list of MSC tags using the dictionary from ``msc_tags()``.

    Args:
        msc_list: Iterable of tag strings, or ``None``.

    Returns:
        ``None`` if ``msc_list`` is ``None``; otherwise a list holding the
        dictionary value for every tag that is a key of the MSC dictionary,
        in input order. Tags that are not keys are skipped silently.
    """
    if msc_list is None:
        return None

    tags = list(msc_list)
    if not tags:
        # Nothing to translate, so avoid touching the data file at all.
        return []

    # Load the dictionary once per call rather than once (or twice) per tag:
    # every msc_tags() call re-opens and re-parses the JSON file.
    lookup = msc_tags()
    return [lookup[tag] for tag in tags if tag in lookup]
196
+
197
+
198
+
199
+
200
  ## 1. Latin-ize latex accents enclosed in brackets
201
  def remove_latex_accents(string):
202
  accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
 
246
  else:
247
  return list(set(keywords))
248
 
249
def find_msc(cat_list):
    """Collect every MSC-looking token found in a list of category strings.

    A token matches when it is a whole word made of two digits followed by
    three alphanumeric characters.

    Args:
        cat_list: Iterable of category strings to search.

    Returns:
        A list of all matches in order of appearance (duplicates kept), or
        ``None`` when nothing matched.
    """
    pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
    matches = [
        tag
        for cat in cat_list
        for tag in regex.findall(pattern, cat)
    ]
    # Callers get None, not an empty list, when there are no matches.
    return matches if matches else None
260
+
261
+
262
  def format_query(author='',title='',cat='',abstract=''):
263
  """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
264
  leave the corresponding argument blank.
 
299
  The 'links' column is dropped and the authors column is a list of each author's name as a string.
300
  The categories column is also a list of all tags appearing.
301
  """
302
+ client = arxiv.Client(page_size=100,num_retries=3)
303
  search = arxiv.Search(
304
  query = query,
305
  max_results=max_results,
306
  sort_by=arxiv.SortCriterion.LastUpdatedDate
307
  )
308
+ results = client.results(search)
309
 
310
  drop_cols = ['authors','links','_raw']
311
  df = pd.DataFrame()
 
313
  for result in results:
314
  row_dict = {k : v for (k,v) in vars(result).items() if k not in drop_cols}
315
  row_dict['authors'] = [author.name for author in result.authors]
316
+ row_dict['links'] = [link.href for link in result.links]
317
  row = pd.Series(row_dict)
318
  df = pd.concat([df , row.to_frame().transpose()], axis = 0)
319