Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
283e21a
1
Parent(s):
aad19c5
created lib class, msc dict
Browse files- collection.ipynb +0 -0
- library_class.py +25 -0
- util.py +38 -1
collection.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
library_class.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import util
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
|
5 |
+
class Library(object):
    """Holds a table of paper metadata and a cleaned copy of it.

    Attributes:
        raw_lib: Table set by ``load_from_file`` or ``load_from_query``;
            ``None`` until one of them has been called.
        clean_lib: Table built by ``clean_library``; ``None`` until that
            method has been called.
    """

    def __init__(self):
        # Declare both attributes up front so that their existence does not
        # depend on which methods have already been called.
        self.raw_lib = None
        self.clean_lib = None

    def load_from_file(self, library_name):
        """Read a parquet file from the ``./data`` directory into ``raw_lib``.

        Args:
            library_name: File name, joined onto ``./data`` (which is
                resolved against the current working directory).
        """
        self.raw_lib = pd.read_parquet(os.path.join('./data', library_name))

    def load_from_query(self, query_string, max_results):
        """Set ``raw_lib`` to the result of ``util.query_to_df``.

        Args:
            query_string: Forwarded unchanged to ``util.query_to_df``.
            max_results: Forwarded unchanged to ``util.query_to_df``.
        """
        self.raw_lib = util.query_to_df(query_string, max_results)

    def clean_library(self):
        """Build ``clean_lib`` from ``raw_lib``.

        Keeps only the title, summary, authors, primary_category and
        categories columns, replaces title and summary with the output of
        ``util.cleanse``, and adds three derived columns:
        ``hyph_in_summary`` and ``hyph_in_title`` (``util.find_hyph`` applied
        to the cleansed text) and ``msc_tags`` (``util.find_msc`` followed by
        ``util.msc_to_eng`` applied to ``categories``).

        ``raw_lib`` itself is not modified; the work is done on a copy.

        Raises:
            AttributeError: If no library has been loaded yet. The exception
                type matches what this call raised before the explicit check
                existed, so existing handlers keep working.
        """
        if self.raw_lib is None:
            raise AttributeError(
                "no library loaded; call load_from_file or load_from_query "
                "before clean_library"
            )

        # Keep only the columns that the cleaning steps below read from.
        cols = ['title', 'summary', 'authors', 'primary_category', 'categories']
        input_lib = self.raw_lib[cols].copy()

        # Cleanse first: the hyphen search below runs on the cleansed text.
        input_lib['title'] = input_lib['title'].apply(util.cleanse)
        input_lib['summary'] = input_lib['summary'].apply(util.cleanse)
        input_lib['hyph_in_summary'] = input_lib['summary'].apply(util.find_hyph)
        input_lib['hyph_in_title'] = input_lib['title'].apply(util.find_hyph)
        input_lib['msc_tags'] = input_lib.categories.apply(util.find_msc).apply(util.msc_to_eng)

        self.clean_lib = input_lib
|
util.py
CHANGED
@@ -3,6 +3,8 @@ import glob
|
|
3 |
import pandas as pd
|
4 |
import regex
|
5 |
import arxiv
|
|
|
|
|
6 |
|
7 |
def category_map():
|
8 |
"""Maps arXiv subject categories to their full english names.
|
@@ -175,6 +177,26 @@ def category_map():
|
|
175 |
'stat.OT': 'Other Statistics',
|
176 |
'stat.TH': 'Statistics Theory'}
|
177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
## 1. Latin-ize latex accents enclosed in brackets
|
179 |
def remove_latex_accents(string):
|
180 |
accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
|
@@ -224,6 +246,19 @@ def find_hyph(text):
|
|
224 |
else:
|
225 |
return list(set(keywords))
|
226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
def format_query(author='',title='',cat='',abstract=''):
|
228 |
"""Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
|
229 |
leave the corresponding argument blank.
|
@@ -264,12 +299,13 @@ def query_to_df(query,max_results):
|
|
264 |
The 'links' column is dropped and the authors column is a list of each author's name as a string.
|
265 |
The categories column is also a list of all tags appearing.
|
266 |
"""
|
|
|
267 |
search = arxiv.Search(
|
268 |
query = query,
|
269 |
max_results=max_results,
|
270 |
sort_by=arxiv.SortCriterion.LastUpdatedDate
|
271 |
)
|
272 |
-
results =
|
273 |
|
274 |
drop_cols = ['authors','links','_raw']
|
275 |
df = pd.DataFrame()
|
@@ -277,6 +313,7 @@ def query_to_df(query,max_results):
|
|
277 |
for result in results:
|
278 |
row_dict = {k : v for (k,v) in vars(result).items() if k not in drop_cols}
|
279 |
row_dict['authors'] = [author.name for author in result.authors]
|
|
|
280 |
row = pd.Series(row_dict)
|
281 |
df = pd.concat([df , row.to_frame().transpose()], axis = 0)
|
282 |
|
|
|
3 |
import pandas as pd
|
4 |
import regex
|
5 |
import arxiv
|
6 |
+
import json
|
7 |
+
import util
|
8 |
|
9 |
def category_map():
|
10 |
"""Maps arXiv subject categories to their full english names.
|
|
|
177 |
'stat.OT': 'Other Statistics',
|
178 |
'stat.TH': 'Statistics Theory'}
|
179 |
|
180 |
+
|
181 |
+
def msc_tags():
    """Load the lookup table stored in ``./data/msc.json``.

    The path is relative, so it is resolved against the current working
    directory of the process.

    Returns:
        The decoded JSON document.
        NOTE(review): msc_to_eng uses it as a dict keyed by tag string --
        confirm against the data file.

    Raises:
        FileNotFoundError: If ``./data/msc.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; pin the encoding rather than
    # relying on the platform's locale default.
    with open('./data/msc.json', 'r', encoding='utf-8') as file:
        return json.load(file)
|
185 |
+
|
186 |
+
def msc_to_eng(msc_list):
    """Translate tags into the entries stored for them in the msc_tags() table.

    Args:
        msc_list: Iterable of tags, or None.

    Returns:
        None if ``msc_list`` is None. Otherwise a list containing, in input
        order, the table entry for every tag that is a key of the mapping
        returned by ``msc_tags()``; tags missing from the table are skipped.
    """
    if msc_list is None:
        return None

    tags = list(msc_list)
    if not tags:
        # Nothing to translate, so do not touch the data file at all.
        return []

    # Read and parse the table once per call instead of once (or twice) per
    # tag; every msc_tags() call re-opens and re-parses the JSON file.
    table = msc_tags()
    return [table[tag] for tag in tags if tag in table]
|
196 |
+
|
197 |
+
|
198 |
+
|
199 |
+
|
200 |
## 1. Latin-ize latex accents enclosed in brackets
|
201 |
def remove_latex_accents(string):
|
202 |
accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
|
|
|
246 |
else:
|
247 |
return list(set(keywords))
|
248 |
|
249 |
+
def find_msc(cat_list):
    """Collect every five-character, digit-led token found in the given strings.

    A token matches when it stands between word boundaries and consists of
    two digits followed by three alphanumeric characters.

    Args:
        cat_list: Iterable of strings to search.

    Returns:
        A list of all matches in order of appearance (duplicates kept), or
        None when nothing matched.
    """
    msc_pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
    matches = [
        hit
        for entry in cat_list
        for hit in regex.findall(msc_pattern, entry)
    ]
    return matches if matches else None
|
260 |
+
|
261 |
+
|
262 |
def format_query(author='',title='',cat='',abstract=''):
|
263 |
"""Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
|
264 |
leave the corresponding argument blank.
|
|
|
299 |
The 'links' column is dropped and the authors column is a list of each author's name as a string.
|
300 |
The categories column is also a list of all tags appearing.
|
301 |
"""
|
302 |
+
client = arxiv.Client(page_size=100,num_retries=3)
|
303 |
search = arxiv.Search(
|
304 |
query = query,
|
305 |
max_results=max_results,
|
306 |
sort_by=arxiv.SortCriterion.LastUpdatedDate
|
307 |
)
|
308 |
+
results = client.results(search)
|
309 |
|
310 |
drop_cols = ['authors','links','_raw']
|
311 |
df = pd.DataFrame()
|
|
|
313 |
for result in results:
|
314 |
row_dict = {k : v for (k,v) in vars(result).items() if k not in drop_cols}
|
315 |
row_dict['authors'] = [author.name for author in result.authors]
|
316 |
+
row_dict['links'] = [link.href for link in result.links]
|
317 |
row = pd.Series(row_dict)
|
318 |
df = pd.concat([df , row.to_frame().transpose()], axis = 0)
|
319 |
|