Spaces:

mlgeis
/

ArXivRecommenderSystem

Runtime error

ArXivRecommenderSystem / arxiv_query_retrieval.py

Michael-Geis

wrote class for holding arxiv data

9b818c8 about 1 year ago

4.38 kB

	import arxiv
	import pandas as pd
	import data_cleaning as clean
	from sklearn.preprocessing import MultiLabelBinarizer

	class ArXivData():
	    """A light class for storing the metadata of a collection of arXiv papers.

	    Attributes:
	        data: DataFrame holding the metadata. Each row represents a paper and
	            each column is a separate piece of metadata.
	        query: A tuple of the form (query_string, max_results) where
	            query_string is the formatted string that produced the raw data
	            and max_results is the value of that parameter passed to the
	            arXiv API.
	        raw: The original, raw dataset as returned by the arXiv API, if the
	            current data is clean.
	        categories: A DataFrame containing the one-hot-encoded categories of
	            the self.data DataFrame.
	    """

	    def __init__(self):
	        """Create an empty container; every attribute starts out as None."""
	        self.data = None
	        self.query = None
	        self.raw = None
	        self.categories = None

	    def get_from_query(self, query_string, max_results):
	        """Populate this object with the results of an arXiv API query.

	        Args:
	            query_string: formatted arXiv query string, forwarded to
	                query_to_df.
	            max_results: maximum number of results, forwarded to query_to_df.
	        """
	        self.data = query_to_df(query=query_string, max_results=max_results)
	        self.query = (query_string, max_results)
	        # NOTE: raw is the very same object as data here, not a copy.
	        self.raw = self.data
	        self.categories = self.get_OHE_cats()

	    def clean(self, dataset):
	        """Constructs this dataset by cleaning another one.

	        Args:
	            dataset: An ArXivData object containing data to be cleaned.
	        """
	        # Inside a method body the bare name `clean` resolves to the module
	        # imported at file level (data_cleaning), not to this method.
	        self.data = clean.clean(dataset)
	        self.query = dataset.query
	        self.raw = dataset.raw
	        self.categories = dataset.categories

	    def get_OHE_cats(self):
	        """One-hot encode the `categories` column of self.data.

	        Returns:
	            DataFrame with one column per distinct category label found in
	            self.data.categories and one row per paper. Column labels are
	            translated through clean.category_map() (presumably arXiv tag ->
	            readable name; confirm against data_cleaning).
	        """
	        mlb = MultiLabelBinarizer()
	        OHE_category_array = mlb.fit_transform(self.data.categories)
	        one_hot = pd.DataFrame(OHE_category_array, columns=mlb.classes_)
	        # `rename(mapper=...)` without an axis targets the *index*, which left
	        # the category columns untouched; the labels to translate are the
	        # columns, so name that axis explicitly.
	        return one_hot.rename(columns=clean.category_map())





	def format_query(author='', title='', cat='', abstract=''):
	    """Returns a formatted arXiv query string for simple queries.

	    Handles queries with at most one instance each of the author, title,
	    category and abstract fields. To leave a field unspecified, leave the
	    corresponding argument blank.

	    e.g. format_query(cat='math.AP') returns 'cat:math.AP', the string used to
	    pull all articles with the subject tag 'PDEs'.

	    Args:
	        author: string to search for in the author field.
	        title: string to search for in the title field.
	        cat: A valid arXiv subject tag. See the full list of these at:
	            https://arxiv.org/category_taxonomy
	        abstract: string to search for in the abstract field.

	    Returns:
	        Query string matching all specified fields simultaneously, with the
	        individual 'prefix:value' terms joined by ' AND '. An empty string is
	        returned when every field is blank.
	    """
	    fields = (('au', author), ('ti', title), ('cat', cat), ('abs', abstract))
	    # Decide on the value itself rather than on the formatted tag: testing
	    # whether the tag ends with ':' would also discard a legitimate value
	    # that happens to end in a colon.
	    return ' AND '.join(
	        f'{prefix}:{value}' for prefix, value in fields if value
	    )



	def query_to_df(query, max_results):
	    """Returns the results of an arXiv API query in a pandas DataFrame.

	    Args:
	        query: string defining an arXiv query formatted according to
	            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
	        max_results: positive integer specifying the maximum number of
	            results returned.

	    Returns:
	        pandas DataFrame with one row per returned result and one column for
	        each individual piece of metadata of a result. To see a list of these
	        columns and their descriptions, see the documentation for the Result
	        class of the arxiv package here:
	        http://lukasschwab.me/arxiv.py/index.html#Result

	        The '_raw' attribute is dropped. The 'authors' column holds a list of
	        each author's name as a string and the 'links' column holds a list of
	        each link's href. The 'categories' column is also a list of all tags
	        appearing. The index runs 0..n-1; an empty DataFrame is returned when
	        the query yields no results.
	    """
	    client = arxiv.Client(page_size=100, num_retries=3)
	    search = arxiv.Search(
	        query=query,
	        max_results=max_results,
	        sort_by=arxiv.SortCriterion.LastUpdatedDate,
	    )

	    # Attributes that are either discarded ('_raw') or replaced below by a
	    # plain-list representation ('authors', 'links').
	    replaced_fields = {'authors', 'links', '_raw'}

	    # Collect one dict per result and build the frame once; concatenating
	    # inside the loop re-copies the accumulated frame on every iteration.
	    rows = []
	    for result in client.results(search):
	        row = {
	            key: value
	            for key, value in vars(result).items()
	            if key not in replaced_fields
	        }
	        row['authors'] = [author.name for author in result.authors]
	        row['links'] = [link.href for link in result.links]
	        rows.append(row)

	    # dtype=object keeps every column as generic Python objects, matching the
	    # frame previously assembled from transposed per-row Series.
	    return pd.DataFrame(rows, dtype=object)