Spaces:

mlgeis
/

ArXivRecommenderSystem

Runtime error

Michael-Geis

renamed modules

c2fc17c over 1 year ago

4.59 kB

	import arxiv
	import pandas as pd
	import numpy as np
	import cleaning as clean
	from dataclasses import dataclass, astuple, asdict


	class ArXivData:
	"""A class for storing the metadata of a collection of arXiv papers."""

	def __init__(self) -> None:
	self._returned_metadata = None
	self.metadata = None
	self.arxiv_subjects = None
	self.doc_strings = "title and abstract"
	self.embeddings = None

	def load_from_feather(self, path_to_dataset):
	"""Loads metadata from a saved feather file.

	Args:
	path_to_dataset: path to the feather file containing the dataset.
	"""
	self._returned_metadata = pd.read_feather(path_to_dataset)
	self.metadata = self._returned_metadata
	self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

	def load_from_query(self, query, max_results, offset=0):
	"""Loads instance with data returned from an ArXiv API query.

	Args:
	query: query string used to call the API
	max_results: maximum number of results from the API call to return
	offset: number of results to skip over initially. Defaults to 0.
	"""

	self._returned_metadata = query_to_df(
	query=query, max_results=max_results, offset=offset
	)
	self.metadata = clean.split_categories(self._returned_metadata)
	self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

	def save_as_feather(self, path_to_dataset):
	"""Saves a dataset as a feather file.

	Args:
	path_to_dataset: directory to save the dataset

	Raises:
	Exception: Raises exception if there is no data to be saved.
	"""

	if self.metadata.empty:
	raise Exception(
	"No data stored. Run load_from_query or load_from_feather to retrieve data."
	)
	self.metadata.to_feather(path_to_dataset)


	def query_to_df(query, max_results, offset):
	"""Returns the results of an arxiv API query in a pandas dataframe.

	Args:
	query: string defining an arxiv query formatted according to
	https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

	max_results: positive integer specifying the maximum number of results returned.

	chunksize:

	Returns:
	pandas dataframe with one column for indivial piece of metadata of a returned result.
	To see a list of these columns and their descriptions, see the documentation for the Results class of the arxiv package here:
	http://lukasschwab.me/arxiv.py/index.html#Result

	The 'links' column is dropped and the authors column is a list of each author's name as a string.
	The categories column is also a list of all tags appearing.
	"""
	client = arxiv.Client(page_size=2000, num_retries=10)
	search = arxiv.Search(
	query=query,
	max_results=max_results,
	sort_by=arxiv.SortCriterion.LastUpdatedDate,
	)

	columns = ["title", "summary", "categories", "id"]
	index = range(offset, max_results)

	results = client.results(search, offset=offset)

	metadata_generator = (
	(
	result.title,
	result.summary,
	result.categories,
	result.entry_id.split("/")[-1],
	)
	for result in results
	)

	returned_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
	returned_metadata = returned_metadata.rename(columns={"summary": "abstract"})
	return returned_metadata


	# def format_query(author="", title="", cat="", abstract=""):
	# """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
	# leave the corresponding argument blank.

	# e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.

	# Args:
	# author: string to search for in the author field.
	# title: string to search for in the title field.
	# cat: A valid arxiv subject tag. See the full list of these at:
	# https://arxiv.org/category_taxonomy
	# abstract: string to search for in the abstract field.

	# Returns:
	# properly formatted query string to return all results simultaneously matching all specified fields.
	# """

	# tags = [f"au:{author}", f"ti:{title}", f"cat:{cat}", f"abs:{abstract}"]
	# query = " AND ".join([tag for tag in tags if not tag.endswith(":")])
	# return query