import arxiv
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

import data_cleaning as clean


class ArXivData:
    """A light class for storing the metadata of a collection of arXiv papers.

    Attributes:
        data: DataFrame holding the metadata. Each row represents a paper and
            each column is a separate piece of metadata.
        query: A tuple (query_string, max_results) where query_string is the
            formatted string that produced the raw data and max_results is the
            value of that parameter passed to the arXiv API.
        raw: The original, raw dataset as returned by the arXiv API, if the
            current data is clean.
        categories: A DataFrame containing the one-hot-encoded categories of
            the self.data DataFrame.
    """

    def __init__(self):
        self.data = None
        self.query = None
        self.raw = None
        self.categories = None

    def get_from_query(self, query_string, max_results):
        """Populate this dataset from the results of an arXiv API query.

        Args:
            query_string: formatted arXiv query string (see format_query).
            max_results: maximum number of results to request from the API.
        """
        self.data = query_to_df(query=query_string, max_results=max_results)
        self.query = (query_string, max_results)
        # NOTE(review): this aliases self.data rather than copying it, so any
        # later in-place mutation of self.data is visible through self.raw as
        # well — confirm that is intended before relying on raw as a snapshot.
        self.raw = self.data
        self.categories = self.get_OHE_cats()

    def clean(self, dataset):
        """Construct this dataset by cleaning another one.

        Args:
            dataset: An ArXivData object containing data to be cleaned.
        """
        self.data = clean.clean(dataset)
        self.query = dataset.query
        self.raw = dataset.raw
        self.categories = dataset.categories

    def get_OHE_cats(self):
        """Return a DataFrame one-hot encoding the categories of self.data.

        Each column is one category tag appearing somewhere in
        self.data.categories; each row matches the corresponding row of
        self.data, with 1 where the paper carries that tag.
        """
        mlb = MultiLabelBinarizer()
        OHE_category_array = mlb.fit_transform(self.data.categories)
        # BUGFIX: the original passed mapper= with no axis, and
        # DataFrame.rename defaults to the index — a RangeIndex here — so the
        # category-name map was silently a no-op. The category tags produced
        # by MultiLabelBinarizer are the *columns*, so rename along axis=1.
        return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
            mapper=clean.category_map(), axis=1
        )


def format_query(author='', title='', cat='', abstract=''):
    """Return a formatted arXiv query string for simple queries.

    Handles at most one instance each of these fields. To leave a field
    unspecified, leave the corresponding argument blank, e.g.
    format_query(cat='math.AP') returns the string used to pull all articles
    with the subject tag 'PDEs'.

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arXiv subject tag. See the full list of these at:
            https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        Properly formatted query string returning all results simultaneously
        matching all specified fields.
    """
    tags = [f'au:{author}', f'ti:{title}', f'cat:{cat}', f'abs:{abstract}']
    # A tag ending in ':' had an empty argument — drop it from the query.
    query = ' AND '.join([tag for tag in tags if not tag.endswith(':')])
    return query


def query_to_df(query, max_results):
    """Return the results of an arXiv API query in a pandas DataFrame.

    Args:
        query: string defining an arXiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
        max_results: positive integer specifying the maximum number of
            results returned.

    Returns:
        pandas DataFrame with one column per piece of metadata of a returned
        result. For the list of these columns and their descriptions, see the
        documentation of the Result class of the arxiv package:
        http://lukasschwab.me/arxiv.py/index.html#Result
        The '_raw' column is dropped; the authors column holds each author's
        name as a string and the links column holds each link's href; the
        categories column is a list of all tags appearing.
    """
    client = arxiv.Client(page_size=100, num_retries=3)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )
    # Replace the raw 'authors' and 'links' objects with plain strings so the
    # resulting frame holds only simple Python values.
    drop_cols = {'authors', 'links', '_raw'}
    rows = []
    for result in client.results(search):
        row_dict = {k: v for k, v in vars(result).items() if k not in drop_cols}
        row_dict['authors'] = [author.name for author in result.authors]
        row_dict['links'] = [link.href for link in result.links]
        rows.append(row_dict)
    # PERF: build the DataFrame once from the accumulated records instead of
    # pd.concat inside the loop, which copies the whole frame every iteration
    # (quadratic). A fresh frame already carries a clean RangeIndex.
    return pd.DataFrame(rows)