# Spaces:
# Runtime error
# Runtime error
import arxiv | |
import pandas as pd | |
import data_cleaning as clean | |
from sklearn.preprocessing import MultiLabelBinarizer | |
import os | |
class ArXivData:
    """A light class for storing the metadata of a collection of arXiv papers."""

    def __init__(self):
        # metadata: DataFrame of paper metadata without the subject columns.
        # arxiv_subjects: one-hot-encoded subject DataFrame, set by the loaders.
        # _returned_metadata: full frame as loaded/queried, kept for reuse.
        self.metadata = None
        self.arxiv_subjects = None
        self._returned_metadata = None

    def load_from_file(self, dataset_file_name, path_to_data_dir):
        """Populate this object from a saved feather dataset.

        Args:
            dataset_file_name: file name of the feather dataset to load.
            path_to_data_dir: directory containing the dataset file.
        """
        path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
        self._returned_metadata = pd.read_feather(path_to_dataset)
        self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
        self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])

    def load_from_query(self, query_string, max_results, offset=0):
        """Populate this object from a live arXiv API query.

        Args:
            query_string: arXiv API query string (see format_query).
            max_results: maximum number of results to retrieve.
            offset: number of leading results to skip. Defaults to 0.
        """
        self._returned_metadata = query_to_df(
            query=query_string, max_results=max_results, offset=offset
        )
        self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
        self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)

    def clean(self, dataset):
        """Constructs this dataset by cleaning another one.

        Args:
            dataset: An ArXivData object containing data to be cleaned.
        """
        # NOTE(review): `query`, `raw`, and `categories` are not assigned by
        # __init__ or either loader in this file — confirm which attributes an
        # input `dataset` is expected to carry before relying on this method.
        self.data = clean.clean(dataset)
        self.query = dataset.query
        self.raw = dataset.raw
        self.categories = dataset.categories

    @staticmethod
    def get_OHE_arxiv_subjects(returned_metadata):
        """One-hot encode the `arxiv_subjects` column of a metadata frame.

        Declared as a staticmethod: the original definition omitted `self`,
        so instance calls like `self.get_OHE_arxiv_subjects(df)` bound the
        instance to `returned_metadata` and failed at runtime.

        Args:
            returned_metadata: DataFrame with an `arxiv_subjects` column whose
                entries are collections of arXiv subject tags.

        Returns:
            DataFrame of one-hot indicator columns, renamed to human-readable
            subject names via clean.category_map().
        """
        mlb = MultiLabelBinarizer()
        OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
        arxiv_subject_labels = clean.category_map()
        return pd.DataFrame(OHE_arxiv_subjects_array, columns=mlb.classes_).rename(
            columns=arxiv_subject_labels
        )
def format_query(author="", title="", cat="", abstract=""):
    """Build an arXiv API query string from at most one instance of each field.

    Leave an argument blank to omit that field from the query, e.g.
    format_query(cat='math.AP') returns the string used to pull all articles
    with the subject tag 'PDEs'.

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arxiv subject tag. See the full list of these at:
          https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        Properly formatted query string matching all specified fields
        simultaneously.
    """
    field_values = {"au": author, "ti": title, "cat": cat, "abs": abstract}
    # A field is included only when its value is nonempty and does not end in
    # ':' (a trailing colon would make the assembled tag end with ':').
    return " AND ".join(
        f"{prefix}:{value}"
        for prefix, value in field_values.items()
        if value and not value.endswith(":")
    )
def query_to_df(query, max_results, offset):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Args:
        query: string defining an arxiv query formatted according to
          https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
        max_results: positive integer specifying the maximum number of results
          returned.
        offset: number of leading results to skip; the returned frame's index
          starts at this value.

    Returns:
        pandas dataframe with one column per piece of metadata of a returned
        result. To see a list of these columns and their descriptions, see the
        documentation for the Results class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result
        The 'links' column is dropped and the authors column is a list of each
        author's name as a string. The categories column is also a list of all
        tags appearing.
    """
    client = arxiv.Client(page_size=2000, num_retries=3)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )
    columns = ["title", "summary", "categories", "id"]
    results = client.results(search, offset=offset)
    rows = [
        (
            result.title,
            result.summary,
            result.categories,
            result.entry_id.split("/")[-1],  # bare arXiv id from the entry URL
        )
        for result in results
    ]
    # Index from the number of rows actually returned: the API may yield fewer
    # than max_results, and the previous fixed range(offset, max_results) index
    # raised a length-mismatch error in the DataFrame constructor in that case.
    index = range(offset, offset + len(rows))
    raw_metadata = pd.DataFrame(rows, columns=columns, index=index)
    # drop() already returns a new frame, so no explicit copy is needed.
    returned_metadata = raw_metadata.drop(columns=["categories"])
    returned_metadata["arxiv_subjects"] = clean.extract_tags(
        raw_metadata, arxiv_tag=True
    )
    returned_metadata["msc_tags"] = clean.extract_tags(raw_metadata, arxiv_tag=False)
    return returned_metadata