Spaces:
Runtime error
Runtime error
File size: 5,424 Bytes
458942a 8f895f2 1d67a6e 3d2ca49 9b818c8 73994b7 8f895f2 9b818c8 8f895f2 cbdef5e 8f895f2 9b818c8 8f895f2 cbdef5e 8f895f2 9b818c8 8f895f2 73994b7 8f895f2 cbdef5e 8f895f2 b2af341 8f895f2 fcfd917 8f895f2 73994b7 8f895f2 b777cd0 8f895f2 b777cd0 73994b7 3d2ca49 458942a 73994b7 458942a 73994b7 458942a b2af341 b0e8ca7 458942a 8f895f2 b2af341 3d2ca49 b2af341 73994b7 3d2ca49 73994b7 3d2ca49 73994b7 b0e8ca7 3d2ca49 8f895f2 cbdef5e fcfd917 8f895f2 b0e8ca7 8f895f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import arxiv
import pandas as pd
import numpy as np
import cleaning as clean
from sklearn.base import TransformerMixin, BaseEstimator
class Fetch(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that fetches arXiv metadata for a list of IDs.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    delegates to ``query_to_df``.
    """

    def fit(self, X=None, y=None):
        """Do nothing and return the transformer itself.

        ``X`` and ``y`` are accepted (and ignored) because scikit-learn's
        ``fit_transform`` and ``Pipeline`` pass the data to ``fit``; without
        these parameters those calls fail with a ``TypeError``. Both default
        to ``None`` so a bare ``fit()`` call keeps working.

        Args:
            X: ignored.
            y: ignored.

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """Return the metadata dataframe for the arXiv IDs in ``X``.

        Args:
            X: arXiv IDs, forwarded as ``id_list`` to ``query_to_df``.
            y: ignored.

        Returns:
            The dataframe produced by ``query_to_df(id_list=X)``.
        """
        return query_to_df(id_list=X)
class ArXivData:
    """A class for storing the metadata of a collection of arXiv papers."""

    def __init__(self) -> None:
        # Dataframe exactly as it came back from the API call or feather file.
        self._returned_metadata = None
        # Dataframe exposed to users; set by one of the load_* methods.
        self.metadata = None
        # Output of clean.OHE_arxiv_subjects(self.metadata); set by load_*.
        self.arxiv_subjects = None
        # NOTE(review): presumably names the text fields used to build the
        # document strings for embedding -- confirm against the consumer.
        self.doc_strings = "title and abstract"
        # Not populated anywhere in this class; left for external code to set.
        self.embeddings = None

    def load_from_feather(self, path_to_dataset):
        """Loads metadata from a saved feather file.

        The stored dataframe is used as-is (no category splitting is applied).

        Args:
            path_to_dataset: path to the feather file containing the dataset.
        """
        self._returned_metadata = pd.read_feather(path_to_dataset)
        self.metadata = self._returned_metadata
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def load_from_query(self, query, max_results, offset=0):
        """Loads instance with data returned from an ArXiv API query.

        Args:
            query: query string used to call the API
            max_results: maximum number of results from the API call to return
            offset: number of results to skip over initially. Defaults to 0.
        """
        self._returned_metadata = query_to_df(
            query=query, max_results=max_results, offset=offset
        )
        self.metadata = clean.split_categories(self._returned_metadata)
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def load_from_id_list(self, id_list):
        """Loads instance with the metadata of the papers with the given IDs.

        Args:
            id_list: sized collection of arXiv IDs (strings) to retrieve.
        """
        self._returned_metadata = query_to_df(id_list=id_list, max_results=len(id_list))
        self.metadata = clean.split_categories(self._returned_metadata)
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def save_as_feather(self, path_to_dataset):
        """Saves a dataset as a feather file.

        Args:
            path_to_dataset: path of the feather file to write

        Raises:
            Exception: Raises exception if there is no data to be saved,
                i.e. nothing has been loaded yet or the loaded dataframe
                is empty.
        """
        # metadata is None until a load_* method has run; check that first so
        # the caller gets the intended message instead of an AttributeError.
        if self.metadata is None or self.metadata.empty:
            raise Exception(
                "No data stored. Run load_from_query or load_from_feather to retrieve data."
            )
        self.metadata.to_feather(path_to_dataset)
def query_to_df(query=None, id_list=None, max_results=10, offset=0):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Exactly one kind of search is performed: if ``id_list`` is non-empty the
    papers with those IDs are fetched (and ``query`` is ignored); otherwise
    ``query`` is used.

    Args:
        query: string defining an arxiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
        id_list: arxiv ids (strings) to retrieve. Any iterable is accepted,
            including numpy arrays and pandas Series; a single string is
            treated as one id.
        max_results: positive integer specifying the maximum number of results
            returned. Ignored when ``id_list`` is given, in which case the
            number of ids is used instead.
        offset: number of results to skip over initially. Defaults to 0.

    Returns:
        pandas dataframe with one row per returned result and the columns
        'title', 'abstract', 'authors', 'categories' and 'id'. The authors
        column holds a list of each author's name as a string, and the
        categories column holds the list of all tags of the result. For the
        underlying fields see the Result class of the arxiv package:
        http://lukasschwab.me/arxiv.py/index.html#Result

    Raises:
        Exception: if neither a query string nor any arxiv id is supplied.
    """
    # Normalise id_list to a plain list. Truth-testing the raw argument fails
    # for array-likes, and a bare string would otherwise be treated as a
    # sequence of single characters.
    if id_list is None:
        ids = []
    elif isinstance(id_list, str):
        ids = [id_list]
    else:
        ids = list(id_list)

    # Validate before building the API client so bad calls fail immediately.
    if not ids and not query:
        raise Exception("You must pass either a query string or a list of arxiv IDs")

    if ids:
        search = arxiv.Search(
            id_list=ids,
            max_results=len(ids),
            sort_by=arxiv.SortCriterion.LastUpdatedDate,
        )
    else:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.LastUpdatedDate,
        )

    client = arxiv.Client(page_size=2000, num_retries=10)
    columns = ["title", "abstract", "authors", "categories", "id"]

    # Rows are produced lazily while the client pages through the results.
    rows = (
        (
            result.title,
            result.summary,
            [author.name for author in result.authors],
            result.categories,
            # Last path segment of the entry URL.
            # NOTE(review): for old-style ids such as "math/0301001v1" this
            # keeps only "0301001v1" -- confirm whether the archive prefix
            # is needed downstream.
            result.entry_id.split("/")[-1],
        )
        for result in client.results(search, offset=offset)
    )
    return pd.DataFrame(rows, columns=columns)
# def format_query(author="", title="", cat="", abstract=""):
# """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
# leave the corresponding argument blank.
# e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.
# Args:
# author: string to search for in the author field.
# title: string to search for in the title field.
# cat: A valid arxiv subject tag. See the full list of these at:
# https://arxiv.org/category_taxonomy
# abstract: string to search for in the abstract field.
# Returns:
# properly formatted query string to return all results simultaneously matching all specified fields.
# """
# tags = [f"au:{author}", f"ti:{title}", f"cat:{cat}", f"abs:{abstract}"]
# query = " AND ".join([tag for tag in tags if not tag.endswith(":")])
# return query
|