File size: 5,424 Bytes
458942a
 
8f895f2
1d67a6e
3d2ca49
 
 
 
 
 
 
 
 
9b818c8
73994b7
 
8f895f2
9b818c8
8f895f2
 
cbdef5e
 
8f895f2
 
9b818c8
8f895f2
 
 
 
 
 
cbdef5e
8f895f2
 
9b818c8
8f895f2
 
73994b7
8f895f2
 
 
 
 
cbdef5e
8f895f2
 
 
 
 
 
b2af341
 
 
 
 
8f895f2
 
fcfd917
8f895f2
 
73994b7
8f895f2
 
 
 
 
b777cd0
 
 
8f895f2
b777cd0
73994b7
3d2ca49
458942a
 
 
73994b7
458942a
73994b7
458942a
 
b2af341
b0e8ca7
458942a
 
 
 
 
 
 
 
8f895f2
b2af341
 
3d2ca49
b2af341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73994b7
3d2ca49
73994b7
 
 
 
 
 
 
3d2ca49
73994b7
 
 
 
 
b0e8ca7
3d2ca49
8f895f2
cbdef5e
fcfd917
8f895f2
 
 
b0e8ca7
8f895f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import arxiv
import pandas as pd
import numpy as np
import cleaning as clean
from sklearn.base import TransformerMixin, BaseEstimator


class Fetch(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps a collection of arXiv ids to metadata.

    The transformer learns nothing during fitting; all the work happens in
    ``transform``, which delegates to ``query_to_df``.
    """

    def fit(self, X=None, y=None):
        """Return the transformer unchanged.

        ``X`` and ``y`` are accepted (and ignored) so that the signature
        matches what ``TransformerMixin.fit_transform`` and scikit-learn
        pipelines pass to ``fit``. Both are optional, so a bare ``fit()``
        call keeps working.

        Args:
            X: ignored.
            y: ignored.

        Returns:
            This transformer instance.
        """
        return self

    def transform(self, X, y=None):
        """Fetch the metadata for the arXiv ids in ``X``.

        Args:
            X: arXiv ids, forwarded to ``query_to_df`` as ``id_list``.
            y: ignored.

        Returns:
            Whatever ``query_to_df(id_list=X)`` returns.
        """
        return query_to_df(id_list=X)


class ArXivData:
    """A class for storing the metadata of a collection of arXiv papers."""

    def __init__(self) -> None:
        # Frame exactly as it came back from the feather file / API query.
        self._returned_metadata = None
        # Frame exposed to users; set by one of the load_* methods.
        self.metadata = None
        # Output of clean.OHE_arxiv_subjects for the current metadata.
        self.arxiv_subjects = None
        # NOTE(review): presumably names the text fields used to build
        # document strings for embedding -- confirm against the callers.
        self.doc_strings = "title and abstract"
        # Not populated anywhere in this class.
        self.embeddings = None

    def load_from_feather(self, path_to_dataset):
        """Loads metadata from a saved feather file.

        Unlike the query-based loaders, the frame read from disk is used as
        the metadata directly, without calling ``clean.split_categories``.

        Args:
            path_to_dataset: path to the feather file containing the dataset.
        """
        self._returned_metadata = pd.read_feather(path_to_dataset)
        self.metadata = self._returned_metadata
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def load_from_query(self, query, max_results, offset=0):
        """Loads instance with data returned from an ArXiv API query.

        Args:
            query: query string used to call the API
            max_results: maximum number of results from the API call to return
            offset: number of results to skip over initially. Defaults to 0.
        """

        self._returned_metadata = query_to_df(
            query=query, max_results=max_results, offset=offset
        )
        self.metadata = clean.split_categories(self._returned_metadata)
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def load_from_id_list(self, id_list):
        """Loads instance with the metadata of an explicit list of papers.

        Args:
            id_list: list of arXiv ids (as strings) to retrieve.
        """
        self._returned_metadata = query_to_df(id_list=id_list, max_results=len(id_list))
        self.metadata = clean.split_categories(self._returned_metadata)
        self.arxiv_subjects = clean.OHE_arxiv_subjects(self.metadata)

    def save_as_feather(self, path_to_dataset):
        """Saves a dataset as a feather file.

        Args:
            path_to_dataset: directory to save the dataset

        Raises:
            Exception: Raises exception if there is no data to be saved.
        """

        # metadata is None until a load_* method has run; check that first so
        # a fresh instance gets the documented error, not an AttributeError.
        if self.metadata is None or self.metadata.empty:
            raise Exception(
                "No data stored. Run load_from_query or load_from_feather to retrieve data."
            )
        self.metadata.to_feather(path_to_dataset)


def query_to_df(query=None, id_list=None, max_results=10, offset=0):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Either ``query`` or ``id_list`` must be given. When ``id_list`` is
    non-empty it takes precedence: ``query`` is ignored and ``max_results``
    is replaced by the number of ids.

    Args:
        query: string defining an arxiv query formatted according to
        https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        id_list: arxiv ids to retrieve. A list of strings, any other iterable
        of ids (e.g. a numpy array or pandas Series), or a single id string.

        max_results: positive integer specifying the maximum number of results returned.
        Only used for ``query`` searches.

        offset: number of leading results to skip. Defaults to 0.

    Returns:
        pandas dataframe with one row per returned result and the columns
        'title', 'abstract', 'authors', 'categories' and 'id'.

        The 'abstract' column holds the result's summary, 'authors' is a list of each
        author's name as a string and 'categories' is the list of all tags appearing.
        For the underlying fields, see the documentation for the Result class of the
        arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

    Raises:
        Exception: if neither a query string nor any arxiv id is supplied.
    """
    # Normalise the ids up front: truth-testing a numpy array or pandas
    # Series raises ValueError, and a bare string would otherwise be treated
    # as a sequence of one-character ids.
    if id_list is None:
        ids = []
    elif isinstance(id_list, str):
        ids = [id_list]
    else:
        ids = [str(arxiv_id) for arxiv_id in id_list]

    # Validate before building the client so bad calls fail fast.
    if not ids and not query:
        raise Exception("You must pass either a query string or a list of arxiv IDs")

    client = arxiv.Client(page_size=2000, num_retries=10)

    if ids:
        search = arxiv.Search(
            id_list=ids,
            max_results=len(ids),
            sort_by=arxiv.SortCriterion.LastUpdatedDate,
        )
    else:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.LastUpdatedDate,
        )

    columns = ["title", "abstract", "authors", "categories", "id"]

    results = client.results(search, offset=offset)

    # NOTE(review): the id is the last "/"-separated piece of entry_id, so an
    # old-style id such as "math/0211159v1" is stored as "0211159v1" without
    # its archive prefix -- confirm whether that is intended.
    metadata_generator = (
        (
            result.title,
            result.summary,
            [author.name for author in result.authors],
            result.categories,
            result.entry_id.split("/")[-1],
        )
        for result in results
    )

    return pd.DataFrame(metadata_generator, columns=columns)


# def format_query(author="", title="", cat="", abstract=""):
#     """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
#     leave the corresponding argument blank.

#     e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.

#     Args:
#         author: string to search for in the author field.
#         title: string to search for in the title field.
#         cat: A valid arxiv subject tag. See the full list of these at:
#         https://arxiv.org/category_taxonomy
#         abstract: string to search for in the abstract field.

#     Returns:
#         properly formatted query string to return all results simultaneously matching all specified fields.
#     """

#     tags = [f"au:{author}", f"ti:{title}", f"cat:{cat}", f"abs:{abstract}"]
#     query = " AND ".join([tag for tag in tags if not tag.endswith(":")])
#     return query