File size: 4,340 Bytes
458942a
 
9b818c8
 
 
73994b7
 
 
9b818c8
 
 
 
 
73994b7
 
9b818c8
 
 
 
 
 
 
 
 
 
 
 
b0e8ca7
73994b7
 
 
 
 
 
 
 
 
 
9b818c8
 
 
 
 
 
 
 
 
 
 
 
 
73994b7
 
 
9b818c8
 
73994b7
458942a
 
73994b7
458942a
 
 
 
 
 
 
 
 
 
 
 
 
73994b7
 
458942a
 
 
73994b7
458942a
 
 
73994b7
458942a
73994b7
458942a
 
b0e8ca7
 
458942a
 
 
 
 
 
 
 
73994b7
458942a
73994b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0e8ca7
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import arxiv
import pandas as pd
import data_cleaning as clean
from sklearn.preprocessing import MultiLabelBinarizer


class ArXivData:
    """A light class for storing the metadata of a collection of arXiv papers."""

    def __init__(self):
        """Initialize an empty container.

        Attributes:
            data: DataFrame holding the metadata. Each row represents a paper and
                each column is a separate piece of metadata.
            query: A tuple (query_string, max_results) where query_string is the
                formatted string that produced the data and max_results is the value
                of that parameter passed to the arXiv API.
            raw: The original, raw dataset as returned by the arXiv API, if the
                current data is cleaned. None until populated by clean().
            categories: DataFrame containing one-hot-encoded categories of the
                self.data DataFrame.
        """
        self.data = None
        self.query = None
        # Initialized here so clean() can safely copy it from an uncleaned dataset
        # (previously reading dataset.raw raised AttributeError).
        self.raw = None
        self.categories = None

    def load_from_file(self):
        # Stub: loading from disk is not implemented yet.
        # NOTE(review): previously missing `self`, making the method uncallable
        # on an instance.
        pass

    def load_from_query(self, query_string, max_results, offset):
        """Populate self.data by running an arXiv API query.

        Args:
            query_string: formatted arXiv API query string.
            max_results: maximum number of results to retrieve.
            offset: number of leading results to skip.
        """
        self.data = query_to_df(
            query=query_string, max_results=max_results, offset=offset
        )
        self.query = (query_string, max_results)
        # self.categories = self.get_OHE_cats()

    def clean(self, dataset):
        """Constructs this dataset by cleaning another one.

        Args:
            dataset: An ArXivData object containing data to be cleaned.
        """
        self.data = clean.clean(dataset)
        self.query = dataset.query
        self.raw = dataset.raw
        self.categories = dataset.categories

    def get_OHE_cats(self):
        """Return a DataFrame of one-hot-encoded category labels for self.data.

        Returns:
            DataFrame with one column per category tag, columns renamed via
            clean.category_map().
        """
        mlb = MultiLabelBinarizer()
        OHE_category_array = mlb.fit_transform(self.data.categories)
        # rename(columns=...) — the mapper targets category-tag column names;
        # the previous rename(mapper=...) renamed the row index instead.
        return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
            columns=clean.category_map()
        )


def format_query(author="", title="", cat="", abstract=""):
    """Build an arXiv API query string from up to one instance of each field.

    Leave an argument blank to omit that field from the query.

    e.g. format_query(cat='math.AP') returns the string used to pull all
    articles carrying the subject tag 'PDEs'.

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arXiv subject tag. Full list:
        https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        Query string matching all specified fields simultaneously.
    """
    prefixes = ("au", "ti", "cat", "abs")
    values = (author, title, cat, abstract)
    candidates = (f"{prefix}:{value}" for prefix, value in zip(prefixes, values))
    # A blank field produces a tag ending in ":", which is dropped.
    return " AND ".join(tag for tag in candidates if not tag.endswith(":"))


def query_to_df(query, max_results, offset):
    """Returns the results of an arXiv API query in a pandas dataframe.

    Args:
        query: string defining an arXiv query formatted according to
        https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        max_results: positive integer specifying the maximum number of results returned.

        offset: number of leading results to skip; the returned frame's index
        starts at this value.

    Returns:
        pandas dataframe with one column per individual piece of metadata of a
        returned result. For the list of fields and their descriptions, see the
        documentation for the Results class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The 'links' column is dropped and the authors column is a list of each
        author's name as a string. The categories column is also a list of all
        tags appearing.
    """
    client = arxiv.Client(page_size=2000, num_retries=3)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )

    columns = ["title", "summary", "categories", "id"]

    results = client.results(search, offset=offset)

    metadata_generator = (
        (
            result.title,
            result.summary,
            result.categories,
            # entry_id is a URL; keep only the trailing arXiv identifier.
            result.entry_id.split("/")[-1],
        )
        for result in results
    )

    metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns)
    # Size the index to the rows actually returned: pre-building
    # range(offset, max_results) raised a length-mismatch ValueError whenever
    # the API returned fewer than max_results - offset results.
    metadata_dataframe.index = pd.RangeIndex(
        start=offset, stop=offset + len(metadata_dataframe)
    )

    return metadata_dataframe