Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
73994b7
1
Parent(s):
b0e8ca7
turned black on
Browse files- .vscode/settings.json +6 -0
- data_storage.py +45 -51
.vscode/settings.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"[python]": {
|
3 |
+
"editor.defaultFormatter": "ms-python.black-formatter"
|
4 |
+
},
|
5 |
+
"python.formatting.provider": "none"
|
6 |
+
}
|
data_storage.py
CHANGED
@@ -3,16 +3,16 @@ import pandas as pd
|
|
3 |
import data_cleaning as clean
|
4 |
from sklearn.preprocessing import MultiLabelBinarizer
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
"""
|
9 |
|
10 |
def __init__(self):
|
11 |
"""
|
12 |
data: dataframe holding the metadata. Each row represents a paper and each column is
|
13 |
a separate piece of metadata.
|
14 |
-
|
15 |
-
query: A tuple of the form (query_string,max_results) where query_string is the formatted
|
16 |
string that produced the raw data and max_results is the value of that parameter passed to the
|
17 |
arXiv API.
|
18 |
|
@@ -26,15 +26,16 @@ class ArXivData():
|
|
26 |
self.categories = None
|
27 |
|
28 |
def load_from_file():
|
29 |
-
pass
|
30 |
-
|
31 |
-
def load_from_query(self,query_string,max_results,offset):
|
32 |
-
self.data = query_to_df(
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
38 |
"""Constructs this dataset by cleaning another one.
|
39 |
|
40 |
Args:
|
@@ -48,18 +49,15 @@ class ArXivData():
|
|
48 |
def get_OHE_cats(self):
|
49 |
mlb = MultiLabelBinarizer()
|
50 |
OHE_category_array = mlb.fit_transform(self.data.categories)
|
51 |
-
return pd.DataFrame(
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
def format_query(author='',title='',cat='',abstract=''):
|
60 |
"""Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
|
61 |
leave the corresponding argument blank.
|
62 |
-
|
63 |
e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.
|
64 |
|
65 |
Args:
|
@@ -73,19 +71,18 @@ def format_query(author='',title='',cat='',abstract=''):
|
|
73 |
properly formatted query string to return all results simultaneously matching all specified fields.
|
74 |
"""
|
75 |
|
76 |
-
tags = [f
|
77 |
-
query =
|
78 |
return query
|
79 |
|
80 |
|
81 |
-
|
82 |
-
def query_to_df(query,max_results,offset):
|
83 |
"""Returns the results of an arxiv API query in a pandas dataframe.
|
84 |
|
85 |
Args:
|
86 |
-
query: string defining an arxiv query formatted according to
|
87 |
https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
|
88 |
-
|
89 |
max_results: positive integer specifying the maximum number of results returned.
|
90 |
|
91 |
chunksize:
|
@@ -98,31 +95,28 @@ def query_to_df(query,max_results,offset):
|
|
98 |
The 'links' column is dropped and the authors column is a list of each author's name as a string.
|
99 |
The categories column is also a list of all tags appearing.
|
100 |
"""
|
101 |
-
client = arxiv.Client(page_size=2000,num_retries=3)
|
102 |
search = arxiv.Search(
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
columns = [
|
109 |
-
index = range(offset,max_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
-
|
112 |
-
results = client.results(search,offset=offset)
|
113 |
-
|
114 |
-
metadata_generator = ((result.title,result.summary,
|
115 |
-
result.categories,
|
116 |
-
result.entry_id.split('/')[-1]) for result in results)
|
117 |
-
|
118 |
metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns, index=index)
|
119 |
|
120 |
-
|
121 |
return metadata_dataframe
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
3 |
import data_cleaning as clean
|
4 |
from sklearn.preprocessing import MultiLabelBinarizer
|
5 |
|
6 |
+
|
7 |
+
class ArXivData:
|
8 |
+
"""A light class for storing the metadata of a collection of arXiv papers."""
|
9 |
|
10 |
def __init__(self):
|
11 |
"""
|
12 |
data: dataframe holding the metadata. Each row represents a paper and each column is
|
13 |
a separate piece of metadata.
|
14 |
+
|
15 |
+
query: A tuple of the form (query_string,max_results) where query_string is the formatted
|
16 |
string that produced the raw data and max_results is the value of that parameter passed to the
|
17 |
arXiv API.
|
18 |
|
|
|
26 |
self.categories = None
|
27 |
|
28 |
def load_from_file():
|
29 |
+
pass
|
30 |
+
|
31 |
+
def load_from_query(self, query_string, max_results, offset):
    """Populate this dataset by running an arXiv API query.

    Stores the resulting metadata dataframe in ``self.data`` and records the
    originating query as the tuple ``self.query = (query_string, max_results)``.

    Args:
        query_string: formatted arXiv query string (see ``format_query``).
        max_results: maximum number of results to request from the API.
        offset: number of leading results to skip.
    """
    self.data = query_to_df(query=query_string, max_results=max_results, offset=offset)
    self.query = (query_string, max_results)
    # self.categories = self.get_OHE_cats()
37 |
+
|
38 |
+
def clean(self, dataset):
|
39 |
"""Constructs this dataset by cleaning another one.
|
40 |
|
41 |
Args:
|
|
|
49 |
def get_OHE_cats(self):
    """Return a one-hot encoding of the category tags of ``self.data``.

    Returns:
        pandas DataFrame with one row per paper and one 0/1 indicator column
        per category tag appearing in ``self.data.categories``, with columns
        relabeled via ``clean.category_map()``.
    """
    mlb = MultiLabelBinarizer()
    OHE_category_array = mlb.fit_transform(self.data.categories)
    # Fix: the original passed the map as the positional `mapper=` argument,
    # which renames the *index* (axis=0) by default — a no-op on the integer
    # RangeIndex. The category tags are the *columns* (mlb.classes_), so the
    # map must be applied with `columns=`.
    # NOTE(review): assumes clean.category_map() returns a dict mapping arXiv
    # tags to human-readable names — confirm against data_cleaning.py.
    return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
        columns=clean.category_map()
    )
|
|
55 |
|
56 |
|
57 |
+
def format_query(author="", title="", cat="", abstract=""):
    """Return a formatted arXiv query string for simple queries of at most one
    instance each of these fields. Leave an argument blank to leave that field
    unspecified.

    e.g. format_query(cat='math.AP') will return the string used to pull all
    articles with the subject tag 'PDEs'.

    Args:
        author: author name to match.
        title: title text to match.
        cat: arXiv subject tag to match.
        abstract: abstract text to match.

    Returns:
        properly formatted query string to return all results simultaneously
        matching all specified fields.
    """
    field_prefixes = (("au", author), ("ti", title), ("cat", cat), ("abs", abstract))
    # Only fields the caller actually supplied contribute a search term.
    search_terms = [f"{prefix}:{value}" for prefix, value in field_prefixes if value]
    return " AND ".join(search_terms)
|
77 |
|
78 |
|
79 |
+
def query_to_df(query, max_results, offset):
    """Return the results of an arXiv API query in a pandas dataframe.

    Args:
        query: string defining an arXiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
        max_results: positive integer specifying the maximum number of results
            returned.
        offset: number of results to skip from the start of the match list;
            also the first label of the returned dataframe's index.

    Returns:
        pandas dataframe with one row per paper and columns
        'title', 'summary', 'categories', 'id'. The 'id' column is the final
        path segment of the paper's entry URL; 'categories' is a list of all
        tags appearing.
    """
    api_client = arxiv.Client(page_size=2000, num_retries=3)
    api_search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )

    fetched = api_client.results(api_search, offset=offset)
    # Lazily unpack each result into one row; rows are consumed by the
    # DataFrame constructor below.
    rows = (
        (paper.title, paper.summary, paper.categories, paper.entry_id.split("/")[-1])
        for paper in fetched
    )

    return pd.DataFrame(
        rows,
        columns=["title", "summary", "categories", "id"],
        index=range(offset, max_results),
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|