Michael-Geis committed on
Commit
73994b7
1 Parent(s): b0e8ca7

Turned on the Black formatter

Browse files
Files changed (2) hide show
  1. .vscode/settings.json +6 -0
  2. data_storage.py +45 -51
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter"
4
+ },
5
+ "python.formatting.provider": "none"
6
+ }
data_storage.py CHANGED
@@ -3,16 +3,16 @@ import pandas as pd
3
  import data_cleaning as clean
4
  from sklearn.preprocessing import MultiLabelBinarizer
5
 
6
- class ArXivData():
7
- """A light class for storing the metadata of a collection of arXiv papers.
8
- """
9
 
10
  def __init__(self):
11
  """
12
  data: dataframe holding the metadata. Each row represents a paper and each column is
13
  a separate piece of metadata.
14
-
15
- query: A tuple of the form (query_string,max_results) where query_string is the formatted
16
  string that produced the raw data and max_results is the value of that parameter passed to the
17
  arXiv API.
18
 
@@ -26,15 +26,16 @@ class ArXivData():
26
  self.categories = None
27
 
28
  def load_from_file():
29
- pass
30
-
31
- def load_from_query(self,query_string,max_results,offset):
32
- self.data = query_to_df(query=query_string,max_results=max_results,offset=offset)
33
- self.query = (query_string,max_results)
34
- #self.categories = self.get_OHE_cats()
35
-
36
-
37
- def clean(self,dataset):
 
38
  """Constructs this dataset by cleaning another one.
39
 
40
  Args:
@@ -48,18 +49,15 @@ class ArXivData():
48
  def get_OHE_cats(self):
49
  mlb = MultiLabelBinarizer()
50
  OHE_category_array = mlb.fit_transform(self.data.categories)
51
- return pd.DataFrame(
52
- OHE_category_array, columns = mlb.classes_).rename(
53
- mapper=clean.category_map())
54
-
55
 
56
 
57
-
58
-
59
- def format_query(author='',title='',cat='',abstract=''):
60
  """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
61
  leave the corresponding argument blank.
62
-
63
  e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.
64
 
65
  Args:
@@ -73,19 +71,18 @@ def format_query(author='',title='',cat='',abstract=''):
73
  properly formatted query string to return all results simultaneously matching all specified fields.
74
  """
75
 
76
- tags = [f'au:{author}', f'ti:{title}', f'cat:{cat}', f'abs:{abstract}']
77
- query = ' AND '.join([tag for tag in tags if not tag.endswith(':')])
78
  return query
79
 
80
 
81
-
82
- def query_to_df(query,max_results,offset):
83
  """Returns the results of an arxiv API query in a pandas dataframe.
84
 
85
  Args:
86
- query: string defining an arxiv query formatted according to
87
  https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
88
-
89
  max_results: positive integer specifying the maximum number of results returned.
90
 
91
  chunksize:
@@ -98,31 +95,28 @@ def query_to_df(query,max_results,offset):
98
  The 'links' column is dropped and the authors column is a list of each author's name as a string.
99
  The categories column is also a list of all tags appearing.
100
  """
101
- client = arxiv.Client(page_size=2000,num_retries=3)
102
  search = arxiv.Search(
103
- query = query,
104
- max_results=max_results,
105
- sort_by=arxiv.SortCriterion.LastUpdatedDate
106
- )
107
-
108
- columns = ['title','summary','categories','id']
109
- index = range(offset,max_results)
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
-
112
- results = client.results(search,offset=offset)
113
-
114
- metadata_generator = ((result.title,result.summary,
115
- result.categories,
116
- result.entry_id.split('/')[-1]) for result in results)
117
-
118
  metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns, index=index)
119
 
120
-
121
  return metadata_dataframe
122
-
123
-
124
-
125
-
126
-
127
-
128
-
 
3
  import data_cleaning as clean
4
  from sklearn.preprocessing import MultiLabelBinarizer
5
 
6
+
7
+ class ArXivData:
8
+ """A light class for storing the metadata of a collection of arXiv papers."""
9
 
10
  def __init__(self):
11
  """
12
  data: dataframe holding the metadata. Each row represents a paper and each column is
13
  a separate piece of metadata.
14
+
15
+ query: A tuple of the form (query_string,max_results) where query_string is the formatted
16
  string that produced the raw data and max_results is the value of that parameter passed to the
17
  arXiv API.
18
 
 
26
  self.categories = None
27
 
28
  def load_from_file():
29
+ pass
30
+
31
+ def load_from_query(self, query_string, max_results, offset):
32
+ self.data = query_to_df(
33
+ query=query_string, max_results=max_results, offset=offset
34
+ )
35
+ self.query = (query_string, max_results)
36
+ # self.categories = self.get_OHE_cats()
37
+
38
+ def clean(self, dataset):
39
  """Constructs this dataset by cleaning another one.
40
 
41
  Args:
 
49
  def get_OHE_cats(self):
50
  mlb = MultiLabelBinarizer()
51
  OHE_category_array = mlb.fit_transform(self.data.categories)
52
+ return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
53
+ mapper=clean.category_map()
54
+ )
 
55
 
56
 
57
+ def format_query(author="", title="", cat="", abstract=""):
 
 
58
  """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
59
  leave the corresponding argument blank.
60
+
61
  e.g. format_query(cat='math.AP') will return the string used to pull all articles with the subject tag 'PDEs'.
62
 
63
  Args:
 
71
  properly formatted query string to return all results simultaneously matching all specified fields.
72
  """
73
 
74
+ tags = [f"au:{author}", f"ti:{title}", f"cat:{cat}", f"abs:{abstract}"]
75
+ query = " AND ".join([tag for tag in tags if not tag.endswith(":")])
76
  return query
77
 
78
 
79
+ def query_to_df(query, max_results, offset):
 
80
  """Returns the results of an arxiv API query in a pandas dataframe.
81
 
82
  Args:
83
+ query: string defining an arxiv query formatted according to
84
  https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
85
+
86
  max_results: positive integer specifying the maximum number of results returned.
87
 
88
  chunksize:
 
95
  The 'links' column is dropped and the authors column is a list of each author's name as a string.
96
  The categories column is also a list of all tags appearing.
97
  """
98
+ client = arxiv.Client(page_size=2000, num_retries=3)
99
  search = arxiv.Search(
100
+ query=query,
101
+ max_results=max_results,
102
+ sort_by=arxiv.SortCriterion.LastUpdatedDate,
103
+ )
104
+
105
+ columns = ["title", "summary", "categories", "id"]
106
+ index = range(offset, max_results)
107
+
108
+ results = client.results(search, offset=offset)
109
+
110
+ metadata_generator = (
111
+ (
112
+ result.title,
113
+ result.summary,
114
+ result.categories,
115
+ result.entry_id.split("/")[-1],
116
+ )
117
+ for result in results
118
+ )
119
 
 
 
 
 
 
 
 
120
  metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns, index=index)
121
 
 
122
  return metadata_dataframe