Michael-Geis committed on
Commit
458942a
1 Parent(s): cd530cf

created module for arxiv query retrieval

Browse files
Files changed (2) hide show
  1. arxiv_query_retrieval.py +62 -0
  2. cleaning/cleaning.py +181 -0
arxiv_query_retrieval.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import arxiv
2
+ import pandas as pd
3
+
4
def format_query(author='', title='', cat='', abstract=''):
    """Build an arXiv API query string from at most one value per field.

    Each specified field becomes a ``prefix:value`` term (``au``, ``ti``,
    ``cat``, ``abs``) and the terms are joined with `` AND ``, so a result
    must match every specified field simultaneously. To leave a field
    unspecified, omit the argument or pass an empty string.

    e.g. format_query(cat='math.AP') returns 'cat:math.AP', the query used to
    pull all articles carrying the 'math.AP' (Analysis of PDEs) subject tag.

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arxiv subject tag. See the full list of these at:
            https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        The query string, or an empty string when no field is specified.
    """
    fields = (('au', author), ('ti', title), ('cat', cat), ('abs', abstract))

    terms = []
    for prefix, value in fields:
        # Decide "unspecified" from the value itself rather than from the
        # formatted tag: a tag such as 'ti:Re:' ends with a colon yet carries
        # a real search term, while None or whitespace-only input carries none.
        text = '' if value is None else str(value).strip()
        if text:
            terms.append(f'{prefix}:{text}')

    return ' AND '.join(terms)
24
+
25
+
26
+
27
def query_to_df(query, max_results):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Args:
        query: string defining an arxiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        max_results: positive integer specifying the maximum number of results returned.

    Returns:
        pandas dataframe with one row per returned result and one column for
        each individual piece of metadata stored on the result object (its
        instance attributes, as reported by ``vars``). To see a list of these
        columns and their descriptions, see the documentation for the Result
        class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The '_raw' attribute is dropped. The 'authors' column is a list of
        each author's name as a string and the 'links' column is a list of
        each link's href string; both are placed after the other columns.
        The categories column is also a list of all tags appearing.
        The frame has a fresh 0..n-1 index and is empty when the query yields
        no results.
    """
    client = arxiv.Client(page_size=100, num_retries=3)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )

    # 'authors' and 'links' hold package objects and are re-added below in a
    # flattened form; '_raw' is discarded outright.
    replaced_or_dropped = {'authors', 'links', '_raw'}

    # Accumulate plain dicts and build the frame once. Concatenating a one-row
    # frame per result copies the whole frame on every iteration.
    rows = []
    for result in client.results(search):
        row = {
            key: value
            for key, value in vars(result).items()
            if key not in replaced_or_dropped
        }
        row['authors'] = [author.name for author in result.authors]
        row['links'] = [link.href for link in result.links]
        rows.append(row)

    # dtype=object keeps every column as object dtype, which is what the
    # previous row-by-row Series concatenation produced.
    return pd.DataFrame(rows, dtype=object)
cleaning/cleaning.py CHANGED
@@ -19,6 +19,177 @@ def main(raw_metadata_df, path_to_embeddings):
19
 
20
  ##
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  ## 1. Latin-ize latex accents enclosed in brackets
24
  def remove_latex_accents(string):
@@ -61,6 +232,16 @@ def cleanse(string):
61
 
62
  ##
63
 
 
 
 
 
 
 
 
 
 
 
64
  def find_msc(cat_list):
65
  pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
66
  out = []
 
19
 
20
  ##
21
 
22
def category_map():
    """Look-up table from arXiv subject tags to their English names.

    Returns:
        A newly built dict on every call; keys are arXiv category tags and
        values are the corresponding English names. The table is not
        exhaustive: many categories have aliases that are not listed, although
        a few are (e.g. both math.MP and math-ph are present).
    """
    astro_and_cond_mat = (
        ('astro-ph', 'Astrophysics'),
        ('astro-ph.CO', 'Cosmology and Nongalactic Astrophysics'),
        ('astro-ph.EP', 'Earth and Planetary Astrophysics'),
        ('astro-ph.GA', 'Astrophysics of Galaxies'),
        ('astro-ph.HE', 'High Energy Astrophysical Phenomena'),
        ('astro-ph.IM', 'Instrumentation and Methods for Astrophysics'),
        ('astro-ph.SR', 'Solar and Stellar Astrophysics'),
        ('cond-mat.dis-nn', 'Disordered Systems and Neural Networks'),
        ('cond-mat.mes-hall', 'Mesoscale and Nanoscale Physics'),
        ('cond-mat.mtrl-sci', 'Materials Science'),
        ('cond-mat.other', 'Other Condensed Matter'),
        ('cond-mat.quant-gas', 'Quantum Gases'),
        ('cond-mat.soft', 'Soft Condensed Matter'),
        ('cond-mat.stat-mech', 'Statistical Mechanics'),
        ('cond-mat.str-el', 'Strongly Correlated Electrons'),
        ('cond-mat.supr-con', 'Superconductivity'),
        ('cond-mat', 'Condensed Matter'),
    )
    computer_science = (
        ('cs.AI', 'Artificial Intelligence'),
        ('cs.AR', 'Hardware Architecture'),
        ('cs.CC', 'Computational Complexity'),
        ('cs.CE', 'Computational Engineering, Finance, and Science'),
        ('cs.CG', 'Computational Geometry'),
        ('cs.CL', 'Computation and Language'),
        ('cs.CR', 'Cryptography and Security'),
        ('cs.CV', 'Computer Vision and Pattern Recognition'),
        ('cs.CY', 'Computers and Society'),
        ('cs.DB', 'Databases'),
        ('cs.DC', 'Distributed, Parallel, and Cluster Computing'),
        ('cs.DL', 'Digital Libraries'),
        ('cs.DM', 'Discrete Mathematics'),
        ('cs.DS', 'Data Structures and Algorithms'),
        ('cs.ET', 'Emerging Technologies'),
        ('cs.FL', 'Formal Languages and Automata Theory'),
        ('cs.GL', 'General Literature'),
        ('cs.GR', 'Graphics'),
        ('cs.GT', 'Computer Science and Game Theory'),
        ('cs.HC', 'Human-Computer Interaction'),
        ('cs.IR', 'Information Retrieval'),
        ('cs.IT', 'Information Theory'),
        ('cs.LG', 'Machine Learning'),
        ('cs.LO', 'Logic in Computer Science'),
        ('cs.MA', 'Multiagent Systems'),
        ('cs.MM', 'Multimedia'),
        ('cs.MS', 'Mathematical Software'),
        ('cs.NA', 'Numerical Analysis'),
        ('cs.NE', 'Neural and Evolutionary Computing'),
        ('cs.NI', 'Networking and Internet Architecture'),
        ('cs.OH', 'Other Computer Science'),
        ('cs.OS', 'Operating Systems'),
        ('cs.PF', 'Performance'),
        ('cs.PL', 'Programming Languages'),
        ('cs.RO', 'Robotics'),
        ('cs.SC', 'Symbolic Computation'),
        ('cs.SD', 'Sound'),
        ('cs.SE', 'Software Engineering'),
        ('cs.SI', 'Social and Information Networks'),
        ('cs.SY', 'Systems and Control'),
    )
    econ_eess_and_hep = (
        ('econ.EM', 'Econometrics'),
        ('econ.GN', 'General Economics'),
        ('econ.TH', 'Theoretical Economics'),
        ('eess.AS', 'Audio and Speech Processing'),
        ('eess.IV', 'Image and Video Processing'),
        ('eess.SP', 'Signal Processing'),
        ('eess.SY', 'Systems and Control'),
        ('dg-ga', 'Differential Geometry'),
        ('gr-qc', 'General Relativity and Quantum Cosmology'),
        ('hep-ex', 'High Energy Physics - Experiment'),
        ('hep-lat', 'High Energy Physics - Lattice'),
        ('hep-ph', 'High Energy Physics - Phenomenology'),
        ('hep-th', 'High Energy Physics - Theory'),
    )
    mathematics = (
        ('math.AC', 'Commutative Algebra'),
        ('math.AG', 'Algebraic Geometry'),
        ('math.AP', 'Analysis of PDEs'),
        ('math.AT', 'Algebraic Topology'),
        ('math.CA', 'Classical Analysis and ODEs'),
        ('math.CO', 'Combinatorics'),
        ('math.CT', 'Category Theory'),
        ('math.CV', 'Complex Variables'),
        ('math.DG', 'Differential Geometry'),
        ('math.DS', 'Dynamical Systems'),
        ('math.FA', 'Functional Analysis'),
        ('math.GM', 'General Mathematics'),
        ('math.GN', 'General Topology'),
        ('math.GR', 'Group Theory'),
        ('math.GT', 'Geometric Topology'),
        ('math.HO', 'History and Overview'),
        ('math.IT', 'Information Theory'),
        ('math.KT', 'K-Theory and Homology'),
        ('math.LO', 'Logic'),
        ('math.MG', 'Metric Geometry'),
        ('math.MP', 'Mathematical Physics'),
        ('math.NA', 'Numerical Analysis'),
        ('math.NT', 'Number Theory'),
        ('math.OA', 'Operator Algebras'),
        ('math.OC', 'Optimization and Control'),
        ('math.PR', 'Probability'),
        ('math.QA', 'Quantum Algebra'),
        ('math.RA', 'Rings and Algebras'),
        ('math.RT', 'Representation Theory'),
        ('math.SG', 'Symplectic Geometry'),
        ('math.SP', 'Spectral Theory'),
        ('math.ST', 'Statistics Theory'),
        ('math-ph', 'Mathematical Physics'),
        ('funct-an', 'Functional Analysis'),
        ('alg-geom', 'Algebraic Geometry'),
    )
    nonlinear_nuclear_and_physics = (
        ('nlin.AO', 'Adaptation and Self-Organizing Systems'),
        ('chao-dyn', 'Chaotic Dynamics'),
        ('nlin.CD', 'Chaotic Dynamics'),
        ('nlin.CG', 'Cellular Automata and Lattice Gases'),
        ('nlin.PS', 'Pattern Formation and Solitons'),
        ('nlin.SI', 'Exactly Solvable and Integrable Systems'),
        ('nucl-ex', 'Nuclear Experiment'),
        ('nucl-th', 'Nuclear Theory'),
        ('physics.acc-ph', 'Accelerator Physics'),
        ('physics.ao-ph', 'Atmospheric and Oceanic Physics'),
        ('physics.app-ph', 'Applied Physics'),
        ('physics.atm-clus', 'Atomic and Molecular Clusters'),
        ('physics.atom-ph', 'Atomic Physics'),
        ('physics.bio-ph', 'Biological Physics'),
        ('physics.chem-ph', 'Chemical Physics'),
        ('physics.class-ph', 'Classical Physics'),
        ('physics.comp-ph', 'Computational Physics'),
        ('physics.data-an', 'Data Analysis, Statistics and Probability'),
        ('physics.ed-ph', 'Physics Education'),
        ('physics.flu-dyn', 'Fluid Dynamics'),
        ('physics.gen-ph', 'General Physics'),
        ('physics.geo-ph', 'Geophysics'),
        ('physics.hist-ph', 'History and Philosophy of Physics'),
        ('physics.ins-det', 'Instrumentation and Detectors'),
        ('physics.med-ph', 'Medical Physics'),
        ('physics.optics', 'Optics'),
        ('physics.plasm-ph', 'Plasma Physics'),
        ('physics.pop-ph', 'Popular Physics'),
        ('physics.soc-ph', 'Physics and Society'),
        ('physics.space-ph', 'Space Physics'),
    )
    quantitative_and_statistics = (
        ('q-bio.BM', 'Biomolecules'),
        ('q-bio.CB', 'Cell Behavior'),
        ('q-bio.GN', 'Genomics'),
        ('q-bio.MN', 'Molecular Networks'),
        ('q-bio.NC', 'Neurons and Cognition'),
        ('q-bio.OT', 'Other Quantitative Biology'),
        ('q-bio.PE', 'Populations and Evolution'),
        ('q-bio.QM', 'Quantitative Methods'),
        ('q-bio.SC', 'Subcellular Processes'),
        ('q-bio.TO', 'Tissues and Organs'),
        ('q-fin.CP', 'Computational Finance'),
        ('q-fin.EC', 'Economics'),
        ('q-fin.GN', 'General Finance'),
        ('q-fin.MF', 'Mathematical Finance'),
        ('q-fin.PM', 'Portfolio Management'),
        ('q-fin.PR', 'Pricing of Securities'),
        ('q-fin.RM', 'Risk Management'),
        ('q-fin.ST', 'Statistical Finance'),
        ('q-fin.TR', 'Trading and Market Microstructure'),
        ('quant-ph', 'Quantum Physics'),
        ('q-alg', 'Quantum Algebra'),
        ('stat.AP', 'Applications'),
        ('stat.CO', 'Computation'),
        ('stat.ME', 'Methodology'),
        ('stat.ML', 'Machine Learning'),
        ('stat.OT', 'Other Statistics'),
        ('stat.TH', 'Statistics Theory'),
    )

    # Concatenation order fixes the insertion order of the resulting dict.
    all_pairs = (
        astro_and_cond_mat
        + computer_science
        + econ_eess_and_hep
        + mathematics
        + nonlinear_nuclear_and_physics
        + quantitative_and_statistics
    )
    return dict(all_pairs)
192
+
193
 
194
  ## 1. Latin-ize latex accents enclosed in brackets
195
  def remove_latex_accents(string):
 
232
 
233
  ##
234
 
235
def find_hyph(text):
    """Collect the distinct hyphenated terms appearing in ``text``.

    A term is two or more runs of word characters joined by single hyphens
    (e.g. 'well-posed', 'state-of-the-art'). The pattern rejects candidates
    that are immediately preceded or followed by another hyphen.

    Args:
        text: string to search.

    Returns:
        A list of the distinct matches in order of first appearance, or None
        when ``text`` contains no hyphenated term.
    """
    pattern = r'(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b'
    keywords = regex.findall(pattern, text)

    if not keywords:
        return None

    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # result is reproducible; a set's iteration order for strings can differ
    # from one interpreter run to the next.
    return list(dict.fromkeys(keywords))
243
+
244
+
245
  def find_msc(cat_list):
246
  pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
247
  out = []