Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
458942a
1
Parent(s):
cd530cf
created module for arxiv query retrieval
Browse files- arxiv_query_retrieval.py +62 -0
- cleaning/cleaning.py +181 -0
arxiv_query_retrieval.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import arxiv
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
def format_query(author='', title='', cat='', abstract=''):
    """Return a formatted arXiv query string for a simple multi-field search.

    Handles queries with at most one term per field. To leave a field
    unspecified, leave the corresponding argument blank.

    e.g. format_query(cat='math.AP') returns 'cat:math.AP', the query string
    used to pull all articles carrying the subject tag 'math.AP'.

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arXiv subject tag. See the full list of these at:
            https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        Query string whose clauses are joined with ' AND ', so that results
        must match every specified field simultaneously. An empty string is
        returned when no field is specified.
    """
    fields = (('au', author), ('ti', title), ('cat', cat), ('abs', abstract))
    # Decide on the value itself rather than on the formatted tag: a search
    # term that legitimately ends with ':' must not be mistaken for an empty
    # field, and None must not leak into the query as the literal 'None'.
    return ' AND '.join(f'{prefix}:{value}' for prefix, value in fields if value)
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
def query_to_df(query, max_results):
    """Return the results of an arXiv API query in a pandas dataframe.

    Args:
        query: string defining an arXiv query formatted according to
            https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
        max_results: positive integer specifying the maximum number of
            results returned.

    Returns:
        pandas dataframe with one row per returned result and one column per
        attribute of the result object (everything exposed by ``vars``),
        except '_raw', which is dropped. For a description of these
        attributes see the Result class of the arxiv package:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The 'authors' column holds a list of each author's name as a string
        and the 'links' column holds a list of each link's href. These two
        columns come last. All other attributes are stored as they appear on
        the result object. The dataframe is empty when there are no results.
    """
    client = arxiv.Client(page_size=100, num_retries=3)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )

    # 'authors' and 'links' are re-added below in simplified form; '_raw' is
    # discarded entirely.
    excluded = {'authors', 'links', '_raw'}

    # Collect plain dicts and build the frame once: concatenating inside the
    # loop re-copies every previous row on each iteration.
    rows = []
    for result in client.results(search):
        row = {k: v for k, v in vars(result).items() if k not in excluded}
        row['authors'] = [author.name for author in result.authors]
        row['links'] = [link.href for link in result.links]
        rows.append(row)

    # dtype=object keeps cell values exactly as retrieved (no dtype
    # inference), matching the object columns produced by row-wise concat.
    return pd.DataFrame(rows, dtype=object)
|
cleaning/cleaning.py
CHANGED
@@ -19,6 +19,177 @@ def main(raw_metadata_df, path_to_embeddings):
|
|
19 |
|
20 |
##
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
## 1. Latin-ize latex accents enclosed in brackets
|
24 |
def remove_latex_accents(string):
|
@@ -61,6 +232,16 @@ def cleanse(string):
|
|
61 |
|
62 |
##
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
def find_msc(cat_list):
|
65 |
pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
|
66 |
out = []
|
|
|
19 |
|
20 |
##
|
21 |
|
22 |
+
def category_map():
    """Build the lookup table from arXiv subject tags to readable names.

    Returns:
        A new dict on every call, keyed by arXiv category tag (e.g.
        "math.AP") with the category's English name as the value (e.g.
        "Analysis of PDEs"). Coverage of aliases is partial: some legacy
        or alternate tags are present (e.g. both "math.MP" and "math-ph"),
        but many aliases are not included.
    """
    names = {
        # Astrophysics
        "astro-ph": "Astrophysics",
        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
        "astro-ph.EP": "Earth and Planetary Astrophysics",
        "astro-ph.GA": "Astrophysics of Galaxies",
        "astro-ph.HE": "High Energy Astrophysical Phenomena",
        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
        "astro-ph.SR": "Solar and Stellar Astrophysics",
        # Condensed matter
        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
        "cond-mat.mtrl-sci": "Materials Science",
        "cond-mat.other": "Other Condensed Matter",
        "cond-mat.quant-gas": "Quantum Gases",
        "cond-mat.soft": "Soft Condensed Matter",
        "cond-mat.stat-mech": "Statistical Mechanics",
        "cond-mat.str-el": "Strongly Correlated Electrons",
        "cond-mat.supr-con": "Superconductivity",
        "cond-mat": "Condensed Matter",
        # Computer science
        "cs.AI": "Artificial Intelligence",
        "cs.AR": "Hardware Architecture",
        "cs.CC": "Computational Complexity",
        "cs.CE": "Computational Engineering, Finance, and Science",
        "cs.CG": "Computational Geometry",
        "cs.CL": "Computation and Language",
        "cs.CR": "Cryptography and Security",
        "cs.CV": "Computer Vision and Pattern Recognition",
        "cs.CY": "Computers and Society",
        "cs.DB": "Databases",
        "cs.DC": "Distributed, Parallel, and Cluster Computing",
        "cs.DL": "Digital Libraries",
        "cs.DM": "Discrete Mathematics",
        "cs.DS": "Data Structures and Algorithms",
        "cs.ET": "Emerging Technologies",
        "cs.FL": "Formal Languages and Automata Theory",
        "cs.GL": "General Literature",
        "cs.GR": "Graphics",
        "cs.GT": "Computer Science and Game Theory",
        "cs.HC": "Human-Computer Interaction",
        "cs.IR": "Information Retrieval",
        "cs.IT": "Information Theory",
        "cs.LG": "Machine Learning",
        "cs.LO": "Logic in Computer Science",
        "cs.MA": "Multiagent Systems",
        "cs.MM": "Multimedia",
        "cs.MS": "Mathematical Software",
        "cs.NA": "Numerical Analysis",
        "cs.NE": "Neural and Evolutionary Computing",
        "cs.NI": "Networking and Internet Architecture",
        "cs.OH": "Other Computer Science",
        "cs.OS": "Operating Systems",
        "cs.PF": "Performance",
        "cs.PL": "Programming Languages",
        "cs.RO": "Robotics",
        "cs.SC": "Symbolic Computation",
        "cs.SD": "Sound",
        "cs.SE": "Software Engineering",
        "cs.SI": "Social and Information Networks",
        "cs.SY": "Systems and Control",
        # Economics
        "econ.EM": "Econometrics",
        "econ.GN": "General Economics",
        "econ.TH": "Theoretical Economics",
        # Electrical engineering and systems science
        "eess.AS": "Audio and Speech Processing",
        "eess.IV": "Image and Video Processing",
        "eess.SP": "Signal Processing",
        "eess.SY": "Systems and Control",
        # Stand-alone physics archives (and the dg-ga alias)
        "dg-ga": "Differential Geometry",
        "gr-qc": "General Relativity and Quantum Cosmology",
        "hep-ex": "High Energy Physics - Experiment",
        "hep-lat": "High Energy Physics - Lattice",
        "hep-ph": "High Energy Physics - Phenomenology",
        "hep-th": "High Energy Physics - Theory",
        # Mathematics
        "math.AC": "Commutative Algebra",
        "math.AG": "Algebraic Geometry",
        "math.AP": "Analysis of PDEs",
        "math.AT": "Algebraic Topology",
        "math.CA": "Classical Analysis and ODEs",
        "math.CO": "Combinatorics",
        "math.CT": "Category Theory",
        "math.CV": "Complex Variables",
        "math.DG": "Differential Geometry",
        "math.DS": "Dynamical Systems",
        "math.FA": "Functional Analysis",
        "math.GM": "General Mathematics",
        "math.GN": "General Topology",
        "math.GR": "Group Theory",
        "math.GT": "Geometric Topology",
        "math.HO": "History and Overview",
        "math.IT": "Information Theory",
        "math.KT": "K-Theory and Homology",
        "math.LO": "Logic",
        "math.MG": "Metric Geometry",
        "math.MP": "Mathematical Physics",
        "math.NA": "Numerical Analysis",
        "math.NT": "Number Theory",
        "math.OA": "Operator Algebras",
        "math.OC": "Optimization and Control",
        "math.PR": "Probability",
        "math.QA": "Quantum Algebra",
        "math.RA": "Rings and Algebras",
        "math.RT": "Representation Theory",
        "math.SG": "Symplectic Geometry",
        "math.SP": "Spectral Theory",
        "math.ST": "Statistics Theory",
        "math-ph": "Mathematical Physics",
        "funct-an": "Functional Analysis",
        "alg-geom": "Algebraic Geometry",
        # Nonlinear sciences
        "nlin.AO": "Adaptation and Self-Organizing Systems",
        "chao-dyn": "Chaotic Dynamics",
        "nlin.CD": "Chaotic Dynamics",
        "nlin.CG": "Cellular Automata and Lattice Gases",
        "nlin.PS": "Pattern Formation and Solitons",
        "nlin.SI": "Exactly Solvable and Integrable Systems",
        # Nuclear
        "nucl-ex": "Nuclear Experiment",
        "nucl-th": "Nuclear Theory",
        # Physics
        "physics.acc-ph": "Accelerator Physics",
        "physics.ao-ph": "Atmospheric and Oceanic Physics",
        "physics.app-ph": "Applied Physics",
        "physics.atm-clus": "Atomic and Molecular Clusters",
        "physics.atom-ph": "Atomic Physics",
        "physics.bio-ph": "Biological Physics",
        "physics.chem-ph": "Chemical Physics",
        "physics.class-ph": "Classical Physics",
        "physics.comp-ph": "Computational Physics",
        "physics.data-an": "Data Analysis, Statistics and Probability",
        "physics.ed-ph": "Physics Education",
        "physics.flu-dyn": "Fluid Dynamics",
        "physics.gen-ph": "General Physics",
        "physics.geo-ph": "Geophysics",
        "physics.hist-ph": "History and Philosophy of Physics",
        "physics.ins-det": "Instrumentation and Detectors",
        "physics.med-ph": "Medical Physics",
        "physics.optics": "Optics",
        "physics.plasm-ph": "Plasma Physics",
        "physics.pop-ph": "Popular Physics",
        "physics.soc-ph": "Physics and Society",
        "physics.space-ph": "Space Physics",
        # Quantitative biology
        "q-bio.BM": "Biomolecules",
        "q-bio.CB": "Cell Behavior",
        "q-bio.GN": "Genomics",
        "q-bio.MN": "Molecular Networks",
        "q-bio.NC": "Neurons and Cognition",
        "q-bio.OT": "Other Quantitative Biology",
        "q-bio.PE": "Populations and Evolution",
        "q-bio.QM": "Quantitative Methods",
        "q-bio.SC": "Subcellular Processes",
        "q-bio.TO": "Tissues and Organs",
        # Quantitative finance
        "q-fin.CP": "Computational Finance",
        "q-fin.EC": "Economics",
        "q-fin.GN": "General Finance",
        "q-fin.MF": "Mathematical Finance",
        "q-fin.PM": "Portfolio Management",
        "q-fin.PR": "Pricing of Securities",
        "q-fin.RM": "Risk Management",
        "q-fin.ST": "Statistical Finance",
        "q-fin.TR": "Trading and Market Microstructure",
        # Quantum
        "quant-ph": "Quantum Physics",
        "q-alg": "Quantum Algebra",
        # Statistics
        "stat.AP": "Applications",
        "stat.CO": "Computation",
        "stat.ME": "Methodology",
        "stat.ML": "Machine Learning",
        "stat.OT": "Other Statistics",
        "stat.TH": "Statistics Theory",
    }
    return names
|
192 |
+
|
193 |
|
194 |
## 1. Latin-ize latex accents enclosed in brackets
|
195 |
def remove_latex_accents(string):
|
|
|
232 |
|
233 |
##
|
234 |
|
235 |
+
def find_hyph(text):
    """Find the distinct hyphenated terms occurring in a string.

    A term is a run of word characters joined by single hyphens (e.g.
    'well-posed', 'state-of-the-art'). Runs that touch a leading, trailing,
    or doubled hyphen are not matched.

    Args:
        text: string to search.

    Returns:
        List of the unique hyphenated terms in order of first appearance, or
        None if the text contains no such term.
    """
    pattern = r'(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b'
    keywords = regex.findall(pattern, text)

    if not keywords:
        return None

    # dict.fromkeys removes duplicates while keeping first-occurrence order,
    # so the result is reproducible across runs (set iteration order for
    # strings varies with hash randomisation).
    return list(dict.fromkeys(keywords))
|
243 |
+
|
244 |
+
|
245 |
def find_msc(cat_list):
|
246 |
pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
|
247 |
out = []
|