Spaces:
Runtime error
Runtime error
Michael-Geis
commited on
Commit
•
17e4444
0
Parent(s):
initialize local repo
Browse files- .gitignore +2 -0
- README.txt +1 -0
- cleaning-abstracts.ipynb +0 -0
- data-exploration.ipynb +34 -0
- util.py +224 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
data/
|
2 |
+
__pycache__
|
README.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Fritz is a baby hippo who lives in Cincinnati.
|
cleaning-abstracts.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data-exploration.ipynb
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"attachments": {},
|
5 |
+
"cell_type": "markdown",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# EDA for cleaned arXiv dataset"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"attachments": {},
|
13 |
+
"cell_type": "markdown",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"## 1. Frequently occurring author names. Discover some large subsets of the data consisting of papers that have at least one author with a given name. What are the most commonly occurring names?"
|
17 |
+
]
|
18 |
+
}
|
19 |
+
],
|
20 |
+
"metadata": {
|
21 |
+
"kernelspec": {
|
22 |
+
"display_name": "Python 3",
|
23 |
+
"language": "python",
|
24 |
+
"name": "python3"
|
25 |
+
},
|
26 |
+
"language_info": {
|
27 |
+
"name": "python",
|
28 |
+
"version": "3.10.11"
|
29 |
+
},
|
30 |
+
"orig_nbformat": 4
|
31 |
+
},
|
32 |
+
"nbformat": 4,
|
33 |
+
"nbformat_minor": 2
|
34 |
+
}
|
util.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
import pandas as pd
|
4 |
+
import regex
|
5 |
+
|
6 |
+
def category_map():
    """Build the lookup table from arXiv category tags to readable names.

    Returns:
        A new dict on every call. Keys are arXiv subject tags (for example
        'math.AP'); values are the corresponding English names. The table is
        not exhaustive: only some legacy aliases are present (for instance
        both 'math.MP' and 'math-ph' map to 'Mathematical Physics').
    """
    names = {
        # Astrophysics
        'astro-ph': 'Astrophysics',
        'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
        'astro-ph.EP': 'Earth and Planetary Astrophysics',
        'astro-ph.GA': 'Astrophysics of Galaxies',
        'astro-ph.HE': 'High Energy Astrophysical Phenomena',
        'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
        'astro-ph.SR': 'Solar and Stellar Astrophysics',
        # Condensed matter
        'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
        'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
        'cond-mat.mtrl-sci': 'Materials Science',
        'cond-mat.other': 'Other Condensed Matter',
        'cond-mat.quant-gas': 'Quantum Gases',
        'cond-mat.soft': 'Soft Condensed Matter',
        'cond-mat.stat-mech': 'Statistical Mechanics',
        'cond-mat.str-el': 'Strongly Correlated Electrons',
        'cond-mat.supr-con': 'Superconductivity',
        'cond-mat': 'Condensed Matter',
        # Computer science
        'cs.AI': 'Artificial Intelligence',
        'cs.AR': 'Hardware Architecture',
        'cs.CC': 'Computational Complexity',
        'cs.CE': 'Computational Engineering, Finance, and Science',
        'cs.CG': 'Computational Geometry',
        'cs.CL': 'Computation and Language',
        'cs.CR': 'Cryptography and Security',
        'cs.CV': 'Computer Vision and Pattern Recognition',
        'cs.CY': 'Computers and Society',
        'cs.DB': 'Databases',
        'cs.DC': 'Distributed, Parallel, and Cluster Computing',
        'cs.DL': 'Digital Libraries',
        'cs.DM': 'Discrete Mathematics',
        'cs.DS': 'Data Structures and Algorithms',
        'cs.ET': 'Emerging Technologies',
        'cs.FL': 'Formal Languages and Automata Theory',
        'cs.GL': 'General Literature',
        'cs.GR': 'Graphics',
        'cs.GT': 'Computer Science and Game Theory',
        'cs.HC': 'Human-Computer Interaction',
        'cs.IR': 'Information Retrieval',
        'cs.IT': 'Information Theory',
        'cs.LG': 'Machine Learning',
        'cs.LO': 'Logic in Computer Science',
        'cs.MA': 'Multiagent Systems',
        'cs.MM': 'Multimedia',
        'cs.MS': 'Mathematical Software',
        'cs.NA': 'Numerical Analysis',
        'cs.NE': 'Neural and Evolutionary Computing',
        'cs.NI': 'Networking and Internet Architecture',
        'cs.OH': 'Other Computer Science',
        'cs.OS': 'Operating Systems',
        'cs.PF': 'Performance',
        'cs.PL': 'Programming Languages',
        'cs.RO': 'Robotics',
        'cs.SC': 'Symbolic Computation',
        'cs.SD': 'Sound',
        'cs.SE': 'Software Engineering',
        'cs.SI': 'Social and Information Networks',
        'cs.SY': 'Systems and Control',
        # Economics
        'econ.EM': 'Econometrics',
        'econ.GN': 'General Economics',
        'econ.TH': 'Theoretical Economics',
        # Electrical engineering and systems science
        'eess.AS': 'Audio and Speech Processing',
        'eess.IV': 'Image and Video Processing',
        'eess.SP': 'Signal Processing',
        'eess.SY': 'Systems and Control',
        # Relativity and high energy physics (plus the dg-ga alias)
        'dg-ga': 'Differential Geometry',
        'gr-qc': 'General Relativity and Quantum Cosmology',
        'hep-ex': 'High Energy Physics - Experiment',
        'hep-lat': 'High Energy Physics - Lattice',
        'hep-ph': 'High Energy Physics - Phenomenology',
        'hep-th': 'High Energy Physics - Theory',
        # Mathematics
        'math.AC': 'Commutative Algebra',
        'math.AG': 'Algebraic Geometry',
        'math.AP': 'Analysis of PDEs',
        'math.AT': 'Algebraic Topology',
        'math.CA': 'Classical Analysis and ODEs',
        'math.CO': 'Combinatorics',
        'math.CT': 'Category Theory',
        'math.CV': 'Complex Variables',
        'math.DG': 'Differential Geometry',
        'math.DS': 'Dynamical Systems',
        'math.FA': 'Functional Analysis',
        'math.GM': 'General Mathematics',
        'math.GN': 'General Topology',
        'math.GR': 'Group Theory',
        'math.GT': 'Geometric Topology',
        'math.HO': 'History and Overview',
        'math.IT': 'Information Theory',
        'math.KT': 'K-Theory and Homology',
        'math.LO': 'Logic',
        'math.MG': 'Metric Geometry',
        'math.MP': 'Mathematical Physics',
        'math.NA': 'Numerical Analysis',
        'math.NT': 'Number Theory',
        'math.OA': 'Operator Algebras',
        'math.OC': 'Optimization and Control',
        'math.PR': 'Probability',
        'math.QA': 'Quantum Algebra',
        'math.RA': 'Rings and Algebras',
        'math.RT': 'Representation Theory',
        'math.SG': 'Symplectic Geometry',
        'math.SP': 'Spectral Theory',
        'math.ST': 'Statistics Theory',
        'math-ph': 'Mathematical Physics',
        'funct-an': 'Functional Analysis',
        'alg-geom': 'Algebraic Geometry',
        # Nonlinear sciences
        'nlin.AO': 'Adaptation and Self-Organizing Systems',
        'chao-dyn': 'Chaotic Dynamics',
        'nlin.CD': 'Chaotic Dynamics',
        'nlin.CG': 'Cellular Automata and Lattice Gases',
        'nlin.PS': 'Pattern Formation and Solitons',
        'nlin.SI': 'Exactly Solvable and Integrable Systems',
        # Nuclear physics
        'nucl-ex': 'Nuclear Experiment',
        'nucl-th': 'Nuclear Theory',
        # Physics
        'physics.acc-ph': 'Accelerator Physics',
        'physics.ao-ph': 'Atmospheric and Oceanic Physics',
        'physics.app-ph': 'Applied Physics',
        'physics.atm-clus': 'Atomic and Molecular Clusters',
        'physics.atom-ph': 'Atomic Physics',
        'physics.bio-ph': 'Biological Physics',
        'physics.chem-ph': 'Chemical Physics',
        'physics.class-ph': 'Classical Physics',
        'physics.comp-ph': 'Computational Physics',
        'physics.data-an': 'Data Analysis, Statistics and Probability',
        'physics.ed-ph': 'Physics Education',
        'physics.flu-dyn': 'Fluid Dynamics',
        'physics.gen-ph': 'General Physics',
        'physics.geo-ph': 'Geophysics',
        'physics.hist-ph': 'History and Philosophy of Physics',
        'physics.ins-det': 'Instrumentation and Detectors',
        'physics.med-ph': 'Medical Physics',
        'physics.optics': 'Optics',
        'physics.plasm-ph': 'Plasma Physics',
        'physics.pop-ph': 'Popular Physics',
        'physics.soc-ph': 'Physics and Society',
        'physics.space-ph': 'Space Physics',
        # Quantitative biology
        'q-bio.BM': 'Biomolecules',
        'q-bio.CB': 'Cell Behavior',
        'q-bio.GN': 'Genomics',
        'q-bio.MN': 'Molecular Networks',
        'q-bio.NC': 'Neurons and Cognition',
        'q-bio.OT': 'Other Quantitative Biology',
        'q-bio.PE': 'Populations and Evolution',
        'q-bio.QM': 'Quantitative Methods',
        'q-bio.SC': 'Subcellular Processes',
        'q-bio.TO': 'Tissues and Organs',
        # Quantitative finance
        'q-fin.CP': 'Computational Finance',
        'q-fin.EC': 'Economics',
        'q-fin.GN': 'General Finance',
        'q-fin.MF': 'Mathematical Finance',
        'q-fin.PM': 'Portfolio Management',
        'q-fin.PR': 'Pricing of Securities',
        'q-fin.RM': 'Risk Management',
        'q-fin.ST': 'Statistical Finance',
        'q-fin.TR': 'Trading and Market Microstructure',
        # Quantum physics / quantum algebra alias
        'quant-ph': 'Quantum Physics',
        'q-alg': 'Quantum Algebra',
        # Statistics
        'stat.AP': 'Applications',
        'stat.CO': 'Computation',
        'stat.ME': 'Methodology',
        'stat.ML': 'Machine Learning',
        'stat.OT': 'Other Statistics',
        'stat.TH': 'Statistics Theory',
    }
    return names
|
176 |
+
|
177 |
+
## 1. Latin-ize LaTeX accents whose letter is enclosed in braces, e.g. \'{e} -> e
|
178 |
+
def remove_latex_accents(string):
    r"""Replace brace-delimited LaTeX accents with the bare letter they decorate.

    A match is a backslash, one accent selector character, and a single ASCII
    letter wrapped in braces; the whole construct is replaced by that letter.
    For example ``\'{e}`` becomes ``e`` and ``\c{C}`` becomes ``C``.

    Args:
        string: Text that may contain LaTeX accent markup.

    Returns:
        The text with every matched accent construct replaced by its letter.
        Text without such constructs is returned unchanged.
    """
    # Upper-case letters and the dot accent (\.) are accepted too; otherwise
    # names such as \'{E}cole or \.{Z}ygmund would keep their markup.
    braced_accent = r'\\[\'\"\^\`\~=.Hckbdruvtoil]\{([A-Za-z])\}'
    return regex.sub(braced_accent, r'\1', string)
|
184 |
+
|
185 |
+
## 2. Remove LaTeX commands together with their braced argument, e.g. \emph{text}
|
186 |
+
def remove_env(string):
    r"""Delete LaTeX commands of the form ``\name{argument}`` from the text.

    The command name must be at least two lower-case letters and the braced
    argument must be non-empty and free of nested braces. Both the command
    and its argument are removed.

    Args:
        string: Text that may contain LaTeX commands.

    Returns:
        The text with every matching command span cut out.
    """
    return regex.sub(r'\\[a-z]{2,}{[^{}]+?}', '', string)
|
191 |
+
|
192 |
+
## 3. Latin-ize non-{} enclosed latex accents:
|
193 |
+
def remove_accents(string):
    r"""Replace LaTeX accents written without braces by their base letter.

    Three forms are handled, following how TeX reads control sequences:

    * symbol accents bind to the letter right after them
      (``\'e`` -> ``e``, ``\"O`` -> ``O``);
    * letter-named accents need whitespace before the letter
      (``\c c`` -> ``c``, ``\v S`` -> ``S``); without it the sequence is a
      different command (``\beta``, ``\to``) and is left alone;
    * ``\l``, ``\o`` and ``\i`` are letters themselves and become ``l``,
      ``o`` and ``i``, swallowing the whitespace that follows.

    Args:
        string: Text that may contain LaTeX accent markup.

    Returns:
        The text with the matched accent markup replaced by plain letters.
    """
    # Special letters first, so that e.g. \'\i reduces to \'i and is then
    # picked up by the symbol-accent pass below.
    bare_letter = r'\\([loi])(?![A-Za-z])\s*'
    symbol_accent = r'\\[\'\"\^\`\~=.]([A-Za-z])'
    # Whitespace is mandatory here: "\c" glued to a letter would be the start
    # of a longer command name, which must not be mangled.
    letter_accent = r'\\[Hckbdruvt]\s+([A-Za-z])'

    for pattern in (bare_letter, symbol_accent, letter_accent):
        string = regex.sub(pattern, r'\1', string)
    return string
|
199 |
+
|
200 |
+
## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
|
201 |
+
|
202 |
+
def remove_latex(string):
    r"""Swap whitespace-led inline or display math for the token ' LATEX '.

    A match starts at a whitespace character followed by ``$`` or ``$$``,
    runs to the matching closing delimiter, and also takes any non-space
    characters glued to that delimiter (so ``$k$-algebra`` goes as a whole).
    Math that is not preceded by whitespace is left as it is.

    Args:
        string: Text that may contain dollar-delimited math.

    Returns:
        The text with each matched span replaced by ``' LATEX '``.
    """
    math_word = r'\s(\$\$?)[^\$]*?\1\S*'
    placeholder = ' LATEX '
    return regex.sub(math_word, placeholder, string)
|
206 |
+
|
207 |
+
|
208 |
+
def cleanse(string):
    """Run the full LaTeX clean-up pipeline over a piece of text.

    Newlines are turned into spaces first; then the text is passed, in this
    order, through remove_latex_accents, remove_env, remove_accents and
    remove_latex.

    Args:
        string: Raw text to clean.

    Returns:
        The text after all clean-up steps have been applied.
    """
    text = string.replace('\n', ' ')
    # Order matters: each step sees the output of the previous one.
    pipeline = (remove_latex_accents, remove_env, remove_accents, remove_latex)
    for step in pipeline:
        text = step(text)
    return text
|
215 |
+
|
216 |
+
|
217 |
+
def find_hyph(text):
    """Collect the distinct hyphenated words that occur in ``text``.

    A hyphenated word is two or more runs of word characters joined by single
    hyphens, such as ``well-known`` or ``state-of-the-art``. A candidate that
    is directly preceded or followed by another hyphen is not matched.

    Args:
        text: The string to scan.

    Returns:
        A list of the unique matches in order of first appearance, or None
        when the text contains no hyphenated word.
    """
    pattern = r'(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b'
    keywords = regex.findall(pattern, text)

    if not keywords:
        return None
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible across runs (a set's order is not).
    return list(dict.fromkeys(keywords))
|