Michael-Geis committed on
Commit
17e4444
0 Parent(s):

initialize local repo

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. README.txt +1 -0
  3. cleaning-abstracts.ipynb +0 -0
  4. data-exploration.ipynb +34 -0
  5. util.py +224 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data/
2
+ __pycache__
README.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Fritz is a baby hippo who lives in Cincinnati.
cleaning-abstracts.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
data-exploration.ipynb ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# EDA for cleaned arXiv dataset"
9
+ ]
10
+ },
11
+ {
12
+ "attachments": {},
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "## 1. Frequently occurring author names. Discover some large subsets of the data consisting of papers that have at least one author with a given name. What are the most commonly occurring names?"
17
+ ]
18
+ }
19
+ ],
20
+ "metadata": {
21
+ "kernelspec": {
22
+ "display_name": "Python 3",
23
+ "language": "python",
24
+ "name": "python3"
25
+ },
26
+ "language_info": {
27
+ "name": "python",
28
+ "version": "3.10.11"
29
+ },
30
+ "orig_nbformat": 4
31
+ },
32
+ "nbformat": 4,
33
+ "nbformat_minor": 2
34
+ }
util.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import pandas as pd
4
+ import regex
5
+
6
def category_map():
    """Build the lookup table from arXiv subject tags to English names.

    Returns:
        A freshly built dict on every call. Keys are arXiv category tags
        (e.g. 'math.AP') and values are the matching English names. The
        table is not exhaustive: many categories have alias tags that are
        not listed, although a few are (e.g. both 'math.MP' and 'math-ph').
    """
    tag_to_name = {
        'astro-ph': 'Astrophysics',
        'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
        'astro-ph.EP': 'Earth and Planetary Astrophysics',
        'astro-ph.GA': 'Astrophysics of Galaxies',
        'astro-ph.HE': 'High Energy Astrophysical Phenomena',
        'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
        'astro-ph.SR': 'Solar and Stellar Astrophysics',
        'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
        'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
        'cond-mat.mtrl-sci': 'Materials Science',
        'cond-mat.other': 'Other Condensed Matter',
        'cond-mat.quant-gas': 'Quantum Gases',
        'cond-mat.soft': 'Soft Condensed Matter',
        'cond-mat.stat-mech': 'Statistical Mechanics',
        'cond-mat.str-el': 'Strongly Correlated Electrons',
        'cond-mat.supr-con': 'Superconductivity',
        'cond-mat': 'Condensed Matter',
        'cs.AI': 'Artificial Intelligence',
        'cs.AR': 'Hardware Architecture',
        'cs.CC': 'Computational Complexity',
        'cs.CE': 'Computational Engineering, Finance, and Science',
        'cs.CG': 'Computational Geometry',
        'cs.CL': 'Computation and Language',
        'cs.CR': 'Cryptography and Security',
        'cs.CV': 'Computer Vision and Pattern Recognition',
        'cs.CY': 'Computers and Society',
        'cs.DB': 'Databases',
        'cs.DC': 'Distributed, Parallel, and Cluster Computing',
        'cs.DL': 'Digital Libraries',
        'cs.DM': 'Discrete Mathematics',
        'cs.DS': 'Data Structures and Algorithms',
        'cs.ET': 'Emerging Technologies',
        'cs.FL': 'Formal Languages and Automata Theory',
        'cs.GL': 'General Literature',
        'cs.GR': 'Graphics',
        'cs.GT': 'Computer Science and Game Theory',
        'cs.HC': 'Human-Computer Interaction',
        'cs.IR': 'Information Retrieval',
        'cs.IT': 'Information Theory',
        'cs.LG': 'Machine Learning',
        'cs.LO': 'Logic in Computer Science',
        'cs.MA': 'Multiagent Systems',
        'cs.MM': 'Multimedia',
        'cs.MS': 'Mathematical Software',
        'cs.NA': 'Numerical Analysis',
        'cs.NE': 'Neural and Evolutionary Computing',
        'cs.NI': 'Networking and Internet Architecture',
        'cs.OH': 'Other Computer Science',
        'cs.OS': 'Operating Systems',
        'cs.PF': 'Performance',
        'cs.PL': 'Programming Languages',
        'cs.RO': 'Robotics',
        'cs.SC': 'Symbolic Computation',
        'cs.SD': 'Sound',
        'cs.SE': 'Software Engineering',
        'cs.SI': 'Social and Information Networks',
        'cs.SY': 'Systems and Control',
        'econ.EM': 'Econometrics',
        'econ.GN': 'General Economics',
        'econ.TH': 'Theoretical Economics',
        'eess.AS': 'Audio and Speech Processing',
        'eess.IV': 'Image and Video Processing',
        'eess.SP': 'Signal Processing',
        'eess.SY': 'Systems and Control',
        'dg-ga': 'Differential Geometry',
        'gr-qc': 'General Relativity and Quantum Cosmology',
        'hep-ex': 'High Energy Physics - Experiment',
        'hep-lat': 'High Energy Physics - Lattice',
        'hep-ph': 'High Energy Physics - Phenomenology',
        'hep-th': 'High Energy Physics - Theory',
        'math.AC': 'Commutative Algebra',
        'math.AG': 'Algebraic Geometry',
        'math.AP': 'Analysis of PDEs',
        'math.AT': 'Algebraic Topology',
        'math.CA': 'Classical Analysis and ODEs',
        'math.CO': 'Combinatorics',
        'math.CT': 'Category Theory',
        'math.CV': 'Complex Variables',
        'math.DG': 'Differential Geometry',
        'math.DS': 'Dynamical Systems',
        'math.FA': 'Functional Analysis',
        'math.GM': 'General Mathematics',
        'math.GN': 'General Topology',
        'math.GR': 'Group Theory',
        'math.GT': 'Geometric Topology',
        'math.HO': 'History and Overview',
        'math.IT': 'Information Theory',
        'math.KT': 'K-Theory and Homology',
        'math.LO': 'Logic',
        'math.MG': 'Metric Geometry',
        'math.MP': 'Mathematical Physics',
        'math.NA': 'Numerical Analysis',
        'math.NT': 'Number Theory',
        'math.OA': 'Operator Algebras',
        'math.OC': 'Optimization and Control',
        'math.PR': 'Probability',
        'math.QA': 'Quantum Algebra',
        'math.RA': 'Rings and Algebras',
        'math.RT': 'Representation Theory',
        'math.SG': 'Symplectic Geometry',
        'math.SP': 'Spectral Theory',
        'math.ST': 'Statistics Theory',
        'math-ph': 'Mathematical Physics',
        'funct-an': 'Functional Analysis',
        'alg-geom': 'Algebraic Geometry',
        'nlin.AO': 'Adaptation and Self-Organizing Systems',
        'chao-dyn': 'Chaotic Dynamics',
        'nlin.CD': 'Chaotic Dynamics',
        'nlin.CG': 'Cellular Automata and Lattice Gases',
        'nlin.PS': 'Pattern Formation and Solitons',
        'nlin.SI': 'Exactly Solvable and Integrable Systems',
        'nucl-ex': 'Nuclear Experiment',
        'nucl-th': 'Nuclear Theory',
        'physics.acc-ph': 'Accelerator Physics',
        'physics.ao-ph': 'Atmospheric and Oceanic Physics',
        'physics.app-ph': 'Applied Physics',
        'physics.atm-clus': 'Atomic and Molecular Clusters',
        'physics.atom-ph': 'Atomic Physics',
        'physics.bio-ph': 'Biological Physics',
        'physics.chem-ph': 'Chemical Physics',
        'physics.class-ph': 'Classical Physics',
        'physics.comp-ph': 'Computational Physics',
        'physics.data-an': 'Data Analysis, Statistics and Probability',
        'physics.ed-ph': 'Physics Education',
        'physics.flu-dyn': 'Fluid Dynamics',
        'physics.gen-ph': 'General Physics',
        'physics.geo-ph': 'Geophysics',
        'physics.hist-ph': 'History and Philosophy of Physics',
        'physics.ins-det': 'Instrumentation and Detectors',
        'physics.med-ph': 'Medical Physics',
        'physics.optics': 'Optics',
        'physics.plasm-ph': 'Plasma Physics',
        'physics.pop-ph': 'Popular Physics',
        'physics.soc-ph': 'Physics and Society',
        'physics.space-ph': 'Space Physics',
        'q-bio.BM': 'Biomolecules',
        'q-bio.CB': 'Cell Behavior',
        'q-bio.GN': 'Genomics',
        'q-bio.MN': 'Molecular Networks',
        'q-bio.NC': 'Neurons and Cognition',
        'q-bio.OT': 'Other Quantitative Biology',
        'q-bio.PE': 'Populations and Evolution',
        'q-bio.QM': 'Quantitative Methods',
        'q-bio.SC': 'Subcellular Processes',
        'q-bio.TO': 'Tissues and Organs',
        'q-fin.CP': 'Computational Finance',
        'q-fin.EC': 'Economics',
        'q-fin.GN': 'General Finance',
        'q-fin.MF': 'Mathematical Finance',
        'q-fin.PM': 'Portfolio Management',
        'q-fin.PR': 'Pricing of Securities',
        'q-fin.RM': 'Risk Management',
        'q-fin.ST': 'Statistical Finance',
        'q-fin.TR': 'Trading and Market Microstructure',
        'quant-ph': 'Quantum Physics',
        'q-alg': 'Quantum Algebra',
        'stat.AP': 'Applications',
        'stat.CO': 'Computation',
        'stat.ME': 'Methodology',
        'stat.ML': 'Machine Learning',
        'stat.OT': 'Other Statistics',
        'stat.TH': 'Statistics Theory',
    }
    return tag_to_name
176
+
177
+ ## 1. Latin-ize latex accents enclosed in brackets
178
def remove_latex_accents(string):
    r"""Latin-ize brace-enclosed LaTeX accents, e.g. ``\'{e}`` -> ``e``.

    An accent macro (one of ``' " ^ ` H ~ c k l = b d r u v t o i`` after a
    backslash) followed by a single braced letter is replaced by that bare
    letter. Both lowercase and uppercase base letters are handled, so
    ``\v{S}`` becomes ``S``. An optional backslash in front of the braced
    letter is accepted so that dotless forms such as ``\'{\i}`` collapse to
    ``i`` as well.

    Args:
        string: Text that may contain LaTeX accent markup.

    Returns:
        The text with every matching accent construct replaced by its base
        letter; anything else is left unchanged.
    """
    # Group 1 is the base letter; the accent macro, the braces and an optional
    # backslash before the letter (\i, \j) are all discarded.
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{\\?([a-zA-Z])\}'
    return regex.sub(accent, r'\1', string)
184
+
185
+ ## 2. Remove latex environments
186
def remove_env(string):
    r"""Delete LaTeX commands of the form ``\name{argument}``.

    A match is a backslash, two or more lowercase letters, and a non-empty
    braced argument that itself contains no braces. The whole match,
    including the argument text, is removed.

    Args:
        string: Text that may contain LaTeX commands.

    Returns:
        The text with every matching command deleted.
    """
    return regex.sub(r'\\[a-z]{2,}{[^{}]+?}', '', string)
191
+
192
+ ## 3. Latin-ize non-{} enclosed latex accents:
193
def remove_accents(string):
    r"""Latin-ize LaTeX accents whose base letter is not wrapped in braces.

    Three forms are handled, following TeX's rule that a control word is a
    backslash plus the longest run of letters after it:

    * symbol accents bind directly to the next letter: ``\'e`` -> ``e``,
      ``\"U`` -> ``U``;
    * letter-named accents (``H c k b d r u v t``) must be separated from
      the base letter by whitespace: ``\c c`` -> ``c``;
    * the standalone glyph macros ``\l``, ``\o`` and ``\i`` become ``l``,
      ``o`` and ``i`` when they are not the prefix of a longer control
      word; whitespace directly after them is dropped.

    Longer control words such as ``\beta``, ``\rho`` or ``\omega`` are left
    untouched instead of being truncated to ``eta``, ``ho`` or ``mega``.

    Args:
        string: Text that may contain LaTeX accent markup.

    Returns:
        The text with the recognised accent constructs replaced by their
        base letters.
    """
    # Glyph macros go first so that "\'\i" becomes "\'i", which the symbol
    # accent rule below then reduces to "i".
    glyph = r'\\([loi])(?![a-zA-Z])\s*'
    symbol_accent = r'\\[\'\"\^\`\~=]([a-zA-Z])'
    # "\cc" would be the control word "cc", not a cedilla, hence the \s+.
    letter_accent = r'\\[Hckbdruvt]\s+([a-zA-Z])'

    for pattern in (glyph, symbol_accent, letter_accent):
        string = regex.sub(pattern, r'\1', string)
    return string
199
+
200
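+ ## 4. Only replace LaTeX math that begins a whitespace-delimited 'word': the opening $ or $$ must be preceded by a whitespace character, and any non-space characters directly after the closing delimiter are replaced along with it.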
201
+
202
def remove_latex(string):
    """Replace whitespace-preceded LaTeX math with the token ' LATEX '.

    A match starts at a whitespace character followed by ``$`` or ``$$``,
    runs through the matching closing delimiter, and also takes any
    non-space characters glued to that delimiter. The leading whitespace is
    part of the match, so math at the very start of the string (with nothing
    before it) is not replaced.

    Args:
        string: Text that may contain ``$...$`` or ``$$...$$`` math.

    Returns:
        The text with each match replaced by ' LATEX ' (spaces included).
    """
    math_span = r'\s(\$\$?)[^\$]*?\1\S*'
    placeholder = ' LATEX '
    return regex.sub(math_span, placeholder, string)
206
+
207
+
208
def cleanse(string):
    """Run the full LaTeX clean-up pipeline over a piece of text.

    Newlines are replaced by spaces first. The text is then passed, in
    order, through remove_latex_accents, remove_env, remove_accents and
    remove_latex.

    Args:
        string: Raw text to clean.

    Returns:
        The cleaned text.
    """
    text = string.replace('\n', ' ')
    pipeline = (remove_latex_accents, remove_env, remove_accents, remove_latex)
    for step in pipeline:
        text = step(text)
    return text
215
+
216
+
217
def find_hyph(text):
    """Collect the distinct hyphenated words that occur in *text*.

    A hyphenated word is two or more runs of word characters joined by
    single hyphens (e.g. 'well-known', 'state-of-the-art'), neither preceded
    nor followed by a further hyphen.

    Args:
        text: The string to search.

    Returns:
        A list of the unique hyphenated words in order of first appearance,
        or None when there are none.
    """
    pattern = r'(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b'
    keywords = regex.findall(pattern, text)

    if not keywords:
        return None
    # dict.fromkeys dedupes while keeping first-seen order, so the result is
    # the same on every run (set ordering of strings varies between runs).
    return list(dict.fromkeys(keywords))