Michael-Geis committed on
Commit
cbdef5e
1 Parent(s): 3af1705

created methods for splitting categories into arxiv and msc tags

Browse files
Files changed (2) hide show
  1. data_cleaning.py +239 -199
  2. data_storage.py +29 -27
data_cleaning.py CHANGED
@@ -4,21 +4,24 @@ import json
4
  import sentence_transformers.util
5
  import os
6
 
 
7
  def main(raw_metadata_df, path_to_embeddings):
8
  clean_metadata_df = pd.DataFrame(
9
- columns=['sentences','authors','msc_tags','msc_cos_sim']
10
- )
11
 
12
  clean_title = raw_metadata_df.title.apply(cleanse)
13
  clean_abstract = raw_metadata_df.summary.apply(cleanse)
14
- clean_metadata_df.sentences = clean_title + ' ' + clean_abstract
15
  clean_metadata_df.authors = raw_metadata_df.authors
16
  clean_metadata_df.msc_tags = raw_metadata_df.categories.apply(cats_to_msc)
17
 
18
  return clean_metadata_df
19
 
 
20
  ##
21
 
 
22
  def category_map():
23
  """Maps arXiv subject categories to their full english names.
24
 
@@ -27,214 +30,253 @@ def category_map():
27
  Note that the list is not exhaustive in the sense that many categories have aliases that
28
  are not included. (Some are, e.g. math.MP and math-ph).
29
  """
30
- return {'astro-ph': 'Astrophysics',
31
- 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
32
- 'astro-ph.EP': 'Earth and Planetary Astrophysics',
33
- 'astro-ph.GA': 'Astrophysics of Galaxies',
34
- 'astro-ph.HE': 'High Energy Astrophysical Phenomena',
35
- 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
36
- 'astro-ph.SR': 'Solar and Stellar Astrophysics',
37
- 'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
38
- 'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
39
- 'cond-mat.mtrl-sci': 'Materials Science',
40
- 'cond-mat.other': 'Other Condensed Matter',
41
- 'cond-mat.quant-gas': 'Quantum Gases',
42
- 'cond-mat.soft': 'Soft Condensed Matter',
43
- 'cond-mat.stat-mech': 'Statistical Mechanics',
44
- 'cond-mat.str-el': 'Strongly Correlated Electrons',
45
- 'cond-mat.supr-con': 'Superconductivity',
46
- 'cond-mat': 'Condensed Matter',
47
- 'cs.AI': 'Artificial Intelligence',
48
- 'cs.AR': 'Hardware Architecture',
49
- 'cs.CC': 'Computational Complexity',
50
- 'cs.CE': 'Computational Engineering, Finance, and Science',
51
- 'cs.CG': 'Computational Geometry',
52
- 'cs.CL': 'Computation and Language',
53
- 'cs.CR': 'Cryptography and Security',
54
- 'cs.CV': 'Computer Vision and Pattern Recognition',
55
- 'cs.CY': 'Computers and Society',
56
- 'cs.DB': 'Databases',
57
- 'cs.DC': 'Distributed, Parallel, and Cluster Computing',
58
- 'cs.DL': 'Digital Libraries',
59
- 'cs.DM': 'Discrete Mathematics',
60
- 'cs.DS': 'Data Structures and Algorithms',
61
- 'cs.ET': 'Emerging Technologies',
62
- 'cs.FL': 'Formal Languages and Automata Theory',
63
- 'cs.GL': 'General Literature',
64
- 'cs.GR': 'Graphics',
65
- 'cs.GT': 'Computer Science and Game Theory',
66
- 'cs.HC': 'Human-Computer Interaction',
67
- 'cs.IR': 'Information Retrieval',
68
- 'cs.IT': 'Information Theory',
69
- 'cs.LG': 'Machine Learning',
70
- 'cs.LO': 'Logic in Computer Science',
71
- 'cs.MA': 'Multiagent Systems',
72
- 'cs.MM': 'Multimedia',
73
- 'cs.MS': 'Mathematical Software',
74
- 'cs.NA': 'Numerical Analysis',
75
- 'cs.NE': 'Neural and Evolutionary Computing',
76
- 'cs.NI': 'Networking and Internet Architecture',
77
- 'cs.OH': 'Other Computer Science',
78
- 'cs.OS': 'Operating Systems',
79
- 'cs.PF': 'Performance',
80
- 'cs.PL': 'Programming Languages',
81
- 'cs.RO': 'Robotics',
82
- 'cs.SC': 'Symbolic Computation',
83
- 'cs.SD': 'Sound',
84
- 'cs.SE': 'Software Engineering',
85
- 'cs.SI': 'Social and Information Networks',
86
- 'cs.SY': 'Systems and Control',
87
- 'econ.EM': 'Econometrics',
88
- 'econ.GN': 'General Economics',
89
- 'econ.TH': 'Theoretical Economics',
90
- 'eess.AS': 'Audio and Speech Processing',
91
- 'eess.IV': 'Image and Video Processing',
92
- 'eess.SP': 'Signal Processing',
93
- 'eess.SY': 'Systems and Control',
94
- 'dg-ga': 'Differential Geometry',
95
- 'gr-qc': 'General Relativity and Quantum Cosmology',
96
- 'hep-ex': 'High Energy Physics - Experiment',
97
- 'hep-lat': 'High Energy Physics - Lattice',
98
- 'hep-ph': 'High Energy Physics - Phenomenology',
99
- 'hep-th': 'High Energy Physics - Theory',
100
- 'math.AC': 'Commutative Algebra',
101
- 'math.AG': 'Algebraic Geometry',
102
- 'math.AP': 'Analysis of PDEs',
103
- 'math.AT': 'Algebraic Topology',
104
- 'math.CA': 'Classical Analysis and ODEs',
105
- 'math.CO': 'Combinatorics',
106
- 'math.CT': 'Category Theory',
107
- 'math.CV': 'Complex Variables',
108
- 'math.DG': 'Differential Geometry',
109
- 'math.DS': 'Dynamical Systems',
110
- 'math.FA': 'Functional Analysis',
111
- 'math.GM': 'General Mathematics',
112
- 'math.GN': 'General Topology',
113
- 'math.GR': 'Group Theory',
114
- 'math.GT': 'Geometric Topology',
115
- 'math.HO': 'History and Overview',
116
- 'math.IT': 'Information Theory',
117
- 'math.KT': 'K-Theory and Homology',
118
- 'math.LO': 'Logic',
119
- 'math.MG': 'Metric Geometry',
120
- 'math.MP': 'Mathematical Physics',
121
- 'math.NA': 'Numerical Analysis',
122
- 'math.NT': 'Number Theory',
123
- 'math.OA': 'Operator Algebras',
124
- 'math.OC': 'Optimization and Control',
125
- 'math.PR': 'Probability',
126
- 'math.QA': 'Quantum Algebra',
127
- 'math.RA': 'Rings and Algebras',
128
- 'math.RT': 'Representation Theory',
129
- 'math.SG': 'Symplectic Geometry',
130
- 'math.SP': 'Spectral Theory',
131
- 'math.ST': 'Statistics Theory',
132
- 'math-ph': 'Mathematical Physics',
133
- 'funct-an': 'Functional Analysis',
134
- 'alg-geom': 'Algebraic Geometry',
135
- 'nlin.AO': 'Adaptation and Self-Organizing Systems',
136
- 'chao-dyn': 'Chaotic Dynamics',
137
- 'nlin.CD': 'Chaotic Dynamics',
138
- 'nlin.CG': 'Cellular Automata and Lattice Gases',
139
- 'nlin.PS': 'Pattern Formation and Solitons',
140
- 'nlin.SI': 'Exactly Solvable and Integrable Systems',
141
- 'nucl-ex': 'Nuclear Experiment',
142
- 'nucl-th': 'Nuclear Theory',
143
- 'physics.acc-ph': 'Accelerator Physics',
144
- 'physics.ao-ph': 'Atmospheric and Oceanic Physics',
145
- 'physics.app-ph': 'Applied Physics',
146
- 'physics.atm-clus': 'Atomic and Molecular Clusters',
147
- 'physics.atom-ph': 'Atomic Physics',
148
- 'physics.bio-ph': 'Biological Physics',
149
- 'physics.chem-ph': 'Chemical Physics',
150
- 'physics.class-ph': 'Classical Physics',
151
- 'physics.comp-ph': 'Computational Physics',
152
- 'physics.data-an': 'Data Analysis, Statistics and Probability',
153
- 'physics.ed-ph': 'Physics Education',
154
- 'physics.flu-dyn': 'Fluid Dynamics',
155
- 'physics.gen-ph': 'General Physics',
156
- 'physics.geo-ph': 'Geophysics',
157
- 'physics.hist-ph': 'History and Philosophy of Physics',
158
- 'physics.ins-det': 'Instrumentation and Detectors',
159
- 'physics.med-ph': 'Medical Physics',
160
- 'physics.optics': 'Optics',
161
- 'physics.plasm-ph': 'Plasma Physics',
162
- 'physics.pop-ph': 'Popular Physics',
163
- 'physics.soc-ph': 'Physics and Society',
164
- 'physics.space-ph': 'Space Physics',
165
- 'q-bio.BM': 'Biomolecules',
166
- 'q-bio.CB': 'Cell Behavior',
167
- 'q-bio.GN': 'Genomics',
168
- 'q-bio.MN': 'Molecular Networks',
169
- 'q-bio.NC': 'Neurons and Cognition',
170
- 'q-bio.OT': 'Other Quantitative Biology',
171
- 'q-bio.PE': 'Populations and Evolution',
172
- 'q-bio.QM': 'Quantitative Methods',
173
- 'q-bio.SC': 'Subcellular Processes',
174
- 'q-bio.TO': 'Tissues and Organs',
175
- 'q-fin.CP': 'Computational Finance',
176
- 'q-fin.EC': 'Economics',
177
- 'q-fin.GN': 'General Finance',
178
- 'q-fin.MF': 'Mathematical Finance',
179
- 'q-fin.PM': 'Portfolio Management',
180
- 'q-fin.PR': 'Pricing of Securities',
181
- 'q-fin.RM': 'Risk Management',
182
- 'q-fin.ST': 'Statistical Finance',
183
- 'q-fin.TR': 'Trading and Market Microstructure',
184
- 'quant-ph': 'Quantum Physics',
185
- 'q-alg' : 'Quantum Algebra',
186
- 'stat.AP': 'Applications',
187
- 'stat.CO': 'Computation',
188
- 'stat.ME': 'Methodology',
189
- 'stat.ML': 'Machine Learning',
190
- 'stat.OT': 'Other Statistics',
191
- 'stat.TH': 'Statistics Theory'}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
 
194
  ## 1. Latin-ize latex accents enclosed in brackets
195
  def remove_latex_accents(string):
196
- accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
197
- replacement = r'\1'
198
 
199
- string = regex.sub(accent,replacement, string)
200
  return string
201
 
 
202
  ## 2. Remove latex environments
203
  def remove_env(string):
204
- env = r'\\[a-z]{2,}{[^{}]+?}'
205
 
206
- string = regex.sub(env,'',string)
207
  return string
208
 
 
209
  ## 3. Latin-ize non-{} enclosed latex accents:
210
  def remove_accents(string):
211
- accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])'
212
- replacement = r'\1'
 
 
 
213
 
214
- string = regex.sub(accent,replacement,string)
215
- return string
216
 
217
  ## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
218
 
 
219
  def remove_latex(string):
220
- latex = r'\s(\$\$?)[^\$]*?\1\S*'
221
- string = regex.sub(latex,' LATEX ',string)
222
- return string
223
 
224
 
225
  def cleanse(string):
226
- string = string.replace('\n',' ')
227
  string = remove_latex_accents(string)
228
  string = remove_env(string)
229
  string = remove_accents(string)
230
  string = remove_latex(string)
231
  return string
232
 
233
- ##
 
 
234
 
235
  def find_hyph(text):
236
- pattern = r'(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b'
237
- keywords = regex.findall(pattern,text)
238
 
239
  if keywords == []:
240
  return None
@@ -242,17 +284,14 @@ def find_hyph(text):
242
  return list(set(keywords))
243
 
244
 
245
- def find_msc(cat_list):
246
- pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
247
- out = []
248
- for cat in cat_list:
249
- tags = regex.findall(pattern,cat)
250
- for tag in tags:
251
- out.append(tag)
252
- return out
253
 
254
  def msc_tags():
255
- with open('./data/msc.json','r') as file:
256
  text = file.read()
257
  return json.loads(text)
258
 
@@ -268,33 +307,34 @@ def cats_to_msc(cat_list):
268
  return None
269
  else:
270
  return out
271
-
272
 
273
  ##
274
 
 
275
  def msc_encoded_dict():
276
- encoded_tags = pd.read_parquet('./data/msc_mini_embeddings.parquet').to_numpy()
277
- return {k : v for (k,v) in zip(msc_tags().values(), encoded_tags)}
 
278
 
279
  def doc_encoded_dict():
280
- library_embeddings = pd.read_parquet('./data/APSP_mini_vec.parquet')
281
 
282
  docs = library_embeddings.docs.to_list()
283
  encoded_docs = library_embeddings.vecs.to_numpy()
284
 
285
- return {k : v for (k,v) in zip(docs , encoded_docs)}
286
 
287
- def score_tags(processed_arxiv_row):
288
 
 
289
  tag_list = processed_arxiv_row.msc_tags
290
  title_plus_abstract = processed_arxiv_row.docs
291
 
292
  if tag_list is None:
293
  return None
294
  embedded_msc_tags = [msc_encoded_dict()[tag] for tag in tag_list]
295
-
296
  return sentence_transformers.util.semantic_search(
297
  query_embeddings=doc_encoded_dict()[title_plus_abstract],
298
  corpus_embeddings=embedded_msc_tags,
299
- )[0]
300
-
 
4
  import sentence_transformers.util
5
  import os
6
 
7
+
8
  def main(raw_metadata_df, path_to_embeddings):
9
  clean_metadata_df = pd.DataFrame(
10
+ columns=["sentences", "authors", "msc_tags", "msc_cos_sim"]
11
+ )
12
 
13
  clean_title = raw_metadata_df.title.apply(cleanse)
14
  clean_abstract = raw_metadata_df.summary.apply(cleanse)
15
+ clean_metadata_df.sentences = clean_title + " " + clean_abstract
16
  clean_metadata_df.authors = raw_metadata_df.authors
17
  clean_metadata_df.msc_tags = raw_metadata_df.categories.apply(cats_to_msc)
18
 
19
  return clean_metadata_df
20
 
21
+
22
  ##
23
 
24
+
25
  def category_map():
26
  """Maps arXiv subject categories to their full english names.
27
 
 
30
  Note that the list is not exhaustive in the sense that many categories have aliases that
31
  are not included. (Some are, e.g. math.MP and math-ph).
32
  """
33
+ return {
34
+ "astro-ph": "Astrophysics",
35
+ "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
36
+ "astro-ph.EP": "Earth and Planetary Astrophysics",
37
+ "astro-ph.GA": "Astrophysics of Galaxies",
38
+ "astro-ph.HE": "High Energy Astrophysical Phenomena",
39
+ "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
40
+ "astro-ph.SR": "Solar and Stellar Astrophysics",
41
+ "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
42
+ "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
43
+ "cond-mat.mtrl-sci": "Materials Science",
44
+ "cond-mat.other": "Other Condensed Matter",
45
+ "cond-mat.quant-gas": "Quantum Gases",
46
+ "cond-mat.soft": "Soft Condensed Matter",
47
+ "cond-mat.stat-mech": "Statistical Mechanics",
48
+ "cond-mat.str-el": "Strongly Correlated Electrons",
49
+ "cond-mat.supr-con": "Superconductivity",
50
+ "cond-mat": "Condensed Matter",
51
+ "cs.AI": "Artificial Intelligence",
52
+ "cs.AR": "Hardware Architecture",
53
+ "cs.CC": "Computational Complexity",
54
+ "cs.CE": "Computational Engineering, Finance, and Science",
55
+ "cs.CG": "Computational Geometry",
56
+ "cs.CL": "Computation and Language",
57
+ "cs.CR": "Cryptography and Security",
58
+ "cs.CV": "Computer Vision and Pattern Recognition",
59
+ "cs.CY": "Computers and Society",
60
+ "cs.DB": "Databases",
61
+ "cs.DC": "Distributed, Parallel, and Cluster Computing",
62
+ "cs.DL": "Digital Libraries",
63
+ "cs.DM": "Discrete Mathematics",
64
+ "cs.DS": "Data Structures and Algorithms",
65
+ "cs.ET": "Emerging Technologies",
66
+ "cs.FL": "Formal Languages and Automata Theory",
67
+ "cs.GL": "General Literature",
68
+ "cs.GR": "Graphics",
69
+ "cs.GT": "Computer Science and Game Theory",
70
+ "cs.HC": "Human-Computer Interaction",
71
+ "cs.IR": "Information Retrieval",
72
+ "cs.IT": "Information Theory",
73
+ "cs.LG": "Machine Learning",
74
+ "cs.LO": "Logic in Computer Science",
75
+ "cs.MA": "Multiagent Systems",
76
+ "cs.MM": "Multimedia",
77
+ "cs.MS": "Mathematical Software",
78
+ "cs.NA": "Numerical Analysis",
79
+ "cs.NE": "Neural and Evolutionary Computing",
80
+ "cs.NI": "Networking and Internet Architecture",
81
+ "cs.OH": "Other Computer Science",
82
+ "cs.OS": "Operating Systems",
83
+ "cs.PF": "Performance",
84
+ "cs.PL": "Programming Languages",
85
+ "cs.RO": "Robotics",
86
+ "cs.SC": "Symbolic Computation",
87
+ "cs.SD": "Sound",
88
+ "cs.SE": "Software Engineering",
89
+ "cs.SI": "Social and Information Networks",
90
+ "cs.SY": "Systems and Control",
91
+ "econ.EM": "Econometrics",
92
+ "econ.GN": "General Economics",
93
+ "econ.TH": "Theoretical Economics",
94
+ "eess.AS": "Audio and Speech Processing",
95
+ "eess.IV": "Image and Video Processing",
96
+ "eess.SP": "Signal Processing",
97
+ "eess.SY": "Systems and Control",
98
+ "dg-ga": "Differential Geometry",
99
+ "gr-qc": "General Relativity and Quantum Cosmology",
100
+ "hep-ex": "High Energy Physics - Experiment",
101
+ "hep-lat": "High Energy Physics - Lattice",
102
+ "hep-ph": "High Energy Physics - Phenomenology",
103
+ "hep-th": "High Energy Physics - Theory",
104
+ "math.AC": "Commutative Algebra",
105
+ "math.AG": "Algebraic Geometry",
106
+ "math.AP": "Analysis of PDEs",
107
+ "math.AT": "Algebraic Topology",
108
+ "math.CA": "Classical Analysis and ODEs",
109
+ "math.CO": "Combinatorics",
110
+ "math.CT": "Category Theory",
111
+ "math.CV": "Complex Variables",
112
+ "math.DG": "Differential Geometry",
113
+ "math.DS": "Dynamical Systems",
114
+ "math.FA": "Functional Analysis",
115
+ "math.GM": "General Mathematics",
116
+ "math.GN": "General Topology",
117
+ "math.GR": "Group Theory",
118
+ "math.GT": "Geometric Topology",
119
+ "math.HO": "History and Overview",
120
+ "math.IT": "Information Theory",
121
+ "math.KT": "K-Theory and Homology",
122
+ "math.LO": "Logic",
123
+ "math.MG": "Metric Geometry",
124
+ "math.MP": "Mathematical Physics",
125
+ "math.NA": "Numerical Analysis",
126
+ "math.NT": "Number Theory",
127
+ "math.OA": "Operator Algebras",
128
+ "math.OC": "Optimization and Control",
129
+ "math.PR": "Probability",
130
+ "math.QA": "Quantum Algebra",
131
+ "math.RA": "Rings and Algebras",
132
+ "math.RT": "Representation Theory",
133
+ "math.SG": "Symplectic Geometry",
134
+ "math.SP": "Spectral Theory",
135
+ "math.ST": "Statistics Theory",
136
+ "math-ph": "Mathematical Physics",
137
+ "funct-an": "Functional Analysis",
138
+ "alg-geom": "Algebraic Geometry",
139
+ "nlin.AO": "Adaptation and Self-Organizing Systems",
140
+ "chao-dyn": "Chaotic Dynamics",
141
+ "nlin.CD": "Chaotic Dynamics",
142
+ "nlin.CG": "Cellular Automata and Lattice Gases",
143
+ "nlin.PS": "Pattern Formation and Solitons",
144
+ "nlin.SI": "Exactly Solvable and Integrable Systems",
145
+ "nucl-ex": "Nuclear Experiment",
146
+ "nucl-th": "Nuclear Theory",
147
+ "physics.acc-ph": "Accelerator Physics",
148
+ "physics.ao-ph": "Atmospheric and Oceanic Physics",
149
+ "physics.app-ph": "Applied Physics",
150
+ "physics.atm-clus": "Atomic and Molecular Clusters",
151
+ "physics.atom-ph": "Atomic Physics",
152
+ "physics.bio-ph": "Biological Physics",
153
+ "physics.chem-ph": "Chemical Physics",
154
+ "physics.class-ph": "Classical Physics",
155
+ "physics.comp-ph": "Computational Physics",
156
+ "physics.data-an": "Data Analysis, Statistics and Probability",
157
+ "physics.ed-ph": "Physics Education",
158
+ "physics.flu-dyn": "Fluid Dynamics",
159
+ "physics.gen-ph": "General Physics",
160
+ "physics.geo-ph": "Geophysics",
161
+ "physics.hist-ph": "History and Philosophy of Physics",
162
+ "physics.ins-det": "Instrumentation and Detectors",
163
+ "physics.med-ph": "Medical Physics",
164
+ "physics.optics": "Optics",
165
+ "physics.plasm-ph": "Plasma Physics",
166
+ "physics.pop-ph": "Popular Physics",
167
+ "physics.soc-ph": "Physics and Society",
168
+ "physics.space-ph": "Space Physics",
169
+ "q-bio.BM": "Biomolecules",
170
+ "q-bio.CB": "Cell Behavior",
171
+ "q-bio.GN": "Genomics",
172
+ "q-bio.MN": "Molecular Networks",
173
+ "q-bio.NC": "Neurons and Cognition",
174
+ "q-bio.OT": "Other Quantitative Biology",
175
+ "q-bio.PE": "Populations and Evolution",
176
+ "q-bio.QM": "Quantitative Methods",
177
+ "q-bio.SC": "Subcellular Processes",
178
+ "q-bio.TO": "Tissues and Organs",
179
+ "q-fin.CP": "Computational Finance",
180
+ "q-fin.EC": "Economics",
181
+ "q-fin.GN": "General Finance",
182
+ "q-fin.MF": "Mathematical Finance",
183
+ "q-fin.PM": "Portfolio Management",
184
+ "q-fin.PR": "Pricing of Securities",
185
+ "q-fin.RM": "Risk Management",
186
+ "q-fin.ST": "Statistical Finance",
187
+ "q-fin.TR": "Trading and Market Microstructure",
188
+ "quant-ph": "Quantum Physics",
189
+ "q-alg": "Quantum Algebra",
190
+ "stat.AP": "Applications",
191
+ "stat.CO": "Computation",
192
+ "stat.ME": "Methodology",
193
+ "stat.ML": "Machine Learning",
194
+ "stat.OT": "Other Statistics",
195
+ "stat.TH": "Statistics Theory",
196
+ }
197
+
198
+
199
def split_categories_by_row(raw_metadata_row):
    """Split one paper's raw category list into arXiv subjects and MSC tags.

    If every entry of ``raw_metadata_row.categories`` is a key of
    ``category_map()``, the row carries no MSC information and the categories
    are returned unchanged. Otherwise the last entry is assumed to be the
    string holding the MSC classifications (NOTE(review): this layout is an
    assumption about the arXiv query results; confirm against real data), it
    is parsed with ``find_msc`` and every entry before it is returned as the
    arXiv subject list.

    Args:
        raw_metadata_row: row of a dataframe returned by an arXiv query
            request; only its ``categories`` attribute is read.

    Returns:
        (x, y): x is the list of arXiv subjects; y is the list of MSC tags, or
        None when all categories are recognised arXiv subjects.
    """
    categories = raw_metadata_row.categories
    known_subjects = category_map()

    if all(category in known_subjects for category in categories):
        return (categories, None)

    msc_tags = find_msc(categories[-1])
    # Drop only the trailing MSC string. The previous slice ([:-2]) also
    # discarded the arXiv subject that sits directly before it.
    return (categories[:-1], msc_tags)
218
+
219
+
220
def extract_tags(raw_metadata, arxiv_tag):
    """Extract one kind of tag list for every paper in a raw metadata table.

    Args:
        raw_metadata: DataFrame of raw query results; each row must expose the
            ``categories`` attribute read by ``split_categories_by_row``.
        arxiv_tag: when truthy, return the arXiv subject lists; otherwise
            return the MSC tag lists.

    Returns:
        A Series with one entry per row of ``raw_metadata``: the first slot of
        the ``split_categories_by_row`` tuple when ``arxiv_tag`` is truthy,
        the second slot otherwise.
    """
    # axis=1 hands each *row* to the splitter. axis=0 would pass whole columns,
    # which have no per-paper ``categories`` attribute.
    split_categories = raw_metadata.apply(split_categories_by_row, axis=1)

    slot = 0 if arxiv_tag else 1
    return split_categories.apply(lambda pair: pair[slot])
228
 
229
 
230
  ## 1. Latin-ize latex accents whose letter is enclosed in curly braces, e.g. \"{o} -> o
231
  def remove_latex_accents(string):
232
+ accent = r"\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}"
233
+ replacement = r"\1"
234
 
235
+ string = regex.sub(accent, replacement, string)
236
  return string
237
 
238
+
239
  ## 2. Remove latex commands together with their braced argument, e.g. \cite{...} or \begin{...}
240
  def remove_env(string):
241
+ env = r"\\[a-z]{2,}{[^{}]+?}"
242
 
243
+ string = regex.sub(env, "", string)
244
  return string
245
 
246
+
247
  ## 3. Latin-ize non-{} enclosed latex accents:
248
  def remove_accents(string):
249
+ accent = r"\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])"
250
+ replacement = r"\1"
251
+
252
+ string = regex.sub(accent, replacement, string)
253
+ return string
254
 
 
 
255
 
256
  ## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
257
 
258
+
259
  def remove_latex(string):
260
+ latex = r"\s(\$\$?)[^\$]*?\1\S*"
261
+ string = regex.sub(latex, " LATEX ", string)
262
+ return string
263
 
264
 
265
  def cleanse(string):
266
+ string = string.replace("\n", " ")
267
  string = remove_latex_accents(string)
268
  string = remove_env(string)
269
  string = remove_accents(string)
270
  string = remove_latex(string)
271
  return string
272
 
273
+
274
+ ##
275
+
276
 
277
  def find_hyph(text):
278
+ pattern = r"(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b"
279
+ keywords = regex.findall(pattern, text)
280
 
281
  if keywords == []:
282
  return None
 
284
  return list(set(keywords))
285
 
286
 
287
def find_msc(msc_string):
    """Return every MSC-style code found in a string.

    A code is a standalone word of exactly five characters: two digits
    followed by three letters or digits (for example ``35J60``).

    Args:
        msc_string: text to scan.

    Returns:
        List of the matched codes in order of appearance; empty when none
        are found.
    """
    return regex.findall(r"\b\d{2}[0-9a-zA-Z]{3}\b", msc_string)
291
+
 
 
 
292
 
293
  def msc_tags():
294
+ with open("./data/msc.json", "r") as file:
295
  text = file.read()
296
  return json.loads(text)
297
 
 
307
  return None
308
  else:
309
  return out
310
+
311
 
312
  ##
313
 
314
+
315
  def msc_encoded_dict():
316
+ encoded_tags = pd.read_parquet("./data/msc_mini_embeddings.parquet").to_numpy()
317
+ return {k: v for (k, v) in zip(msc_tags().values(), encoded_tags)}
318
+
319
 
320
  def doc_encoded_dict():
321
+ library_embeddings = pd.read_parquet("./data/APSP_mini_vec.parquet")
322
 
323
  docs = library_embeddings.docs.to_list()
324
  encoded_docs = library_embeddings.vecs.to_numpy()
325
 
326
+ return {k: v for (k, v) in zip(docs, encoded_docs)}
327
 
 
328
 
329
def score_tags(processed_arxiv_row):
    """Score a paper's MSC tags against the embedding of its text.

    Args:
        processed_arxiv_row: row exposing ``msc_tags`` (list of keys of
            ``msc_encoded_dict()``, or None) and ``docs`` (a key of
            ``doc_encoded_dict()``).

    Returns:
        None when the row has no MSC tags; otherwise the first element of the
        result of ``sentence_transformers.util.semantic_search`` run with the
        document embedding as query and the tag embeddings as corpus
        (presumably the hit list for that single query; confirm against the
        library documentation).

    Raises:
        KeyError: if a tag or the document is missing from its lookup table.
    """
    tag_list = processed_arxiv_row.msc_tags
    title_plus_abstract = processed_arxiv_row.docs

    if tag_list is None:
        return None

    # Build the tag lookup once. msc_encoded_dict() re-reads its data files on
    # every call, so calling it per tag repeated that I/O for each tag.
    tag_embeddings = msc_encoded_dict()
    embedded_msc_tags = [tag_embeddings[tag] for tag in tag_list]

    return sentence_transformers.util.semantic_search(
        query_embeddings=doc_encoded_dict()[title_plus_abstract],
        corpus_embeddings=embedded_msc_tags,
    )[0]
 
data_storage.py CHANGED
@@ -2,38 +2,31 @@ import arxiv
2
  import pandas as pd
3
  import data_cleaning as clean
4
  from sklearn.preprocessing import MultiLabelBinarizer
 
5
 
6
 
7
  class ArXivData:
8
  """A light class for storing the metadata of a collection of arXiv papers."""
9
 
10
  def __init__(self):
11
- """
12
- data: dataframe holding the metadata. Each row represents a paper and each column is
13
- a separate piece of metadata.
14
-
15
- query: A tuple of the form (query_string,max_results) where query_string is the formatted
16
- string that produced the raw data and max_results is the value of that parameter passed to the
17
- arXiv API.
18
-
19
- raw: The original, raw dataset as returned by the arXiv API, if current data is clean.
20
-
21
- cats: A DataFrame containing one-hot-encoded categories of the self.data DataFrame.
22
- """
23
 
24
- self.data = None
25
- self.query = None
26
- self.categories = None
27
 
28
- def load_from_file():
29
- pass
30
 
31
- def load_from_query(self, query_string, max_results, offset):
32
- self.data = query_to_df(
33
  query=query_string, max_results=max_results, offset=offset
34
  )
35
- self.query = (query_string, max_results)
36
- # self.categories = self.get_OHE_cats()
 
37
 
38
  def clean(self, dataset):
39
  """Constructs this dataset by cleaning another one.
@@ -46,11 +39,14 @@ class ArXivData:
46
  self.raw = dataset.raw
47
  self.categories = dataset.categories
48
 
49
- def get_OHE_cats(self):
50
  mlb = MultiLabelBinarizer()
51
- OHE_category_array = mlb.fit_transform(self.data.categories)
52
- return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
53
- mapper=clean.category_map()
 
 
 
54
  )
55
 
56
 
@@ -117,6 +113,12 @@ def query_to_df(query, max_results, offset):
117
  for result in results
118
  )
119
 
120
- metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns, index=index)
 
 
 
 
 
 
121
 
122
- return metadata_dataframe
 
2
  import pandas as pd
3
  import data_cleaning as clean
4
  from sklearn.preprocessing import MultiLabelBinarizer
5
+ import os
6
 
7
 
8
  class ArXivData:
9
  """A light class for storing the metadata of a collection of arXiv papers."""
10
 
11
  def __init__(self):
12
+ self.metadata = None
13
+ self.arxiv_subjects = None
14
+ self._returned_metadata = None
 
 
 
 
 
 
 
 
 
15
 
16
def load_from_file(self, dataset_file_name, path_to_data_dir):
    """Populate this object from a feather file on disk.

    Args:
        dataset_file_name: name of the feather file.
        path_to_data_dir: directory that contains the file.

    Side effects:
        Sets ``_returned_metadata`` to the table as read, ``arxiv_subjects``
        to its one-hot encoded subjects, and ``metadata`` to the table
        without its ``arxiv_subjects`` column.
    """
    loaded = pd.read_feather(os.path.join(path_to_data_dir, dataset_file_name))
    self._returned_metadata = loaded

    self.arxiv_subjects = self.get_OHE_arxiv_subjects(loaded)
    self.metadata = loaded.drop(columns=["arxiv_subjects"])
22
 
23
def load_from_query(self, query_string, max_results, offset=0):
    """Populate this object from the result of an arXiv query.

    Args:
        query_string: query passed to ``query_to_df``.
        max_results: maximum number of results, passed to ``query_to_df``.
        offset: result offset, passed to ``query_to_df``; defaults to 0.

    Side effects:
        Sets ``_returned_metadata`` to the table returned by the query,
        ``metadata`` to that table without its ``arxiv_subjects`` column, and
        ``arxiv_subjects`` to the one-hot encoded subjects.
    """
    fetched = query_to_df(
        query=query_string, max_results=max_results, offset=offset
    )
    self._returned_metadata = fetched

    self.metadata = fetched.drop(columns="arxiv_subjects")
    self.arxiv_subjects = self.get_OHE_arxiv_subjects(fetched)
30
 
31
  def clean(self, dataset):
32
  """Constructs this dataset by cleaning another one.
 
39
  self.raw = dataset.raw
40
  self.categories = dataset.categories
41
 
42
@staticmethod
def get_OHE_arxiv_subjects(returned_metadata):
    """One-hot encode the arXiv subject lists of a metadata table.

    Declared as a static method: the function takes no ``self``, yet the
    loaders call it as ``self.get_OHE_arxiv_subjects(table)``. Without the
    decorator that call passes the instance as an extra positional argument
    and raises TypeError.

    Args:
        returned_metadata: table whose ``arxiv_subjects`` column holds one
            iterable of subject tags per paper.

    Returns:
        DataFrame with one row per paper and one 0/1 column per subject tag
        seen in the data. Columns whose tag is a key of
        ``clean.category_map()`` are renamed to the mapped full name; other
        columns keep the raw tag.
    """
    mlb = MultiLabelBinarizer()

    OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
    arxiv_subject_labels = clean.category_map()

    return pd.DataFrame(OHE_arxiv_subjects_array, columns=mlb.classes_).rename(
        columns=arxiv_subject_labels
    )
51
 
52
 
 
113
  for result in results
114
  )
115
 
116
+ raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
117
+
118
+ returned_metadata = raw_metadata.copy().drop(columns=["categories"])
119
+ returned_metadata["arxiv_subjects"] = clean.extract_tags(
120
+ raw_metadata, arxiv_tag=True
121
+ )
122
+ returned_metadata["msc_tags"] = clean.extract_tags(raw_metadata, arxiv_tag=False)
123
 
124
+ return returned_metadata