Spaces: Runtime error
Michael-Geis committed • Commit cbdef5e
1 Parent(s): 3af1705
created methods for splitting categories into arxiv and msc tags
Browse files:
- data_cleaning.py +239 -199
- data_storage.py +29 -27
data_cleaning.py
CHANGED
@@ -4,21 +4,24 @@ import json
 import sentence_transformers.util
 import os
 
+
 def main(raw_metadata_df, path_to_embeddings):
     clean_metadata_df = pd.DataFrame(
-        columns=[
+        columns=["sentences", "authors", "msc_tags", "msc_cos_sim"]
+    )
 
     clean_title = raw_metadata_df.title.apply(cleanse)
     clean_abstract = raw_metadata_df.summary.apply(cleanse)
-    clean_metadata_df.sentences = clean_title +
+    clean_metadata_df.sentences = clean_title + " " + clean_abstract
     clean_metadata_df.authors = raw_metadata_df.authors
     clean_metadata_df.msc_tags = raw_metadata_df.categories.apply(cats_to_msc)
 
     return clean_metadata_df
 
+
 ##
 
+
 def category_map():
     """Maps arXiv subject categories to their full english names.
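As an aside (toy data, not part of the commit): the new column wiring in main() amounts to declaring the schema up front and letting pandas adopt the index of the first Series assigned to a declared column.

import pandas as pd

# Declare the output schema up front, as main() does.
clean_metadata_df = pd.DataFrame(
    columns=["sentences", "authors", "msc_tags", "msc_cos_sim"]
)

# Stand-ins for the cleansed title and abstract (toy values).
clean_title = pd.Series(["on the spectrum of the laplacian"])
clean_abstract = pd.Series(["we study LATEX on compact manifolds."])

# Assigning to a declared column of the empty frame adopts the Series index.
clean_metadata_df.sentences = clean_title + " " + clean_abstract
print(clean_metadata_df.sentences[0])
# on the spectrum of the laplacian we study LATEX on compact manifolds.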
@@ -27,214 +30,253 @@ def category_map():
     Note that the list is not exhaustive in the sense that many categories have aliases that
     are not included. (Some are, e.g. math.MP and math-ph).
     """
-    return {
+    return {
+        "astro-ph": "Astrophysics",
+        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+        "astro-ph.EP": "Earth and Planetary Astrophysics",
+        "astro-ph.GA": "Astrophysics of Galaxies",
+        "astro-ph.HE": "High Energy Astrophysical Phenomena",
+        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+        "astro-ph.SR": "Solar and Stellar Astrophysics",
+        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+        "cond-mat.mtrl-sci": "Materials Science",
+        "cond-mat.other": "Other Condensed Matter",
+        "cond-mat.quant-gas": "Quantum Gases",
+        "cond-mat.soft": "Soft Condensed Matter",
+        "cond-mat.stat-mech": "Statistical Mechanics",
+        "cond-mat.str-el": "Strongly Correlated Electrons",
+        "cond-mat.supr-con": "Superconductivity",
+        "cond-mat": "Condensed Matter",
+        "cs.AI": "Artificial Intelligence",
+        "cs.AR": "Hardware Architecture",
+        "cs.CC": "Computational Complexity",
+        "cs.CE": "Computational Engineering, Finance, and Science",
+        "cs.CG": "Computational Geometry",
+        "cs.CL": "Computation and Language",
+        "cs.CR": "Cryptography and Security",
+        "cs.CV": "Computer Vision and Pattern Recognition",
+        "cs.CY": "Computers and Society",
+        "cs.DB": "Databases",
+        "cs.DC": "Distributed, Parallel, and Cluster Computing",
+        "cs.DL": "Digital Libraries",
+        "cs.DM": "Discrete Mathematics",
+        "cs.DS": "Data Structures and Algorithms",
+        "cs.ET": "Emerging Technologies",
+        "cs.FL": "Formal Languages and Automata Theory",
+        "cs.GL": "General Literature",
+        "cs.GR": "Graphics",
+        "cs.GT": "Computer Science and Game Theory",
+        "cs.HC": "Human-Computer Interaction",
+        "cs.IR": "Information Retrieval",
+        "cs.IT": "Information Theory",
+        "cs.LG": "Machine Learning",
+        "cs.LO": "Logic in Computer Science",
+        "cs.MA": "Multiagent Systems",
+        "cs.MM": "Multimedia",
+        "cs.MS": "Mathematical Software",
+        "cs.NA": "Numerical Analysis",
+        "cs.NE": "Neural and Evolutionary Computing",
+        "cs.NI": "Networking and Internet Architecture",
+        "cs.OH": "Other Computer Science",
+        "cs.OS": "Operating Systems",
+        "cs.PF": "Performance",
+        "cs.PL": "Programming Languages",
+        "cs.RO": "Robotics",
+        "cs.SC": "Symbolic Computation",
+        "cs.SD": "Sound",
+        "cs.SE": "Software Engineering",
+        "cs.SI": "Social and Information Networks",
+        "cs.SY": "Systems and Control",
+        "econ.EM": "Econometrics",
+        "econ.GN": "General Economics",
+        "econ.TH": "Theoretical Economics",
+        "eess.AS": "Audio and Speech Processing",
+        "eess.IV": "Image and Video Processing",
+        "eess.SP": "Signal Processing",
+        "eess.SY": "Systems and Control",
+        "dg-ga": "Differential Geometry",
+        "gr-qc": "General Relativity and Quantum Cosmology",
+        "hep-ex": "High Energy Physics - Experiment",
+        "hep-lat": "High Energy Physics - Lattice",
+        "hep-ph": "High Energy Physics - Phenomenology",
+        "hep-th": "High Energy Physics - Theory",
+        "math.AC": "Commutative Algebra",
+        "math.AG": "Algebraic Geometry",
+        "math.AP": "Analysis of PDEs",
+        "math.AT": "Algebraic Topology",
+        "math.CA": "Classical Analysis and ODEs",
+        "math.CO": "Combinatorics",
+        "math.CT": "Category Theory",
+        "math.CV": "Complex Variables",
+        "math.DG": "Differential Geometry",
+        "math.DS": "Dynamical Systems",
+        "math.FA": "Functional Analysis",
+        "math.GM": "General Mathematics",
+        "math.GN": "General Topology",
+        "math.GR": "Group Theory",
+        "math.GT": "Geometric Topology",
+        "math.HO": "History and Overview",
+        "math.IT": "Information Theory",
+        "math.KT": "K-Theory and Homology",
+        "math.LO": "Logic",
+        "math.MG": "Metric Geometry",
+        "math.MP": "Mathematical Physics",
+        "math.NA": "Numerical Analysis",
+        "math.NT": "Number Theory",
+        "math.OA": "Operator Algebras",
+        "math.OC": "Optimization and Control",
+        "math.PR": "Probability",
+        "math.QA": "Quantum Algebra",
+        "math.RA": "Rings and Algebras",
+        "math.RT": "Representation Theory",
+        "math.SG": "Symplectic Geometry",
+        "math.SP": "Spectral Theory",
+        "math.ST": "Statistics Theory",
+        "math-ph": "Mathematical Physics",
+        "funct-an": "Functional Analysis",
+        "alg-geom": "Algebraic Geometry",
+        "nlin.AO": "Adaptation and Self-Organizing Systems",
+        "chao-dyn": "Chaotic Dynamics",
+        "nlin.CD": "Chaotic Dynamics",
+        "nlin.CG": "Cellular Automata and Lattice Gases",
+        "nlin.PS": "Pattern Formation and Solitons",
+        "nlin.SI": "Exactly Solvable and Integrable Systems",
+        "nucl-ex": "Nuclear Experiment",
+        "nucl-th": "Nuclear Theory",
+        "physics.acc-ph": "Accelerator Physics",
+        "physics.ao-ph": "Atmospheric and Oceanic Physics",
+        "physics.app-ph": "Applied Physics",
+        "physics.atm-clus": "Atomic and Molecular Clusters",
+        "physics.atom-ph": "Atomic Physics",
+        "physics.bio-ph": "Biological Physics",
+        "physics.chem-ph": "Chemical Physics",
+        "physics.class-ph": "Classical Physics",
+        "physics.comp-ph": "Computational Physics",
+        "physics.data-an": "Data Analysis, Statistics and Probability",
+        "physics.ed-ph": "Physics Education",
+        "physics.flu-dyn": "Fluid Dynamics",
+        "physics.gen-ph": "General Physics",
+        "physics.geo-ph": "Geophysics",
+        "physics.hist-ph": "History and Philosophy of Physics",
+        "physics.ins-det": "Instrumentation and Detectors",
+        "physics.med-ph": "Medical Physics",
+        "physics.optics": "Optics",
+        "physics.plasm-ph": "Plasma Physics",
+        "physics.pop-ph": "Popular Physics",
+        "physics.soc-ph": "Physics and Society",
+        "physics.space-ph": "Space Physics",
+        "q-bio.BM": "Biomolecules",
+        "q-bio.CB": "Cell Behavior",
+        "q-bio.GN": "Genomics",
+        "q-bio.MN": "Molecular Networks",
+        "q-bio.NC": "Neurons and Cognition",
+        "q-bio.OT": "Other Quantitative Biology",
+        "q-bio.PE": "Populations and Evolution",
+        "q-bio.QM": "Quantitative Methods",
+        "q-bio.SC": "Subcellular Processes",
+        "q-bio.TO": "Tissues and Organs",
+        "q-fin.CP": "Computational Finance",
+        "q-fin.EC": "Economics",
+        "q-fin.GN": "General Finance",
+        "q-fin.MF": "Mathematical Finance",
+        "q-fin.PM": "Portfolio Management",
+        "q-fin.PR": "Pricing of Securities",
+        "q-fin.RM": "Risk Management",
+        "q-fin.ST": "Statistical Finance",
+        "q-fin.TR": "Trading and Market Microstructure",
+        "quant-ph": "Quantum Physics",
+        "q-alg": "Quantum Algebra",
+        "stat.AP": "Applications",
+        "stat.CO": "Computation",
+        "stat.ME": "Methodology",
+        "stat.ML": "Machine Learning",
+        "stat.OT": "Other Statistics",
+        "stat.TH": "Statistics Theory",
+    }
+
+
+def split_categories_by_row(raw_metadata_row):
+    """Takes in row of a dataframe returned by an arxiv query search, returns a tuple with the list
+    of arXiv subject tags in the first slot, msc_tags in the second slot.
+
+    Args:
+        raw_metadata_row: row of a dataframe returned by an arXiv query request
+
+    Returns:
+        (x, y): x and y are lists; x is a list of arxiv subjects, y is a list of msc_tags.
+    """
+    categories = raw_metadata_row.categories
+    expanded_categories = pd.Series(categories)
+    arxiv_subject_labels = category_map()
+
+    if expanded_categories.isin(arxiv_subject_labels.keys()).all():
+        return (raw_metadata_row.categories, None)
+    else:
+        msc_tags = find_msc(raw_metadata_row.categories[-1])
+        return (raw_metadata_row.categories[:-2], msc_tags)
+
+
+def extract_tags(raw_metadata, arxiv_tag):
+    split_categories = raw_metadata.apply(split_categories_by_row, axis=0)
+
+    flag = 1
+    if arxiv_tag:
+        flag = 0
+
+    return split_categories.apply(lambda x: x[flag])
 
 
 ## 1. Latin-ize latex accents enclosed in brackets
 def remove_latex_accents(string):
-    accent = r
-    replacement = r
+    accent = r"\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}"
+    replacement = r"\1"
 
-    string = regex.sub(accent,replacement, string)
+    string = regex.sub(accent, replacement, string)
     return string
 
+
 ## 2. Remove latex environments
 def remove_env(string):
-    env = r
+    env = r"\\[a-z]{2,}{[^{}]+?}"
 
-    string = regex.sub(env,
+    string = regex.sub(env, "", string)
     return string
 
+
 ## 3. Latin-ize non-{} enclosed latex accents:
 def remove_accents(string):
-    accent = r
-    replacement = r
-    string = regex.sub(accent,replacement,string)
-    return string
+    accent = r"\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])"
+    replacement = r"\1"
+
+    string = regex.sub(accent, replacement, string)
+    return string
 
 ## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
 
+
 def remove_latex(string):
-    latex = r
-    string = regex.sub(latex,
-    return string
+    latex = r"\s(\$\$?)[^\$]*?\1\S*"
+    string = regex.sub(latex, " LATEX ", string)
+    return string
 
 
 def cleanse(string):
-    string = string.replace(
+    string = string.replace("\n", " ")
     string = remove_latex_accents(string)
     string = remove_env(string)
     string = remove_accents(string)
     string = remove_latex(string)
     return string
 
+
+##
+
 
 def find_hyph(text):
-    pattern = r
-    keywords = regex.findall(pattern,text)
+    pattern = r"(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b"
+    keywords = regex.findall(pattern, text)
 
     if keywords == []:
         return None
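Taken together, the four regex passes plus the newline replacement give cleanse() its pipeline. A quick illustrative check (made-up input; assumes the regex package and the functions above are in scope):

import regex

text = "The H\\'older continuity of $u$ in self-similar\ndomains"

print(cleanse(text))
# 'The Holder continuity of LATEX  in self-similar domains'

print(find_hyph(cleanse(text)))
# ['self-similar']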
@@ -242,17 +284,14 @@ def find_hyph(text):
     return list(set(keywords))
 
 
-def find_msc(
-    pattern = r
-    for tag in tags:
-        out.append(tag)
-    return out
+def find_msc(msc_string):
+    pattern = r"\b\d{2}[0-9a-zA-Z]{3}\b"
+    tags = regex.findall(pattern, msc_string)
+    return tags
 
 def msc_tags():
-    with open(
+    with open("./data/msc.json", "r") as file:
         text = file.read()
     return json.loads(text)
 
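The rewritten find_msc is now a single findall over a five-character MSC code pattern. An illustrative run (the tag string is made up):

import regex

def find_msc(msc_string):
    pattern = r"\b\d{2}[0-9a-zA-Z]{3}\b"
    tags = regex.findall(pattern, msc_string)
    return tags

print(find_msc("35J25, 58C40 (Primary), 35P20 (Secondary)"))
# ['35J25', '58C40', '35P20']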
@@ -268,33 +307,34 @@ def cats_to_msc(cat_list):
         return None
     else:
         return out
 
 ##
 
+
 def msc_encoded_dict():
-    encoded_tags = pd.read_parquet(
-    return {k
+    encoded_tags = pd.read_parquet("./data/msc_mini_embeddings.parquet").to_numpy()
+    return {k: v for (k, v) in zip(msc_tags().values(), encoded_tags)}
+
 
 def doc_encoded_dict():
-    library_embeddings = pd.read_parquet(
+    library_embeddings = pd.read_parquet("./data/APSP_mini_vec.parquet")
 
     docs = library_embeddings.docs.to_list()
     encoded_docs = library_embeddings.vecs.to_numpy()
 
-    return {k
+    return {k: v for (k, v) in zip(docs, encoded_docs)}
 
 
 def score_tags(processed_arxiv_row):
     tag_list = processed_arxiv_row.msc_tags
     title_plus_abstract = processed_arxiv_row.docs
 
     if tag_list is None:
         return None
     embedded_msc_tags = [msc_encoded_dict()[tag] for tag in tag_list]
 
     return sentence_transformers.util.semantic_search(
         query_embeddings=doc_encoded_dict()[title_plus_abstract],
         corpus_embeddings=embedded_msc_tags,
+    )[0]
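score_tags ranks a paper's own MSC tags by semantic similarity between the document embedding and the tag embeddings; the trailing )[0] unwraps the one hit list that semantic_search returns per query. A standalone sketch of that call (model name and strings are placeholders, not taken from this Space):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")
doc_embedding = model.encode("spectral gaps of the laplacian on fractal domains")
tag_embeddings = model.encode(["Spectral Theory", "Number Theory"])

# One query in, one hit list out; index [0] mirrors the )[0] above.
hits = util.semantic_search(
    query_embeddings=doc_embedding, corpus_embeddings=tag_embeddings
)[0]
print(hits)  # e.g. [{'corpus_id': 0, 'score': 0.62}, {'corpus_id': 1, 'score': 0.18}]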
data_storage.py
CHANGED
@@ -2,38 +2,31 @@ import arxiv
 import pandas as pd
 import data_cleaning as clean
 from sklearn.preprocessing import MultiLabelBinarizer
+import os
 
 
 class ArXivData:
     """A light class for storing the metadata of a collection of arXiv papers."""
 
     def __init__(self):
-        query: A tuple of the form (query_string,max_results) where query_string is the formatted
-        string that produced the raw data and max_results is the value of that parameter passed to the
-        arXiv API.
-
-        raw: The original, raw dataset as returned by the arXiv API, if current data is clean.
-
-        cats: A DataFrame containing one-hot-encoded categories of the self.data DataFrame.
-        """
-        self.
+        self.metadata = None
+        self.arxiv_subjects = None
+        self._returned_metadata = None
 
-    def load_from_query(self, query_string, max_results, offset):
-        self.
+    def load_from_file(self, dataset_file_name, path_to_data_dir):
+        path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
+        self._returned_metadata = pd.read_feather(path_to_dataset)
+
+        self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
+        self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
+
+    def load_from_query(self, query_string, max_results, offset=0):
+        self._returned_metadata = query_to_df(
             query=query_string, max_results=max_results, offset=offset
         )
+
+        self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
+        self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
 
     def clean(self, dataset):
         """Constructs this dataset by cleaning another one.
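A hedged usage sketch of the new loaders (the query string is invented; assumes network access to the arXiv API). One caveat: get_OHE_arxiv_subjects below is declared without a self parameter, so these self.get_OHE_arxiv_subjects(...) calls need it to become a @staticmethod, or to take self.

dataset = ArXivData()
dataset.load_from_query("cat:math.AP AND cat:math.SP", max_results=100)

dataset.metadata.head()        # per-paper metadata, raw category lists dropped
dataset.arxiv_subjects.head()  # one-hot arXiv subjects with readable column names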
@@ -46,11 +39,14 @@ class ArXivData:
         self.raw = dataset.raw
         self.categories = dataset.categories
 
-    def
+    def get_OHE_arxiv_subjects(returned_metadata):
         mlb = MultiLabelBinarizer()
+
+        OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
+        arxiv_subject_labels = clean.category_map()
+
+        return pd.DataFrame(OHE_arxiv_subjects_array, columns=mlb.classes_).rename(
+            columns=arxiv_subject_labels
         )
 
 
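In isolation, the one-hot step works like this (toy rows); the rename through clean.category_map() then swaps codes such as math.AP for full subject names:

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

arxiv_subjects = pd.Series([["math.AP", "math.SP"], ["math.SP"]])

mlb = MultiLabelBinarizer()
ohe = pd.DataFrame(mlb.fit_transform(arxiv_subjects), columns=mlb.classes_)
print(ohe)
#    math.AP  math.SP
# 0        1        1
# 1        0        1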
@@ -117,6 +113,12 @@ def query_to_df(query, max_results, offset):
         for result in results
     )
 
+    raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
+
+    returned_metadata = raw_metadata.copy().drop(columns=["categories"])
+    returned_metadata["arxiv_subjects"] = clean.extract_tags(
+        raw_metadata, arxiv_tag=True
+    )
+    returned_metadata["msc_tags"] = clean.extract_tags(raw_metadata, arxiv_tag=False)
 
-    return
+    return returned_metadata
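The tail of query_to_df now splits the raw category lists into the two new columns before returning. One thing to watch: extract_tags in data_cleaning.py calls raw_metadata.apply(split_categories_by_row, axis=0), which pandas interprets column-wise; a per-row pass needs axis=1, along these lines:

# Hypothetical row-wise variant (not in the commit):
split_categories = raw_metadata.apply(clean.split_categories_by_row, axis=1)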