Michael-Geis committed on
Commit
62ba9f3
1 Parent(s): 17e4444

dataset does not have MSC tags.

Browse files
Files changed (1) hide show
  1. data-exploration.ipynb +482 -0
data-exploration.ipynb CHANGED
@@ -8,6 +8,480 @@
8
  "# EDA for cleaned arXiv dataset"
9
  ]
10
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "attachments": {},
13
  "cell_type": "markdown",
@@ -24,7 +498,15 @@
24
  "name": "python3"
25
  },
26
  "language_info": {
 
 
 
 
 
 
27
  "name": "python",
 
 
28
  "version": "3.10.11"
29
  },
30
  "orig_nbformat": 4
 
8
  "# EDA for cleaned arXiv dataset"
9
  ]
10
  },
11
+ {
12
+ "attachments": {},
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "## Imports"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 1,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "import pandas as pd\n",
26
+ "import numpy as np"
27
+ ]
28
+ },
29
+ {
30
+ "attachments": {},
31
+ "cell_type": "markdown",
32
+ "metadata": {},
33
+ "source": [
34
+ "## Which subject tag occurs most frequently in our 175k dataset?"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 2,
40
+ "metadata": {},
41
+ "outputs": [
42
+ {
43
+ "data": {
44
+ "text/html": [
45
+ "<div>\n",
46
+ "<style scoped>\n",
47
+ " .dataframe tbody tr th:only-of-type {\n",
48
+ " vertical-align: middle;\n",
49
+ " }\n",
50
+ "\n",
51
+ " .dataframe tbody tr th {\n",
52
+ " vertical-align: top;\n",
53
+ " }\n",
54
+ "\n",
55
+ " .dataframe thead th {\n",
56
+ " text-align: right;\n",
57
+ " }\n",
58
+ "</style>\n",
59
+ "<table border=\"1\" class=\"dataframe\">\n",
60
+ " <thead>\n",
61
+ " <tr style=\"text-align: right;\">\n",
62
+ " <th></th>\n",
63
+ " <th>Accelerator Physics</th>\n",
64
+ " <th>Adaptation and Self-Organizing Systems</th>\n",
65
+ " <th>Algebraic Geometry</th>\n",
66
+ " <th>Algebraic Topology</th>\n",
67
+ " <th>Analysis of PDEs</th>\n",
68
+ " <th>Applications</th>\n",
69
+ " <th>Applied Physics</th>\n",
70
+ " <th>Artificial Intelligence</th>\n",
71
+ " <th>Astrophysics</th>\n",
72
+ " <th>Astrophysics of Galaxies</th>\n",
73
+ " <th>...</th>\n",
74
+ " <th>Strongly Correlated Electrons</th>\n",
75
+ " <th>Subcellular Processes</th>\n",
76
+ " <th>Superconductivity</th>\n",
77
+ " <th>Symbolic Computation</th>\n",
78
+ " <th>Symplectic Geometry</th>\n",
79
+ " <th>Systems and Control</th>\n",
80
+ " <th>Theoretical Economics</th>\n",
81
+ " <th>Tissues and Organs</th>\n",
82
+ " <th>Trading and Market Microstructure</th>\n",
83
+ " <th>UNK</th>\n",
84
+ " </tr>\n",
85
+ " </thead>\n",
86
+ " <tbody>\n",
87
+ " <tr>\n",
88
+ " <th>0</th>\n",
89
+ " <td>False</td>\n",
90
+ " <td>False</td>\n",
91
+ " <td>False</td>\n",
92
+ " <td>False</td>\n",
93
+ " <td>False</td>\n",
94
+ " <td>False</td>\n",
95
+ " <td>False</td>\n",
96
+ " <td>False</td>\n",
97
+ " <td>False</td>\n",
98
+ " <td>False</td>\n",
99
+ " <td>...</td>\n",
100
+ " <td>False</td>\n",
101
+ " <td>False</td>\n",
102
+ " <td>False</td>\n",
103
+ " <td>False</td>\n",
104
+ " <td>False</td>\n",
105
+ " <td>False</td>\n",
106
+ " <td>False</td>\n",
107
+ " <td>False</td>\n",
108
+ " <td>False</td>\n",
109
+ " <td>False</td>\n",
110
+ " </tr>\n",
111
+ " <tr>\n",
112
+ " <th>1</th>\n",
113
+ " <td>False</td>\n",
114
+ " <td>False</td>\n",
115
+ " <td>False</td>\n",
116
+ " <td>False</td>\n",
117
+ " <td>False</td>\n",
118
+ " <td>False</td>\n",
119
+ " <td>False</td>\n",
120
+ " <td>False</td>\n",
121
+ " <td>False</td>\n",
122
+ " <td>False</td>\n",
123
+ " <td>...</td>\n",
124
+ " <td>False</td>\n",
125
+ " <td>False</td>\n",
126
+ " <td>False</td>\n",
127
+ " <td>False</td>\n",
128
+ " <td>False</td>\n",
129
+ " <td>False</td>\n",
130
+ " <td>False</td>\n",
131
+ " <td>False</td>\n",
132
+ " <td>False</td>\n",
133
+ " <td>False</td>\n",
134
+ " </tr>\n",
135
+ " <tr>\n",
136
+ " <th>2</th>\n",
137
+ " <td>False</td>\n",
138
+ " <td>False</td>\n",
139
+ " <td>False</td>\n",
140
+ " <td>False</td>\n",
141
+ " <td>False</td>\n",
142
+ " <td>False</td>\n",
143
+ " <td>False</td>\n",
144
+ " <td>False</td>\n",
145
+ " <td>False</td>\n",
146
+ " <td>False</td>\n",
147
+ " <td>...</td>\n",
148
+ " <td>False</td>\n",
149
+ " <td>False</td>\n",
150
+ " <td>False</td>\n",
151
+ " <td>False</td>\n",
152
+ " <td>False</td>\n",
153
+ " <td>False</td>\n",
154
+ " <td>False</td>\n",
155
+ " <td>False</td>\n",
156
+ " <td>False</td>\n",
157
+ " <td>False</td>\n",
158
+ " </tr>\n",
159
+ " <tr>\n",
160
+ " <th>3</th>\n",
161
+ " <td>False</td>\n",
162
+ " <td>False</td>\n",
163
+ " <td>False</td>\n",
164
+ " <td>False</td>\n",
165
+ " <td>False</td>\n",
166
+ " <td>False</td>\n",
167
+ " <td>False</td>\n",
168
+ " <td>False</td>\n",
169
+ " <td>False</td>\n",
170
+ " <td>False</td>\n",
171
+ " <td>...</td>\n",
172
+ " <td>False</td>\n",
173
+ " <td>False</td>\n",
174
+ " <td>False</td>\n",
175
+ " <td>False</td>\n",
176
+ " <td>False</td>\n",
177
+ " <td>False</td>\n",
178
+ " <td>False</td>\n",
179
+ " <td>False</td>\n",
180
+ " <td>False</td>\n",
181
+ " <td>False</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>4</th>\n",
185
+ " <td>False</td>\n",
186
+ " <td>False</td>\n",
187
+ " <td>False</td>\n",
188
+ " <td>False</td>\n",
189
+ " <td>False</td>\n",
190
+ " <td>False</td>\n",
191
+ " <td>False</td>\n",
192
+ " <td>False</td>\n",
193
+ " <td>False</td>\n",
194
+ " <td>False</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>False</td>\n",
197
+ " <td>False</td>\n",
198
+ " <td>False</td>\n",
199
+ " <td>False</td>\n",
200
+ " <td>False</td>\n",
201
+ " <td>False</td>\n",
202
+ " <td>False</td>\n",
203
+ " <td>False</td>\n",
204
+ " <td>False</td>\n",
205
+ " <td>False</td>\n",
206
+ " </tr>\n",
207
+ " </tbody>\n",
208
+ "</table>\n",
209
+ "<p>5 rows × 150 columns</p>\n",
210
+ "</div>"
211
+ ],
212
+ "text/plain": [
213
+ " Accelerator Physics Adaptation and Self-Organizing Systems \\\n",
214
+ "0 False False \n",
215
+ "1 False False \n",
216
+ "2 False False \n",
217
+ "3 False False \n",
218
+ "4 False False \n",
219
+ "\n",
220
+ " Algebraic Geometry Algebraic Topology Analysis of PDEs Applications \\\n",
221
+ "0 False False False False \n",
222
+ "1 False False False False \n",
223
+ "2 False False False False \n",
224
+ "3 False False False False \n",
225
+ "4 False False False False \n",
226
+ "\n",
227
+ " Applied Physics Artificial Intelligence Astrophysics \\\n",
228
+ "0 False False False \n",
229
+ "1 False False False \n",
230
+ "2 False False False \n",
231
+ "3 False False False \n",
232
+ "4 False False False \n",
233
+ "\n",
234
+ " Astrophysics of Galaxies ... Strongly Correlated Electrons \\\n",
235
+ "0 False ... False \n",
236
+ "1 False ... False \n",
237
+ "2 False ... False \n",
238
+ "3 False ... False \n",
239
+ "4 False ... False \n",
240
+ "\n",
241
+ " Subcellular Processes Superconductivity Symbolic Computation \\\n",
242
+ "0 False False False \n",
243
+ "1 False False False \n",
244
+ "2 False False False \n",
245
+ "3 False False False \n",
246
+ "4 False False False \n",
247
+ "\n",
248
+ " Symplectic Geometry Systems and Control Theoretical Economics \\\n",
249
+ "0 False False False \n",
250
+ "1 False False False \n",
251
+ "2 False False False \n",
252
+ "3 False False False \n",
253
+ "4 False False False \n",
254
+ "\n",
255
+ " Tissues and Organs Trading and Market Microstructure UNK \n",
256
+ "0 False False False \n",
257
+ "1 False False False \n",
258
+ "2 False False False \n",
259
+ "3 False False False \n",
260
+ "4 False False False \n",
261
+ "\n",
262
+ "[5 rows x 150 columns]"
263
+ ]
264
+ },
265
+ "execution_count": 2,
266
+ "metadata": {},
267
+ "output_type": "execute_result"
268
+ }
269
+ ],
270
+ "source": [
271
+ "cats = pd.read_parquet('./data/arXiv_cat.parquet')\n",
272
+ "cats.head()"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 3,
278
+ "metadata": {},
279
+ "outputs": [
280
+ {
281
+ "data": {
282
+ "text/plain": [
283
+ "Analysis of PDEs 18944\n",
284
+ "Combinatorics 18930\n",
285
+ "Optimization and Control 18284\n",
286
+ "Mathematical Physics 16381\n",
287
+ "Probability 15343\n",
288
+ "dtype: int64"
289
+ ]
290
+ },
291
+ "execution_count": 3,
292
+ "metadata": {},
293
+ "output_type": "execute_result"
294
+ }
295
+ ],
296
+ "source": [
297
+ "## Calculate the number of times each tag appears\n",
298
+ "\n",
299
+ "totals = cats.sum(axis=0).sort_values(ascending=False)\n",
300
+ "totals.head()"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": 5,
306
+ "metadata": {},
307
+ "outputs": [
308
+ {
309
+ "data": {
310
+ "text/html": [
311
+ "<div>\n",
312
+ "<style scoped>\n",
313
+ " .dataframe tbody tr th:only-of-type {\n",
314
+ " vertical-align: middle;\n",
315
+ " }\n",
316
+ "\n",
317
+ " .dataframe tbody tr th {\n",
318
+ " vertical-align: top;\n",
319
+ " }\n",
320
+ "\n",
321
+ " .dataframe thead th {\n",
322
+ " text-align: right;\n",
323
+ " }\n",
324
+ "</style>\n",
325
+ "<table border=\"1\" class=\"dataframe\">\n",
326
+ " <thead>\n",
327
+ " <tr style=\"text-align: right;\">\n",
328
+ " <th></th>\n",
329
+ " <th>raw_title</th>\n",
330
+ " <th>clean_title</th>\n",
331
+ " <th>hyph_in_title</th>\n",
332
+ " <th>raw_abstract</th>\n",
333
+ " <th>clean_abstract</th>\n",
334
+ " <th>hyph_in_abstract</th>\n",
335
+ " <th>authors_parsed</th>\n",
336
+ " <th>cat</th>\n",
337
+ " <th>update_date</th>\n",
338
+ " <th>id</th>\n",
339
+ " </tr>\n",
340
+ " </thead>\n",
341
+ " <tbody>\n",
342
+ " <tr>\n",
343
+ " <th>42</th>\n",
344
+ " <td>The Prolongation Problem for the Heavenly Equa...</td>\n",
345
+ " <td>The Prolongation Problem for the Heavenly Equa...</td>\n",
346
+ " <td>None</td>\n",
347
+ " <td>We provide an exact regular solution of an o...</td>\n",
348
+ " <td>We provide an exact regular solution of an o...</td>\n",
349
+ " <td>None</td>\n",
350
+ " <td>[['Palese', 'M.', '', 'Dept. Math. Univ. of To...</td>\n",
351
+ " <td>[math.AP, math-ph, math.MP]</td>\n",
352
+ " <td>2022-09-21</td>\n",
353
+ " <td>math/0311218</td>\n",
354
+ " </tr>\n",
355
+ " <tr>\n",
356
+ " <th>55</th>\n",
357
+ " <td>Null Controllability for a Degenerate Structur...</td>\n",
358
+ " <td>Null Controllability for a Degenerate Structur...</td>\n",
359
+ " <td>None</td>\n",
360
+ " <td>In this paper, we consider the infinite dime...</td>\n",
361
+ " <td>In this paper, we consider the infinite dime...</td>\n",
362
+ " <td>[final-state]</td>\n",
363
+ " <td>[['Simporé', 'Yacouba', ''], ['gantouh', 'Yass...</td>\n",
364
+ " <td>[math.OC, math.AP]</td>\n",
365
+ " <td>2022-09-09</td>\n",
366
+ " <td>2209.03645</td>\n",
367
+ " </tr>\n",
368
+ " <tr>\n",
369
+ " <th>59</th>\n",
370
+ " <td>Voting models and semilinear parabolic equations</td>\n",
371
+ " <td>Voting models and semilinear parabolic equations</td>\n",
372
+ " <td>None</td>\n",
373
+ " <td>We present probabilistic interpretations of ...</td>\n",
374
+ " <td>We present probabilistic interpretations of ...</td>\n",
375
+ " <td>[semi-linear, Fisher-KPP, group-based, pushmi-...</td>\n",
376
+ " <td>[['An', 'Jing', ''], ['Henderson', 'Christophe...</td>\n",
377
+ " <td>[math.AP, math.PR]</td>\n",
378
+ " <td>2022-09-09</td>\n",
379
+ " <td>2209.03435</td>\n",
380
+ " </tr>\n",
381
+ " <tr>\n",
382
+ " <th>72</th>\n",
383
+ " <td>Flows of $G_2$-Structures associated to Calabi...</td>\n",
384
+ " <td>Flows of LATEX associated to Calabi-Yau Manif...</td>\n",
385
+ " <td>[Calabi-Yau]</td>\n",
386
+ " <td>We establish a correspondence between a para...</td>\n",
387
+ " <td>We establish a correspondence between a para...</td>\n",
388
+ " <td>[Monge-Ampere, Monge-Ampere, torsion-free, Ric...</td>\n",
389
+ " <td>[['Picard', 'Sébastien', ''], ['Suan', 'Caleb'...</td>\n",
390
+ " <td>[math.DG, math.AP]</td>\n",
391
+ " <td>2022-09-09</td>\n",
392
+ " <td>2209.03411</td>\n",
393
+ " </tr>\n",
394
+ " <tr>\n",
395
+ " <th>78</th>\n",
396
+ " <td>On the dynamics of vortices in viscous 2D flows</td>\n",
397
+ " <td>On the dynamics of vortices in viscous 2D flows</td>\n",
398
+ " <td>None</td>\n",
399
+ " <td>We study the 2D Navier--Stokes solution star...</td>\n",
400
+ " <td>We study the 2D Navier--Stokes solution star...</td>\n",
401
+ " <td>None</td>\n",
402
+ " <td>[['Ceci', 'Stefano', ''], ['Seis', 'Christian'...</td>\n",
403
+ " <td>[math.AP]</td>\n",
404
+ " <td>2022-09-09</td>\n",
405
+ " <td>2203.07185</td>\n",
406
+ " </tr>\n",
407
+ " </tbody>\n",
408
+ "</table>\n",
409
+ "</div>"
410
+ ],
411
+ "text/plain": [
412
+ " raw_title \\\n",
413
+ "42 The Prolongation Problem for the Heavenly Equa... \n",
414
+ "55 Null Controllability for a Degenerate Structur... \n",
415
+ "59 Voting models and semilinear parabolic equations \n",
416
+ "72 Flows of $G_2$-Structures associated to Calabi... \n",
417
+ "78 On the dynamics of vortices in viscous 2D flows \n",
418
+ "\n",
419
+ " clean_title hyph_in_title \\\n",
420
+ "42 The Prolongation Problem for the Heavenly Equa... None \n",
421
+ "55 Null Controllability for a Degenerate Structur... None \n",
422
+ "59 Voting models and semilinear parabolic equations None \n",
423
+ "72 Flows of LATEX associated to Calabi-Yau Manif... [Calabi-Yau] \n",
424
+ "78 On the dynamics of vortices in viscous 2D flows None \n",
425
+ "\n",
426
+ " raw_abstract \\\n",
427
+ "42 We provide an exact regular solution of an o... \n",
428
+ "55 In this paper, we consider the infinite dime... \n",
429
+ "59 We present probabilistic interpretations of ... \n",
430
+ "72 We establish a correspondence between a para... \n",
431
+ "78 We study the 2D Navier--Stokes solution star... \n",
432
+ "\n",
433
+ " clean_abstract \\\n",
434
+ "42 We provide an exact regular solution of an o... \n",
435
+ "55 In this paper, we consider the infinite dime... \n",
436
+ "59 We present probabilistic interpretations of ... \n",
437
+ "72 We establish a correspondence between a para... \n",
438
+ "78 We study the 2D Navier--Stokes solution star... \n",
439
+ "\n",
440
+ " hyph_in_abstract \\\n",
441
+ "42 None \n",
442
+ "55 [final-state] \n",
443
+ "59 [semi-linear, Fisher-KPP, group-based, pushmi-... \n",
444
+ "72 [Monge-Ampere, Monge-Ampere, torsion-free, Ric... \n",
445
+ "78 None \n",
446
+ "\n",
447
+ " authors_parsed \\\n",
448
+ "42 [['Palese', 'M.', '', 'Dept. Math. Univ. of To... \n",
449
+ "55 [['Simporé', 'Yacouba', ''], ['gantouh', 'Yass... \n",
450
+ "59 [['An', 'Jing', ''], ['Henderson', 'Christophe... \n",
451
+ "72 [['Picard', 'Sébastien', ''], ['Suan', 'Caleb'... \n",
452
+ "78 [['Ceci', 'Stefano', ''], ['Seis', 'Christian'... \n",
453
+ "\n",
454
+ " cat update_date id \n",
455
+ "42 [math.AP, math-ph, math.MP] 2022-09-21 math/0311218 \n",
456
+ "55 [math.OC, math.AP] 2022-09-09 2209.03645 \n",
457
+ "59 [math.AP, math.PR] 2022-09-09 2209.03435 \n",
458
+ "72 [math.DG, math.AP] 2022-09-09 2209.03411 \n",
459
+ "78 [math.AP] 2022-09-09 2203.07185 "
460
+ ]
461
+ },
462
+ "execution_count": 5,
463
+ "metadata": {},
464
+ "output_type": "execute_result"
465
+ }
466
+ ],
467
+ "source": [
468
+ "## Create the dataset of all pde articles\n",
469
+ "\n",
470
+ "full_data = pd.read_parquet('./data/arXiv_clean.parquet')\n",
471
+ "pde = full_data.loc[cats['Analysis of PDEs'] == True]\n",
472
+ "pde.head()"
473
+ ]
474
+ },
475
+ {
476
+ "attachments": {},
477
+ "cell_type": "markdown",
478
+ "metadata": {},
479
+ "source": [
480
+ "## Next goal: Does the raw arXiv dataset contain the MSC subject information?\n",
481
+ "\n",
482
+ "No, it doesn't -- this was verified in a Kaggle notebook. The raw dataset has only arXiv subject tag information."
483
+ ]
484
+ },
485
  {
486
  "attachments": {},
487
  "cell_type": "markdown",
 
498
  "name": "python3"
499
  },
500
  "language_info": {
501
+ "codemirror_mode": {
502
+ "name": "ipython",
503
+ "version": 3
504
+ },
505
+ "file_extension": ".py",
506
+ "mimetype": "text/x-python",
507
  "name": "python",
508
+ "nbconvert_exporter": "python",
509
+ "pygments_lexer": "ipython3",
510
  "version": "3.10.11"
511
  },
512
  "orig_nbformat": 4