TovaHasi committed on
Commit
9811406
1 Parent(s): 43a8fb9

Upload Preprocessing.ipynb

Browse files
Files changed (1) hide show
  1. Preprocessing.ipynb +934 -0
Preprocessing.ipynb ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 329,
6
+ "id": "6cf95722",
7
+ "metadata": {
8
+ "cellId": "eziodlb8kics09v3tpfeks"
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "#!g1.1\n",
13
+ "from sklearn.preprocessing import LabelEncoder\n",
14
+ "import transformers\n",
15
+ "import torch\n",
16
+ "import nltk\n",
17
+ "import numpy as np\n",
18
+ "import pandas as pd"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 330,
24
+ "id": "c692d165",
25
+ "metadata": {
26
+ "cellId": "lvwy4cb1dnfnk3n391yiq"
27
+ },
28
+ "outputs": [],
29
+ "source": [
30
+ "#!g1.1\n",
31
+ "df = pd.read_json('arxiv-metadata-oai-snapshot.json', lines = True)"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 331,
37
+ "id": "9b51b145",
38
+ "metadata": {
39
+ "cellId": "l71bdxo21obg5fforh5ppi"
40
+ },
41
+ "outputs": [
42
+ {
43
+ "data": {
44
+ "text/html": [
45
+ "<div>\n",
46
+ "<style scoped>\n",
47
+ " .dataframe tbody tr th:only-of-type {\n",
48
+ " vertical-align: middle;\n",
49
+ " }\n",
50
+ "\n",
51
+ " .dataframe tbody tr th {\n",
52
+ " vertical-align: top;\n",
53
+ " }\n",
54
+ "\n",
55
+ " .dataframe thead th {\n",
56
+ " text-align: right;\n",
57
+ " }\n",
58
+ "</style>\n",
59
+ "<table border=\"1\" class=\"dataframe\">\n",
60
+ " <thead>\n",
61
+ " <tr style=\"text-align: right;\">\n",
62
+ " <th></th>\n",
63
+ " <th>id</th>\n",
64
+ " <th>submitter</th>\n",
65
+ " <th>authors</th>\n",
66
+ " <th>title</th>\n",
67
+ " <th>comments</th>\n",
68
+ " <th>journal-ref</th>\n",
69
+ " <th>doi</th>\n",
70
+ " <th>report-no</th>\n",
71
+ " <th>categories</th>\n",
72
+ " <th>license</th>\n",
73
+ " <th>abstract</th>\n",
74
+ " <th>versions</th>\n",
75
+ " <th>update_date</th>\n",
76
+ " <th>authors_parsed</th>\n",
77
+ " </tr>\n",
78
+ " </thead>\n",
79
+ " <tbody>\n",
80
+ " <tr>\n",
81
+ " <th>0</th>\n",
82
+ " <td>0704.0001</td>\n",
83
+ " <td>Pavel Nadolsky</td>\n",
84
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
85
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
86
+ " <td>37 pages, 15 figures; published version</td>\n",
87
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
88
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
89
+ " <td>ANL-HEP-PR-07-12</td>\n",
90
+ " <td>hep-ph</td>\n",
91
+ " <td>None</td>\n",
92
+ " <td>A fully differential calculation in perturba...</td>\n",
93
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
94
+ " <td>2008-11-26</td>\n",
95
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
96
+ " </tr>\n",
97
+ " <tr>\n",
98
+ " <th>1</th>\n",
99
+ " <td>0704.0002</td>\n",
100
+ " <td>Louis Theran</td>\n",
101
+ " <td>Ileana Streinu and Louis Theran</td>\n",
102
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
103
+ " <td>To appear in Graphs and Combinatorics</td>\n",
104
+ " <td>None</td>\n",
105
+ " <td>None</td>\n",
106
+ " <td>None</td>\n",
107
+ " <td>math.CO cs.CG</td>\n",
108
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
109
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
110
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
111
+ " <td>2008-12-13</td>\n",
112
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
113
+ " </tr>\n",
114
+ " <tr>\n",
115
+ " <th>2</th>\n",
116
+ " <td>0704.0003</td>\n",
117
+ " <td>Hongjun Pan</td>\n",
118
+ " <td>Hongjun Pan</td>\n",
119
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
120
+ " <td>23 pages, 3 figures</td>\n",
121
+ " <td>None</td>\n",
122
+ " <td>None</td>\n",
123
+ " <td>None</td>\n",
124
+ " <td>physics.gen-ph</td>\n",
125
+ " <td>None</td>\n",
126
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
127
+ " <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
128
+ " <td>2008-01-13</td>\n",
129
+ " <td>[[Pan, Hongjun, ]]</td>\n",
130
+ " </tr>\n",
131
+ " <tr>\n",
132
+ " <th>3</th>\n",
133
+ " <td>0704.0004</td>\n",
134
+ " <td>David Callan</td>\n",
135
+ " <td>David Callan</td>\n",
136
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
137
+ " <td>11 pages</td>\n",
138
+ " <td>None</td>\n",
139
+ " <td>None</td>\n",
140
+ " <td>None</td>\n",
141
+ " <td>math.CO</td>\n",
142
+ " <td>None</td>\n",
143
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
144
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
145
+ " <td>2007-05-23</td>\n",
146
+ " <td>[[Callan, David, ]]</td>\n",
147
+ " </tr>\n",
148
+ " <tr>\n",
149
+ " <th>4</th>\n",
150
+ " <td>0704.0005</td>\n",
151
+ " <td>Alberto Torchinsky</td>\n",
152
+ " <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
153
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
154
+ " <td>None</td>\n",
155
+ " <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
156
+ " <td>None</td>\n",
157
+ " <td>None</td>\n",
158
+ " <td>math.CA math.FA</td>\n",
159
+ " <td>None</td>\n",
160
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
161
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
162
+ " <td>2013-10-15</td>\n",
163
+ " <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
164
+ " </tr>\n",
165
+ " </tbody>\n",
166
+ "</table>\n",
167
+ "</div>"
168
+ ],
169
+ "text/plain": [
170
+ " id submitter \\\n",
171
+ "0 0704.0001 Pavel Nadolsky \n",
172
+ "1 0704.0002 Louis Theran \n",
173
+ "2 0704.0003 Hongjun Pan \n",
174
+ "3 0704.0004 David Callan \n",
175
+ "4 0704.0005 Alberto Torchinsky \n",
176
+ "\n",
177
+ " authors \\\n",
178
+ "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \n",
179
+ "1 Ileana Streinu and Louis Theran \n",
180
+ "2 Hongjun Pan \n",
181
+ "3 David Callan \n",
182
+ "4 Wael Abu-Shammala and Alberto Torchinsky \n",
183
+ "\n",
184
+ " title \\\n",
185
+ "0 Calculation of prompt diphoton production cros... \n",
186
+ "1 Sparsity-certifying Graph Decompositions \n",
187
+ "2 The evolution of the Earth-Moon system based o... \n",
188
+ "3 A determinant of Stirling cycle numbers counts... \n",
189
+ "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
190
+ "\n",
191
+ " comments \\\n",
192
+ "0 37 pages, 15 figures; published version \n",
193
+ "1 To appear in Graphs and Combinatorics \n",
194
+ "2 23 pages, 3 figures \n",
195
+ "3 11 pages \n",
196
+ "4 None \n",
197
+ "\n",
198
+ " journal-ref doi \\\n",
199
+ "0 Phys.Rev.D76:013009,2007 10.1103/PhysRevD.76.013009 \n",
200
+ "1 None None \n",
201
+ "2 None None \n",
202
+ "3 None None \n",
203
+ "4 Illinois J. Math. 52 (2008) no.2, 681-689 None \n",
204
+ "\n",
205
+ " report-no categories \\\n",
206
+ "0 ANL-HEP-PR-07-12 hep-ph \n",
207
+ "1 None math.CO cs.CG \n",
208
+ "2 None physics.gen-ph \n",
209
+ "3 None math.CO \n",
210
+ "4 None math.CA math.FA \n",
211
+ "\n",
212
+ " license \\\n",
213
+ "0 None \n",
214
+ "1 http://arxiv.org/licenses/nonexclusive-distrib... \n",
215
+ "2 None \n",
216
+ "3 None \n",
217
+ "4 None \n",
218
+ "\n",
219
+ " abstract \\\n",
220
+ "0 A fully differential calculation in perturba... \n",
221
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
222
+ "2 The evolution of Earth-Moon system is descri... \n",
223
+ "3 We show that a determinant of Stirling cycle... \n",
224
+ "4 In this paper we show how to compute the $\\L... \n",
225
+ "\n",
226
+ " versions update_date \\\n",
227
+ "0 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2008-11-26 \n",
228
+ "1 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2008-12-13 \n",
229
+ "2 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2008-01-13 \n",
230
+ "3 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2007-05-23 \n",
231
+ "4 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2013-10-15 \n",
232
+ "\n",
233
+ " authors_parsed \n",
234
+ "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... \n",
235
+ "1 [[Streinu, Ileana, ], [Theran, Louis, ]] \n",
236
+ "2 [[Pan, Hongjun, ]] \n",
237
+ "3 [[Callan, David, ]] \n",
238
+ "4 [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] "
239
+ ]
240
+ },
241
+ "execution_count": 331,
242
+ "metadata": {},
243
+ "output_type": "execute_result"
244
+ }
245
+ ],
246
+ "source": [
247
+ "#!g1.1\n",
248
+ "df.head(5)"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": 332,
254
+ "id": "891dfcce",
255
+ "metadata": {
256
+ "cellId": "7769ktrdya6ae568sf1rk"
257
+ },
258
+ "outputs": [
259
+ {
260
+ "data": {
261
+ "text/plain": [
262
+ "'math.NT'"
263
+ ]
264
+ },
265
+ "execution_count": 332,
266
+ "metadata": {},
267
+ "output_type": "execute_result"
268
+ }
269
+ ],
270
+ "source": [
271
+ "#!g1.1\n",
272
+ "str(df[10:11]['categories']).split()[1]"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 333,
278
+ "id": "833b4037",
279
+ "metadata": {
280
+ "cellId": "sj1efyz6sjgr20rhngooc"
281
+ },
282
+ "outputs": [
283
+ {
284
+ "data": {
285
+ "text/html": [
286
+ "<div>\n",
287
+ "<style scoped>\n",
288
+ " .dataframe tbody tr th:only-of-type {\n",
289
+ " vertical-align: middle;\n",
290
+ " }\n",
291
+ "\n",
292
+ " .dataframe tbody tr th {\n",
293
+ " vertical-align: top;\n",
294
+ " }\n",
295
+ "\n",
296
+ " .dataframe thead th {\n",
297
+ " text-align: right;\n",
298
+ " }\n",
299
+ "</style>\n",
300
+ "<table border=\"1\" class=\"dataframe\">\n",
301
+ " <thead>\n",
302
+ " <tr style=\"text-align: right;\">\n",
303
+ " <th></th>\n",
304
+ " <th>id</th>\n",
305
+ " <th>submitter</th>\n",
306
+ " <th>authors</th>\n",
307
+ " <th>title</th>\n",
308
+ " <th>comments</th>\n",
309
+ " <th>journal-ref</th>\n",
310
+ " <th>doi</th>\n",
311
+ " <th>report-no</th>\n",
312
+ " <th>categories</th>\n",
313
+ " <th>license</th>\n",
314
+ " <th>abstract</th>\n",
315
+ " <th>versions</th>\n",
316
+ " <th>update_date</th>\n",
317
+ " <th>authors_parsed</th>\n",
318
+ " <th>tag</th>\n",
319
+ " </tr>\n",
320
+ " </thead>\n",
321
+ " <tbody>\n",
322
+ " <tr>\n",
323
+ " <th>0</th>\n",
324
+ " <td>0704.0001</td>\n",
325
+ " <td>Pavel Nadolsky</td>\n",
326
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
327
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
328
+ " <td>37 pages, 15 figures; published version</td>\n",
329
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
330
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
331
+ " <td>ANL-HEP-PR-07-12</td>\n",
332
+ " <td>hep-ph</td>\n",
333
+ " <td>None</td>\n",
334
+ " <td>A fully differential calculation in perturba...</td>\n",
335
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
336
+ " <td>2008-11-26</td>\n",
337
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
338
+ " <td>hep-ph</td>\n",
339
+ " </tr>\n",
340
+ " <tr>\n",
341
+ " <th>1</th>\n",
342
+ " <td>0704.0002</td>\n",
343
+ " <td>Louis Theran</td>\n",
344
+ " <td>Ileana Streinu and Louis Theran</td>\n",
345
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
346
+ " <td>To appear in Graphs and Combinatorics</td>\n",
347
+ " <td>None</td>\n",
348
+ " <td>None</td>\n",
349
+ " <td>None</td>\n",
350
+ " <td>math.CO cs.CG</td>\n",
351
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
352
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
353
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
354
+ " <td>2008-12-13</td>\n",
355
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
356
+ " <td>math</td>\n",
357
+ " </tr>\n",
358
+ " <tr>\n",
359
+ " <th>2</th>\n",
360
+ " <td>0704.0003</td>\n",
361
+ " <td>Hongjun Pan</td>\n",
362
+ " <td>Hongjun Pan</td>\n",
363
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
364
+ " <td>23 pages, 3 figures</td>\n",
365
+ " <td>None</td>\n",
366
+ " <td>None</td>\n",
367
+ " <td>None</td>\n",
368
+ " <td>physics.gen-ph</td>\n",
369
+ " <td>None</td>\n",
370
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
371
+ " <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
372
+ " <td>2008-01-13</td>\n",
373
+ " <td>[[Pan, Hongjun, ]]</td>\n",
374
+ " <td>physics</td>\n",
375
+ " </tr>\n",
376
+ " <tr>\n",
377
+ " <th>3</th>\n",
378
+ " <td>0704.0004</td>\n",
379
+ " <td>David Callan</td>\n",
380
+ " <td>David Callan</td>\n",
381
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
382
+ " <td>11 pages</td>\n",
383
+ " <td>None</td>\n",
384
+ " <td>None</td>\n",
385
+ " <td>None</td>\n",
386
+ " <td>math.CO</td>\n",
387
+ " <td>None</td>\n",
388
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
389
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
390
+ " <td>2007-05-23</td>\n",
391
+ " <td>[[Callan, David, ]]</td>\n",
392
+ " <td>math</td>\n",
393
+ " </tr>\n",
394
+ " <tr>\n",
395
+ " <th>4</th>\n",
396
+ " <td>0704.0005</td>\n",
397
+ " <td>Alberto Torchinsky</td>\n",
398
+ " <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
399
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
400
+ " <td>None</td>\n",
401
+ " <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
402
+ " <td>None</td>\n",
403
+ " <td>None</td>\n",
404
+ " <td>math.CA math.FA</td>\n",
405
+ " <td>None</td>\n",
406
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
407
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
408
+ " <td>2013-10-15</td>\n",
409
+ " <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
410
+ " <td>math</td>\n",
411
+ " </tr>\n",
412
+ " </tbody>\n",
413
+ "</table>\n",
414
+ "</div>"
415
+ ],
416
+ "text/plain": [
417
+ " id submitter \\\n",
418
+ "0 0704.0001 Pavel Nadolsky \n",
419
+ "1 0704.0002 Louis Theran \n",
420
+ "2 0704.0003 Hongjun Pan \n",
421
+ "3 0704.0004 David Callan \n",
422
+ "4 0704.0005 Alberto Torchinsky \n",
423
+ "\n",
424
+ " authors \\\n",
425
+ "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \n",
426
+ "1 Ileana Streinu and Louis Theran \n",
427
+ "2 Hongjun Pan \n",
428
+ "3 David Callan \n",
429
+ "4 Wael Abu-Shammala and Alberto Torchinsky \n",
430
+ "\n",
431
+ " title \\\n",
432
+ "0 Calculation of prompt diphoton production cros... \n",
433
+ "1 Sparsity-certifying Graph Decompositions \n",
434
+ "2 The evolution of the Earth-Moon system based o... \n",
435
+ "3 A determinant of Stirling cycle numbers counts... \n",
436
+ "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
437
+ "\n",
438
+ " comments \\\n",
439
+ "0 37 pages, 15 figures; published version \n",
440
+ "1 To appear in Graphs and Combinatorics \n",
441
+ "2 23 pages, 3 figures \n",
442
+ "3 11 pages \n",
443
+ "4 None \n",
444
+ "\n",
445
+ " journal-ref doi \\\n",
446
+ "0 Phys.Rev.D76:013009,2007 10.1103/PhysRevD.76.013009 \n",
447
+ "1 None None \n",
448
+ "2 None None \n",
449
+ "3 None None \n",
450
+ "4 Illinois J. Math. 52 (2008) no.2, 681-689 None \n",
451
+ "\n",
452
+ " report-no categories \\\n",
453
+ "0 ANL-HEP-PR-07-12 hep-ph \n",
454
+ "1 None math.CO cs.CG \n",
455
+ "2 None physics.gen-ph \n",
456
+ "3 None math.CO \n",
457
+ "4 None math.CA math.FA \n",
458
+ "\n",
459
+ " license \\\n",
460
+ "0 None \n",
461
+ "1 http://arxiv.org/licenses/nonexclusive-distrib... \n",
462
+ "2 None \n",
463
+ "3 None \n",
464
+ "4 None \n",
465
+ "\n",
466
+ " abstract \\\n",
467
+ "0 A fully differential calculation in perturba... \n",
468
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
469
+ "2 The evolution of Earth-Moon system is descri... \n",
470
+ "3 We show that a determinant of Stirling cycle... \n",
471
+ "4 In this paper we show how to compute the $\\L... \n",
472
+ "\n",
473
+ " versions update_date \\\n",
474
+ "0 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2008-11-26 \n",
475
+ "1 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2008-12-13 \n",
476
+ "2 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2008-01-13 \n",
477
+ "3 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2007-05-23 \n",
478
+ "4 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2013-10-15 \n",
479
+ "\n",
480
+ " authors_parsed tag \n",
481
+ "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... hep-ph \n",
482
+ "1 [[Streinu, Ileana, ], [Theran, Louis, ]] math \n",
483
+ "2 [[Pan, Hongjun, ]] physics \n",
484
+ "3 [[Callan, David, ]] math \n",
485
+ "4 [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] math "
486
+ ]
487
+ },
488
+ "execution_count": 333,
489
+ "metadata": {},
490
+ "output_type": "execute_result"
491
+ },
492
+ {
493
+ "name": "stderr",
494
+ "output_type": "stream",
495
+ "text": [
496
+ "/kernel/lib/python3.8/site-packages/ml_kernel/ignored_keyboard_interrupt.py:16: UserWarning: State committing stage cannot be interrupted. Please wait.\n",
497
+ " warnings.warn(self._warn_message)\n"
498
+ ]
499
+ }
500
+ ],
501
+ "source": [
502
+ "#!g1.1\n",
503
+ "def get_tag(row):\n",
504
+ "    # 'math.CO cs.CG' -> 'math': archive prefix of the first listed category.\n",
505
+ "    # Blank input has no first category, so None is returned explicitly.\n",
506
+ "    categories = str(row).split()\n",
507
+ "    return categories[0].split('.')[0] if categories else None\n",
508
+ "\n",
509
+ "\n",
510
+ "df['tag'] = df['categories'].apply(get_tag)\n",
511
+ "df.head(5)"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": 390,
517
+ "id": "18bc627a",
518
+ "metadata": {
519
+ "cellId": "yo10sp842yn569jdjhn34c"
520
+ },
521
+ "outputs": [],
522
+ "source": [
523
+ "#!g1.1\n",
524
+ "# Collect every paper's tag as a plain Python list.\n",
525
+ "# Reading the column directly avoids df.iterrows(), which builds a Series\n",
526
+ "# for every row only to read this one field.\n",
527
+ "labels = df['tag'].tolist()"
528
+ ]
529
+ },
530
+ {
531
+ "cell_type": "code",
532
+ "execution_count": 391,
533
+ "id": "6c522ad9",
534
+ "metadata": {
535
+ "cellId": "hpc6uu4tlccmq4o0dsx38c"
536
+ },
537
+ "outputs": [],
538
+ "source": [
539
+ "#!g1.1\n",
540
+ "# Tally how many papers carry each tag.\n",
541
+ "map_labels = {}\n",
542
+ "for tag in labels:\n",
543
+ "    # .get() returns 0 the first time a tag shows up, so no separate\n",
544
+ "    # first-occurrence branch is needed.\n",
545
+ "    map_labels[tag] = map_labels.get(tag, 0) + 1"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": 397,
551
+ "id": "43b9f3cb",
552
+ "metadata": {
553
+ "cellId": "lbwz1g67k60opqhyjm3v6b"
554
+ },
555
+ "outputs": [
556
+ {
557
+ "data": {
558
+ "text/plain": [
559
+ "{'hep-ph': 120410,\n",
560
+ " 'math': 414977,\n",
561
+ " 'physics': 146186,\n",
562
+ " 'cond-mat': 276428,\n",
563
+ " 'gr-qc': 54599,\n",
564
+ " 'astro-ph': 266321,\n",
565
+ " 'hep-th': 96207,\n",
566
+ " 'hep-ex': 20735,\n",
567
+ " 'nlin': 17315,\n",
568
+ " 'q-bio': 23287,\n",
569
+ " 'quant-ph': 88923,\n",
570
+ " 'cs': 338681,\n",
571
+ " 'nucl-th': 30599,\n",
572
+ " 'math-ph': 28805,\n",
573
+ " 'hep-lat': 16623,\n",
574
+ " 'nucl-ex': 10500,\n",
575
+ " 'q-fin': 9131,\n",
576
+ " 'stat': 37132,\n",
577
+ " 'eess': 27531,\n",
578
+ " 'econ': 3904,\n",
579
+ " 'acc-phys': 46,\n",
580
+ " 'adap-org': 306,\n",
581
+ " 'alg-geom': 1209,\n",
582
+ " 'ao-sci': 13,\n",
583
+ " 'atom-ph': 68,\n",
584
+ " 'bayes-an': 11,\n",
585
+ " 'chao-dyn': 1770,\n",
586
+ " 'chem-ph': 129,\n",
587
+ " 'cmp-lg': 894,\n",
588
+ " 'comp-gas': 140,\n",
589
+ " 'dg-ga': 562,\n",
590
+ " 'funct-an': 320,\n",
591
+ " 'mtrl-th': 165,\n",
592
+ " 'patt-sol': 452,\n",
593
+ " 'plasm-ph': 28,\n",
594
+ " 'q-alg': 1177,\n",
595
+ " 'solv-int': 844,\n",
596
+ " 'supr-con': 69}"
597
+ ]
598
+ },
599
+ "execution_count": 397,
600
+ "metadata": {},
601
+ "output_type": "execute_result"
602
+ }
603
+ ],
604
+ "source": [
605
+ "#!g1.1\n",
606
+ "map_labels"
607
+ ]
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": 398,
612
+ "id": "7e8b9e14",
613
+ "metadata": {
614
+ "cellId": "0xg7kt36566l12t6gavfeoq"
615
+ },
616
+ "outputs": [],
617
+ "source": [
618
+ "#!g1.1\n",
619
+ "# Class ids (1-based) of the eight archives kept for classification;\n",
620
+ "# the ids follow the alphabetical order of the archive names.\n",
621
+ "dict_label = {\n",
622
+ "    'cs': 1, 'econ': 2,\n",
623
+ "    'eess': 3, 'math': 4,\n",
624
+ "    'physics': 5, 'q-bio': 6,\n",
625
+ "    'q-fin': 7, 'stat': 8,\n",
626
+ "}"
627
+ ]
628
+ },
629
+ {
630
+ "cell_type": "code",
631
+ "execution_count": 399,
632
+ "id": "39bfd5dc",
633
+ "metadata": {
634
+ "cellId": "twjsbrjsiloyjbah2fbtm"
635
+ },
636
+ "outputs": [],
637
+ "source": [
638
+ "#!g1.1\n",
639
+ "# Restrict the per-tag counts to the tags that have a class id in dict_label.\n",
640
+ "new_map_labels = {\n",
641
+ "    tag: count for tag, count in map_labels.items() if tag in dict_label\n",
642
+ "}"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 400,
648
+ "id": "3eca80b5",
649
+ "metadata": {
650
+ "cellId": "9qzelfdhdtgizy330a4jzi"
651
+ },
652
+ "outputs": [
653
+ {
654
+ "data": {
655
+ "text/plain": [
656
+ "{'math': 414977,\n",
657
+ " 'physics': 146186,\n",
658
+ " 'q-bio': 23287,\n",
659
+ " 'cs': 338681,\n",
660
+ " 'q-fin': 9131,\n",
661
+ " 'stat': 37132,\n",
662
+ " 'eess': 27531,\n",
663
+ " 'econ': 3904}"
664
+ ]
665
+ },
666
+ "execution_count": 400,
667
+ "metadata": {},
668
+ "output_type": "execute_result"
669
+ }
670
+ ],
671
+ "source": [
672
+ "#!g1.1\n",
673
+ "new_map_labels"
674
+ ]
675
+ },
676
+ {
677
+ "cell_type": "code",
678
+ "execution_count": 401,
679
+ "id": "eaa44619",
680
+ "metadata": {
681
+ "cellId": "9o0bhawr3id19hxhy7m4xt"
682
+ },
683
+ "outputs": [
684
+ {
685
+ "data": {
686
+ "text/plain": [
687
+ "59904"
688
+ ]
689
+ },
690
+ "execution_count": 401,
691
+ "metadata": {},
692
+ "output_type": "execute_result"
693
+ }
694
+ ],
695
+ "source": [
696
+ "#!g1.1\n",
697
+ "# Cap every tag at 8000 papers, then report the resulting dataset size.\n",
698
+ "for key, value in new_map_labels.items():\n",
699
+ "    new_map_labels[key] = min(value, 8000)\n",
700
+ "sum_value = sum(new_map_labels.values())\n",
701
+ "sum_value"
702
+ ]
703
+ },
704
+ {
705
+ "cell_type": "code",
706
+ "execution_count": 405,
707
+ "id": "8b5d06a1",
708
+ "metadata": {
709
+ "cellId": "eyq1ocx4irpx2lplb54qu"
710
+ },
711
+ "outputs": [],
712
+ "source": [
713
+ "#!g1.1 \n",
714
+ "# Keep at most new_map_labels[tag] rows per selected tag, in original order.\n",
715
+ "# Vectorized on purpose: DataFrame.append in a loop copies the frame on every\n",
716
+ "# call and was removed in pandas 2.0; this form is also safe to re-run.\n",
717
+ "eligible = df[df['tag'].isin(new_map_labels)]\n",
718
+ "\n",
719
+ "# Position of each row within its own tag group: 0, 1, 2, ...\n",
720
+ "rank_in_tag = eligible.groupby('tag').cumcount()\n",
721
+ "quota = eligible['tag'].map(new_map_labels)\n",
722
+ "\n",
723
+ "small_df = eligible[rank_in_tag < quota].reset_index(drop=True)"
724
+ ]
725
+ },
726
+ {
727
+ "cell_type": "code",
728
+ "execution_count": 406,
729
+ "id": "f0d17685",
730
+ "metadata": {
731
+ "cellId": "1xwjz5xpl4hfeu6hcby3sw"
732
+ },
733
+ "outputs": [
734
+ {
735
+ "data": {
736
+ "text/plain": [
737
+ "(59904, 15)"
738
+ ]
739
+ },
740
+ "execution_count": 406,
741
+ "metadata": {},
742
+ "output_type": "execute_result"
743
+ }
744
+ ],
745
+ "source": [
746
+ "#!g1.1\n",
747
+ "small_df.shape"
748
+ ]
749
+ },
750
+ {
751
+ "cell_type": "code",
752
+ "execution_count": 407,
753
+ "id": "0726c5ad",
754
+ "metadata": {
755
+ "cellId": "0hj81qc6cxmjk68l9h30wu"
756
+ },
757
+ "outputs": [],
758
+ "source": [
759
+ "#!g1.1\n",
760
+ "small_df[\"text\"] = small_df[\"title\"] + \". \" + small_df[\"abstract\"]  # one input string per paper\n",
761
+ "small_df[\"text\"] = small_df[\"text\"].map(lambda x : x.replace(\"\\n\", \" \"))  # flatten line breaks\n",
762
+ "small_df[\"text\"] = small_df[\"text\"].str.lower()\n"
763
+ ]
764
+ },
765
+ {
766
+ "cell_type": "code",
767
+ "execution_count": 428,
768
+ "id": "85a623dd",
769
+ "metadata": {
770
+ "cellId": "cy86suh5aidkyhqe8g9l8"
771
+ },
772
+ "outputs": [],
773
+ "source": [
774
+ "#!g1.1\n",
775
+ "# Map each top-level tag to a zero-based class id (dict_label ids start at 1).\n",
776
+ "# Done as a column-wise map because rows yielded by iterrows() may be copies,\n",
777
+ "# so values assigned into them are not guaranteed to reach small_df.\n",
778
+ "small_df['label'] = small_df['tag'].map(dict_label) - 1"
779
+ ]
780
+ },
781
+ {
782
+ "cell_type": "code",
783
+ "execution_count": 435,
784
+ "id": "1dbb6b66",
785
+ "metadata": {
786
+ "cellId": "j8fduhc4vzcf23ghd01uh"
787
+ },
788
+ "outputs": [],
789
+ "source": [
790
+ "#!g1.1\n",
791
+ "data = small_df[['text', 'label']]"
792
+ ]
793
+ },
794
+ {
795
+ "cell_type": "code",
796
+ "execution_count": 440,
797
+ "id": "0ec4a8d7",
798
+ "metadata": {
799
+ "cellId": "axjkoz0rhguz42agwj0azs"
800
+ },
801
+ "outputs": [
802
+ {
803
+ "data": {
804
+ "text/plain": [
805
+ "0"
806
+ ]
807
+ },
808
+ "execution_count": 440,
809
+ "metadata": {},
810
+ "output_type": "execute_result"
811
+ }
812
+ ],
813
+ "source": [
814
+ "#!g1.1\n",
815
+ "sum(data['label'] == 8)"
816
+ ]
817
+ },
818
+ {
819
+ "cell_type": "code",
820
+ "execution_count": 409,
821
+ "id": "1b0209aa",
822
+ "metadata": {
823
+ "cellId": "cr69xpmgajao3i52mfmsw"
824
+ },
825
+ "outputs": [],
826
+ "source": [
827
+ "#!g1.1\n",
828
+ "# Recount the sampled rows per tag so the per-tag cap can be checked.\n",
829
+ "# Only the tag column is needed, so iterate it directly instead of iterrows().\n",
830
+ "dict_label_map = dict()\n",
831
+ "for tag in small_df['tag']:\n",
832
+ "    # .get() yields 0 for a tag seen for the first time.\n",
833
+ "    dict_label_map[tag] = dict_label_map.get(tag, 0) + 1"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": 410,
839
+ "id": "e752a6ab",
840
+ "metadata": {
841
+ "cellId": "475aq0jltlwie0y5rce39f"
842
+ },
843
+ "outputs": [
844
+ {
845
+ "data": {
846
+ "text/plain": [
847
+ "{'math': 8000,\n",
848
+ " 'physics': 8000,\n",
849
+ " 'q-bio': 8000,\n",
850
+ " 'cs': 8000,\n",
851
+ " 'q-fin': 8000,\n",
852
+ " 'stat': 8000,\n",
853
+ " 'eess': 8000,\n",
854
+ " 'econ': 3904}"
855
+ ]
856
+ },
857
+ "execution_count": 410,
858
+ "metadata": {},
859
+ "output_type": "execute_result"
860
+ }
861
+ ],
862
+ "source": [
863
+ "#!g1.1\n",
864
+ "dict_label_map"
865
+ ]
866
+ },
867
+ {
868
+ "cell_type": "code",
869
+ "execution_count": 441,
870
+ "id": "f01b5e91",
871
+ "metadata": {
872
+ "cellId": "zvkn5omccecd93rwwd4v4"
873
+ },
874
+ "outputs": [],
875
+ "source": [
876
+ "#!g1.1\n",
877
+ "from sklearn.model_selection import train_test_split\n",
878
+ "train_data, test_data = train_test_split(data, test_size=0.2)  # hold out 20% for test\n",
879
+ "train_data, val_data = train_test_split(train_data, test_size=0.25)  # 25% of the remaining 80% -> 60/20/20, disjoint from test"
880
+ ]
881
+ },
882
+ {
883
+ "cell_type": "code",
884
+ "execution_count": 442,
885
+ "id": "ee989ca4",
886
+ "metadata": {
887
+ "cellId": "xlfdta76w5j3xypjxddpl"
888
+ },
889
+ "outputs": [],
890
+ "source": [
891
+ "#!g1.1\n",
892
+ "train_data.to_csv('train_data.csv', index=None)\n",
893
+ "val_data.to_csv('val_data.csv', index=None)\n",
894
+ "test_data.to_csv('test_data.csv', index=None)"
895
+ ]
896
+ },
897
+ {
898
+ "cell_type": "code",
899
+ "execution_count": 414,
900
+ "id": "f998553d",
901
+ "metadata": {
902
+ "cellId": "ftho4ti37uey8j3jmeqjk"
903
+ },
904
+ "outputs": [],
905
+ "source": [
906
+ "#!g1.1\n",
907
+ "# тут датасфера не дает мне нормальной возможности работать с токенизатором, поэтому перейдем в другой ноутбук, где и продолжим "
908
+ ]
909
+ }
910
+ ],
911
+ "metadata": {
912
+ "kernelspec": {
913
+ "display_name": "Yandex DataSphere Kernel",
914
+ "language": "python",
915
+ "name": "python3"
916
+ },
917
+ "language_info": {
918
+ "codemirror_mode": {
919
+ "name": "ipython",
920
+ "version": 3
921
+ },
922
+ "file_extension": ".py",
923
+ "mimetype": "text/x-python",
924
+ "name": "python",
925
+ "nbconvert_exporter": "python",
926
+ "pygments_lexer": "ipython3",
927
+ "version": "3.7.7"
928
+ },
929
+ "notebookId": "8cdde807-2f63-4837-814d-b292cfd142b6",
930
+ "notebookPath": "Untitled (1).ipynb"
931
+ },
932
+ "nbformat": 4,
933
+ "nbformat_minor": 5
934
+ }